In [1]:
from ollama import Client
from dotenv import load_dotenv
import os

In [2]:

# Read the .env file so its entries are visible through os.getenv
load_dotenv()

# Create the Ollama client, pointing it at the address held in the HOST variable
client = Client(host=os.getenv('HOST'))




In [3]:

# Ask the llama3.2 model a single user question through the client's chat call
response = client.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
)

# Print only the text content of the returned message
print(response['message']['content'])

The sky appears blue because of a phenomenon called Rayleigh scattering, named after the British physicist Lord Rayleigh. When sunlight enters Earth's atmosphere, it encounters tiny molecules of gases such as nitrogen and oxygen.

These molecules scatter the light in all directions, but they scatter shorter (blue) wavelengths more than longer (red) wavelengths. This is known as the "selective scattering" effect. As a result, the blue light is distributed throughout the atmosphere, giving the sky its blue appearance.

However, it's worth noting that the color of the sky can change depending on various factors such as:

1. Time of day: During sunrise and sunset, the sky often takes on hues of orange, pink, or red due to scattering by atmospheric particles.
2. Atmospheric conditions: Dust, pollution, and water vapor in the air can scatter light differently, altering the color of the sky.
3. Altitude and latitude: The angle of the sun's rays and the amount of atmosphere between you and the

In [4]:
import argparse
import os
import shutil
from langchain_community.document_loaders import PyPDFDirectoryLoader, CSVLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from typing import List



In [5]:
def load_documents() -> List[Document]:
    """Load every supported file found under ``DATA_PATH``.

    Walks ``DATA_PATH`` with ``os.walk`` and dispatches on the lower-cased
    file name suffix: ``.pdf``, ``.csv`` and ``.txt`` files are loaded with
    the matching loader; any other file is reported on stdout and skipped.

    Returns:
        All loaded documents, accumulated in ``os.walk`` order.
    """
    import glob  # local import: only needed to escape file names for the PDF loader

    documents = []
    for root, _, files in os.walk(DATA_PATH):
        for file in files:
            file_path = os.path.join(root, file)
            lowered_name = file.lower()
            if lowered_name.endswith('.pdf'):
                # PyPDFDirectoryLoader loads every PDF matching its glob inside the
                # given directory. Pointing it at the directory once per PDF file
                # (as before) re-loaded the same PDFs over and over, so the glob is
                # narrowed to exactly this file. glob.escape keeps characters such
                # as '[' or '*' in the file name from being read as wildcards.
                loader = PyPDFDirectoryLoader(
                    os.path.dirname(file_path),
                    glob=glob.escape(file),
                )
                documents.extend(loader.load())
            elif lowered_name.endswith('.csv'):
                loader = CSVLoader(file_path)
                documents.extend(loader.load())
            elif lowered_name.endswith('.txt'):
                loader = TextLoader(file_path)
                documents.extend(loader.load())
            else:
                print(f"Unsupported file type: {file}")
    return documents

def split_documents(documents: List[Document]) -> List[Document]:
    """Split ``documents`` into smaller chunks.

    A ``RecursiveCharacterTextSplitter`` is configured with a chunk size of
    800 and an overlap of 80, both measured with ``len``.

    Args:
        documents: The documents to split.

    Returns:
        Whatever the splitter's ``split_documents`` returns for the input.
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def add_to_chroma(chunks: List[Document]):
    """Add the chunks that are not yet stored to the Chroma collection.

    Opens the collection persisted at ``CHROMA_PATH``, assigns an id to every
    chunk via ``calculate_chunk_ids``, and adds only the chunks whose id is
    not already present. Progress is reported on stdout; a ``ValueError``
    raised while adding is caught and printed instead of propagating.

    Args:
        chunks: The chunks to store; their metadata is updated with ids.
    """
    store = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function_llama()
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # include=[] asks the store for ids only, without the other stored fields
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Collect the unseen chunks and their ids side by side in one pass
    pending_chunks = []
    pending_ids = []
    for candidate in tagged_chunks:
        candidate_id = candidate.metadata["id"]
        if candidate_id not in known_ids:
            pending_chunks.append(candidate)
            pending_ids.append(candidate_id)

    if not pending_chunks:
        print("No new documents to add")
        return

    print(f"Adding new documents: {len(pending_chunks)}")
    try:
        store.add_documents(pending_chunks, ids=pending_ids)
        print("Documents added")
    except ValueError as e:
        print(f"Failed to add documents: {e}")

def calculate_chunk_ids(chunks: List[Document]) -> List[Document]:
    """Attach an ``id`` and a ``file_type`` to the metadata of every chunk.

    The id has the form ``"<source>:<page>:<index>"``. ``index`` starts at 0
    and counts up for as long as consecutive chunks share the same
    ``source`` and ``page``; it resets whenever either changes. ``file_type``
    is the lower-cased extension of ``source`` without the leading dot.

    Args:
        chunks: Chunks whose ``metadata`` dict is updated in place.

    Returns:
        The same ``chunks`` list that was passed in.
    """
    last_file_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page", 0)  # Default to 0 when no page is recorded
        current_file_id = f"{source}:{page}"

        # Consecutive chunks of the same source/page get increasing indices
        if current_file_id == last_file_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        last_file_id = current_file_id

        chunk.metadata["id"] = f"{current_file_id}:{current_chunk_index}"

        # "source" is read with .get() and may be absent; os.path.splitext(None)
        # would raise TypeError, so fall back to an empty file type instead.
        if source is None:
            chunk.metadata["file_type"] = ""
        else:
            chunk.metadata["file_type"] = os.path.splitext(source)[1][1:].lower()

    return chunks


def clear_database():
    """Delete the directory tree at ``CHROMA_PATH`` if that path exists."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [11]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

# Create the Ollama client, pointing it at the address held in the HOST variable
client = Client(host=os.getenv('HOST'))


def get_embedding_function_llama():
    """Return an ``OllamaEmbeddings`` instance for the ``llama3.2`` model.

    The server address is read from the ``HOST`` environment variable on
    every call and passed to the embeddings object as ``base_url``.
    """
    return OllamaEmbeddings(model="llama3.2", base_url=os.getenv('HOST'))


In [12]:


# Locations used by the ingestion helpers. Each can be overridden through an
# environment variable of the same name, so the notebook does not depend on
# one machine's directory layout; the defaults are the original values.
CHROMA_PATH = os.getenv("CHROMA_PATH", "chroma")
DATA_PATH = os.getenv("DATA_PATH", r"D:\projects\HugginRAG\data")

# Uncomment to delete the persisted index before ingesting again
# clear_database()

# Ingestion pipeline: load files -> split into chunks -> store the new chunks
documents = load_documents()
chunks = split_documents(documents)
add_to_chroma(chunks)

Number of existing documents in DB: 0
Adding new documents: 31
Documents added
