In [1]:
from ollama import Client
from dotenv import load_dotenv
import os

In [2]:

# Pull the settings stored in the local .env file into the process environment
load_dotenv()

# Build the ollama client; the server address is taken from the HOST variable
client = Client(host=os.getenv('HOST'))




In [3]:

# Ask the model a single question through the ollama chat endpoint
response = client.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
)

# Show only the text of the reply
print(response['message']['content'])

The sky appears blue because of a phenomenon called scattering, which occurs when sunlight interacts with the tiny molecules of gases in the Earth's atmosphere.

Here's what happens:

1. Sunlight enters the Earth's atmosphere and encounters the tiny molecules of gases such as nitrogen (N2) and oxygen (O2).
2. These molecules scatter the light in all directions, but they scatter shorter (blue) wavelengths more than longer (red) wavelengths.
3. This is known as Rayleigh scattering, named after the British physicist Lord Rayleigh, who first described the phenomenon in the late 19th century.
4. As a result of this scattering, the blue light is dispersed in all directions and reaches our eyes from all parts of the sky.
5. Our brains perceive this scattered blue light as the color of the sky.

It's worth noting that the color of the sky can vary depending on several factors, such as:

* Time of day: During sunrise and sunset, the sky can take on hues of red, orange, and pink due to the scatt

In [4]:
import argparse
import os
import shutil
from langchain_community.document_loaders import PyPDFDirectoryLoader, CSVLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from typing import List



In [5]:
def load_documents() -> List[Document]:
    """Load every supported file found under ``DATA_PATH``.

    The directory tree is walked with ``os.walk``. Files ending in ``.pdf``
    are read with ``PyPDFDirectoryLoader``, ``.csv`` with ``CSVLoader`` and
    ``.txt`` with ``TextLoader``; the extension check is case-insensitive.
    Any other file is reported on stdout and skipped.

    Returns:
        The documents produced by the loaders.
    """
    documents = []
    for root, _, files in os.walk(DATA_PATH):
        # PyPDFDirectoryLoader reads a whole directory at once, so it must run
        # at most once per directory; running it once per PDF file would load
        # every PDF in the directory again for each of its siblings.
        pdfs_loaded = False
        for file in files:
            file_path = os.path.join(root, file)
            lowered = file.lower()
            if lowered.endswith('.pdf'):
                if not pdfs_loaded:
                    # Non-recursive glob: os.walk visits the sub-directories
                    # itself, so each directory only loads its own PDFs.
                    loader = PyPDFDirectoryLoader(root, glob="[!.]*.pdf")
                    documents.extend(loader.load())
                    pdfs_loaded = True
            elif lowered.endswith('.csv'):
                loader = CSVLoader(file_path)
                documents.extend(loader.load())
            elif lowered.endswith('.txt'):
                loader = TextLoader(file_path)
                documents.extend(loader.load())
            else:
                print(f"Unsupported file type: {file}")
    return documents

def split_documents(
    documents: List[Document],
    chunk_size: int = 200,
    chunk_overlap: int = 80,
) -> List[Document]:
    """Split documents into overlapping chunks.

    Args:
        documents: The documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks, measured with
            ``len``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(documents)

def add_to_chroma(chunks: List[Document]):
    """Add the chunks that are not yet stored to the Chroma collection.

    The store is opened at ``CHROMA_PATH`` with the embedding function from
    ``get_embedding_function_llama()``. Every chunk gets an id from
    ``calculate_chunk_ids``; chunks whose id is already in the store are
    skipped, and when the same id occurs several times in ``chunks`` only the
    first occurrence is added. Progress is reported on stdout.

    A ``ValueError`` raised while adding is reported on stdout and not
    re-raised.

    Args:
        chunks: The chunks to store. Their metadata is updated in place by
            ``calculate_chunk_ids``.
    """
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function_llama()
    )

    chunks_with_ids = calculate_chunk_ids(chunks)

    # include=[] asks for no payload fields; only the "ids" entry is read.
    existing_items = db.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Keep one chunk per unseen id so that a repeated id is never submitted
    # twice in the same batch.
    new_chunks_by_id = {}
    for chunk in chunks_with_ids:
        chunk_id = chunk.metadata["id"]
        if chunk_id not in existing_ids:
            new_chunks_by_id.setdefault(chunk_id, chunk)

    if not new_chunks_by_id:
        print("No new documents to add")
        return

    new_chunk_ids = list(new_chunks_by_id.keys())
    new_chunks = list(new_chunks_by_id.values())
    print(f"Adding new documents: {len(new_chunks)}")
    try:
        db.add_documents(new_chunks, ids=new_chunk_ids)
        print("Documents added")
    except ValueError as e:
        print(f"Failed to add documents: {e}")

def calculate_chunk_ids(chunks: List[Document]) -> List[Document]:
    """Attach an ``id`` and a ``file_type`` entry to each chunk's metadata.

    The id has the form ``"<source>:<page>:<index>"``. ``page`` falls back to
    0 when the metadata has no ``page`` entry, and ``index`` counts
    consecutive chunks that share the same ``source``/``page`` pair, starting
    again at 0 whenever the pair changes. ``file_type`` is the lower-cased
    extension of ``source`` without the leading dot, or an empty string when
    there is no source or no extension.

    Args:
        chunks: The chunks to label; their metadata is modified in place.

    Returns:
        The same list that was passed in.
    """
    last_file_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page", 0)  # Default to 0 for non-PDF files
        current_file_id = f"{source}:{page}"

        if current_file_id == last_file_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        last_file_id = current_file_id
        chunk.metadata["id"] = f"{current_file_id}:{current_chunk_index}"
        # `source` may be missing (None); os.path.splitext would raise a
        # TypeError on it, so fall back to an empty path in that case.
        chunk.metadata["file_type"] = os.path.splitext(source or "")[1][1:].lower()

    return chunks


def clear_database():
    """Remove the directory tree at ``CHROMA_PATH``; do nothing if the path does not exist."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [6]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

# Rebind `client` to a fresh ollama client for the server named by HOST
client = Client(host=os.getenv('HOST'))


def get_embedding_function_llama():
    """Return an ``OllamaEmbeddings`` instance for the ``llama3.2`` model.

    The server address is read from the ``HOST`` environment variable each
    time the function is called.
    """
    return OllamaEmbeddings(model="llama3.2", base_url=os.getenv('HOST'))


In [7]:


# Storage locations. Each can be overridden with an environment variable of
# the same name; the defaults are the values this notebook was written with.
CHROMA_PATH = os.getenv("CHROMA_PATH", "chroma")
DATA_PATH = os.getenv("DATA_PATH", r"D:\projects\HugginRAG\data")

# Start from an empty vector store on every run.
clear_database()

documents = load_documents()
chunks = split_documents(documents)
chunks = calculate_chunk_ids(chunks)
# Keep a single chunk per id (for a repeated id the last chunk wins).
chunks = list({chunk.metadata["id"]: chunk for chunk in chunks}.values())

add_to_chroma(chunks)


Number of existing documents in DB: 0
Adding new documents: 145
Failed to add documents: Error raised by inference endpoint: HTTPSConnectionPool(host='f643-158-129-2-158.ngrok-free.app', port=443): Max retries exceeded with url: /api/embeddings (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1000)')))


In [8]:
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain_community.chat_models import ChatOllama

In [9]:
# Set up the chat model
model = ChatOllama(model="llama3.2", base_url=os.getenv('HOST'))

system_message = "You are a helpful AI assistant."
# Printable transcript: the system prompt followed by (user, bot) pairs.
chat_history = [system_message]
# Message list that is actually sent to the model. Passing the whole list on
# every turn is what gives the model the system prompt and the earlier turns;
# invoking it with only the latest user string would drop both.
conversation = [SystemMessage(content=system_message)]

# Use ollama to generate a response for each user turn
for user_input in ("Hello", "What's 2+2"):
    conversation.append(HumanMessage(content=user_input))
    response = model.invoke(conversation)
    bot_reply = response.content
    conversation.append(AIMessage(content=bot_reply))
    chat_history.append((user_input, bot_reply))

for message in chat_history:
    print(f"  * {message}")


SSLError: HTTPSConnectionPool(host='f643-158-129-2-158.ngrok-free.app', port=443): Max retries exceeded with url: /api/chat (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1000)')))

In [12]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
# Set up the chat model
model = ChatOllama(model="llama3.2", base_url=os.getenv('HOST'))

# Define the embedding model through the same helper that add_to_chroma uses,
# so queries are embedded exactly like the stored chunks.
embeddings = get_embedding_function_llama()

# Load the existing vector store with the embedding function
db = Chroma(persist_directory=CHROMA_PATH,
            embedding_function=embeddings)

# Define the user's question
query = "What's secret message?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.4},
)
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---\n")
if not relevant_docs:
    # Say so explicitly instead of leaving an empty section under the header.
    print("No documents were retrieved for this query.")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")




  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.0



--- Relevant Documents ---

