In [None]:
!pip install chromadb

In [None]:
import chromadb
import hashlib
# Chroma client used for the "rag_documents" collection in this notebook.
chroma_client = chromadb.Client()

In [None]:
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when "rag_documents" already exists on this client.
documents = chroma_client.get_or_create_collection(name="rag_documents")

In [None]:
def generate_hash_id(document):
    """
    Derives a deterministic ID from the text of a document.

    Args:
        document (str): The document text to fingerprint.

    Returns:
        str: The hex-encoded SHA-256 digest of the UTF-8 encoded text.
    """
    # Identical text always produces the same digest, so the ID is stable across runs
    encoded_text = document.encode('utf-8')
    digest = hashlib.sha256(encoded_text)
    return digest.hexdigest()

def load_documents_from_text_file(file_path, collection):
    """
    Loads documents from a text file, assigns a hash-based ID to each document,
    and adds them to a collection.

    Args:
        file_path (str): The path to the UTF-8 text file containing documents (one per line).
        collection (object): The collection object to which the documents and IDs will be added.
            It must provide `upsert(documents=..., ids=...)`.

    Returns:
        None. Progress and errors are reported with `print`; no exception is
        propagated to the caller.

    Functionality:
        - Reads the text file line by line, decoding it as UTF-8.
        - Strips whitespace from each line and skips empty lines.
        - Drops repeated lines (first occurrence wins), because identical text
          yields an identical hash ID and IDs must be unique within one batch.
        - Generates a hash-based ID for each remaining document.
        - Adds the documents and IDs to the collection using `collection.upsert()`.
        - Does not call `collection.upsert()` when the file holds no documents.

    Example Usage:
        collection = some_vector_database.collection("my_collection")
        load_documents_from_text_file("trivia.txt", collection)
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding (generate_hash_id hashes UTF-8 bytes).
        with open(file_path, 'r', encoding='utf-8') as file:
            stripped_lines = (line.strip() for line in file)
            # dict.fromkeys removes repeated lines while keeping first-seen order
            texts = list(dict.fromkeys(text for text in stripped_lines if text))

        if not texts:
            # Nothing to store; avoid sending an empty batch to the collection
            print(f"No documents found in {file_path}; nothing was added to the collection.")
            return

        # Generate hash-based IDs for each document
        ids = [generate_hash_id(text) for text in texts]

        # Add the documents and IDs to the collection
        collection.upsert(documents=texts, ids=ids)

        print(f"Successfully added {len(texts)} documents to the collection.")

    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as e:
        # Best-effort loader for notebook use: report the problem instead of raising
        print(f"An error occurred: {e}")
    

In [None]:
# Fill the "rag_documents" collection from trivia.txt (one document per line).
load_documents_from_text_file("trivia.txt", documents)


In [None]:
# Search the collection with plain text: Chroma embeds the query text itself,
# and the 4 best matching documents are returned.
results = documents.query(query_texts=["university sport"], n_results=4)
print(results["documents"])

In [None]:
import chromadb.utils.embedding_functions as embedding_functions
# Embedding function that calls the Ollama server at the URL below.
# Make sure that you have pulled the embedding model nomic-embed-text
# beforehand (e.g. `ollama pull nomic-embed-text`).
ollama_embedding = embedding_functions.OllamaEmbeddingFunction(
    url="http://localhost:11434/api/embeddings",
    model_name="nomic-embed-text",
)


In [None]:
# Quick check of the embedding function: it is called with a list of texts
# (here a single sentence).
embeddings = ollama_embedding(["This is a sample text to try ollama embedding at the workshop"])

In [None]:
# Show the raw output of the embedding call for inspection.
print(embeddings)

In [None]:
# Persistent Chroma client rooted at ./vector-db/made-with-cc.
cc_client = chromadb.PersistentClient(path="./vector-db/made-with-cc")

In [None]:
def generate_line_number_id(index):
    """
    Turns a zero-based position into a one-based ID string.

    Args:
        index (int): Zero-based position of the entry.

    Returns:
        str: The position counted from 1, as a string (index 0 -> "1").
    """
    # Shift from zero-based to one-based counting before converting to text
    line_number = index + 1
    return str(line_number)

def get_embedding_for_document(document):
    """
    Embeds a single document with the notebook-level `ollama_embedding` function.

    Args:
        document (str): The text to embed.

    Returns:
        The first (and only) embedding returned for the document, or None when
        the embedding call fails for any reason (the error is printed).
    """
    try:
        # The embedding function works on batches, so wrap the text in a
        # one-element list and unwrap the single result afterwards.
        batch_result = ollama_embedding([document])
        first_embedding = batch_result[0]
    except Exception as exc:
        print(f"Error generating embedding for document: {document[:30]}... Error: {exc}")
        return None
    return first_embedding

def load_documents_with_line_ids_and_embeddings(file_path, collection):
    """
    Loads documents from a text file, assigns a sequential number as the ID to each
    document, generates embeddings for each document, and adds them to a collection.

    Args:
        file_path (str): The path to the UTF-8 text file containing documents (one per line).
        collection (object): The collection object to which the documents, IDs, and
            embeddings will be added. It must provide
            `upsert(documents=..., ids=..., embeddings=...)`.

    Returns:
        None. Progress and errors are reported with `print`; no exception is
        propagated to the caller.

    Functionality:
        - Reads the text file line by line, decoding it as UTF-8.
        - Strips whitespace from each line and skips empty lines.
        - Numbers the remaining documents from 1. Blank lines are not counted, so
          an ID equals the physical line number only if the file has no blank lines.
        - Generates embeddings for each document sequentially.
        - Leaves out documents whose embedding could not be generated (their
          numbers are simply not used) and reports how many were skipped.
        - Adds the documents, IDs, and embeddings to the collection using
          `collection.upsert()`, unless nothing is left to add.

    Example Usage:
        collection = some_vector_database.collection("my_collection")
        load_documents_with_line_ids_and_embeddings("trivia.txt", collection)
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            texts = [text for text in (line.strip() for line in file) if text]

        kept_ids = []
        kept_texts = []
        kept_embeddings = []
        for position, text in enumerate(texts):
            embedding = get_embedding_for_document(text)
            if embedding is None:
                # Leave the document out rather than storing an empty placeholder
                # vector, which a vector store cannot index and which can cause
                # the whole batch to be rejected.
                continue
            # The ID comes from the original position, so it does not shift
            # when an earlier document is skipped.
            kept_ids.append(generate_line_number_id(position))
            kept_texts.append(text)
            kept_embeddings.append(embedding)

        skipped = len(texts) - len(kept_texts)
        if skipped:
            print(f"Skipped {skipped} documents whose embedding could not be generated.")

        if not kept_texts:
            # Nothing to store; avoid sending an empty batch to the collection
            print(f"No documents with embeddings to add from {file_path}.")
            return

        # Add the documents, IDs, and embeddings to the collection
        collection.upsert(documents=kept_texts, ids=kept_ids, embeddings=kept_embeddings)

        print(f"Successfully added {len(kept_texts)} documents with embeddings to the collection.")

    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as e:
        # Best-effort loader for notebook use: report the problem instead of raising
        print(f"An error occurred: {e}")

In [None]:
# get_or_create_collection works on the first run (collection missing) and on
# later runs (collection already stored): get_collection alone fails on a fresh
# database, and create_collection alone fails once the collection exists.
cc_collection = cc_client.get_or_create_collection(name="made-with-cc")

In [None]:
# Uncomment to embed made-with-cc.txt and (re)populate the collection:
# load_documents_with_line_ids_and_embeddings("made-with-cc.txt", cc_collection)

In [None]:
# Embed the question with the Ollama embedding function and pass the vector
# directly, then print the 10 best matching documents.
results = cc_collection.query(
    n_results=10,
    query_embeddings=ollama_embedding(
        ["What are the best examples for sharing economy use-cases?"]
    ),
)
print(results["documents"])

In [None]:
from ollama import chat
# Baseline without retrieval: send only the question to the model and print
# the streamed answer piece by piece as it arrives.
stream = chat(
    stream=True,
    model='llama3.2:3b',
    messages=[
        {'role': 'user', 'content': 'What are the best examples for sharing economy use-cases?'},
    ],
)

for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)

In [None]:
query = "What are the best examples for sharing economy use-cases?"
# Build the augmented prompt: the question followed by the retrieved documents.
# Single quotes are used inside the replacement field because reusing the
# f-string's own double quotes there is a SyntaxError before Python 3.12.
updated_query = f"{query} - Answer that question using the following text as a resource: {results['documents']}"

In [None]:
# Same model as before, but the prompt now carries the retrieved documents;
# the streamed answer is printed piece by piece as it arrives.
stream = chat(
    stream=True,
    model='llama3.2:3b',
    messages=[
        {'role': 'user', 'content': updated_query},
    ],
)

for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)