In [None]:
import shutil
import os 
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Step 1: Load the Embedding Model
# The model is downloaded the first time if you don't have it already.
# Pass device="cuda" if you have a GPU; "cpu" works when you don't.
model = SentenceTransformer(
    "BAAI/bge-large-en-v1.5",
    device="cpu",
)

# Retrieval instruction that is prepended to text before it is encoded.
# NOTE(review): model.encode("...", prompt_name="retrieval") was suggested as
# an alternative; that presumably requires the loaded model's configuration to
# define a prompt with that name -- confirm before relying on it.
instruction = "Represent this sentence for searching relevant passages: "

# Step 2: Prepare Your Documents (Your Knowledge Base) - Larger Set
# Each entry is one passage; the topic headings below are only for readers.
documents = [
    # Technology
    "The iPhone 15 Pro represents a significant design shift, featuring a aerospace-grade titanium chassis that makes it both lighter and more durable than previous stainless-steel models. It is powered by the new A17 Pro chip, which delivers console-level gaming performance and improved machine learning capabilities.",
    "The Samsung Galaxy S24 Ultra is renowned for its advanced AI features integrated directly into the phone's core applications. It boasts a brilliant 6.8-inch Dynamic AMOLED 2X display and a versatile camera system with a 200MP main sensor, designed for both photography enthusiasts and productivity power users.",

    # Animals
    "Penguins are a group of flightless aquatic birds primarily living in the Southern Hemisphere. They are highly adapted for life in the water, with flippers for wings and countershaded dark and white plumage that provides camouflage while swimming. The largest species, the Emperor Penguin, can stand up to 1.2 meters tall.",
    "Elephants are the largest existing land animals, characterized by their long trunks, tusks, and large ears. They are highly intelligent creatures with complex social structures and are known for their remarkable memory. African elephants typically have larger ears and concave backs compared to their Asian counterparts.",
    "Dolphins are highly intelligent marine mammals known for their playful behavior and complex social structures. They use echolocation, a biological sonar system, to navigate the ocean depths, hunt for fish and squid, and communicate with one another. Species like the bottlenose dolphin are found in warm and temperate seas worldwide and are often noted for their acrobatic leaps and interactions with humans.",
    "The anglerfish is a fascinating and fearsome-looking denizen of the deep ocean, adapted to live in extreme pressure and perpetual darkness. Females possess a bioluminescent lure, called an esca, which dangles from a modified spine on their head to attract unsuspecting prey in the vast, food-scarce depths. Many species exhibit extreme sexual dimorphism, where tiny males permanently attach to and fuse with the much larger females, a unique reproductive strategy known as sexual parasitism.",

    # Baking & Cooking
    "Yeast is a single-celled fungus essential in baking as a leavening agent. It converts fermentable sugars in dough into carbon dioxide and ethanol, causing the dough to expand and rise. This process gives bread its airy texture and characteristic flavor.",
    "Sous-vide is a cooking technique that involves sealing food in an airtight bag and immersing it in a precisely controlled water bath. This method allows for extremely precise temperature control, resulting in food that is cooked evenly throughout without overcooking. It's particularly popular for cooking proteins like steak and chicken to perfect doneness.",

    # History
    "The Treaty of Versailles was signed on June 28, 1919, in the Hall of Mirrors at the Palace of Versailles, officially ending World War I. The treaty placed full blame for the war on Germany and its allies, imposing heavy reparations and territorial losses. Its harsh terms are often cited as a contributing factor to the rise of Nazism and the outbreak of World War II.",
    "The Industrial Revolution was a period of major industrialization and innovation that began in Great Britain in the late 18th century. It marked a shift from agrarian societies to industrialized ones, with the development of new machinery, steam power, and factory systems. This transformation radically changed almost every aspect of daily life and economic structures worldwide.",

    # Programming
    "Python is a high-level, interpreted programming language known for its clear syntax and readability. It supports multiple programming paradigms, including object-oriented, imperative, and functional programming. The language's extensive standard library and vast ecosystem of third-party packages make it suitable for web development, data science, AI, and scientific computing.",
    "JavaScript is a versatile scripting language primarily used to create dynamic and interactive web content. It is an essential component of web browsers, enabling client-side scripting to interact with users, control the browser, and communicate asynchronously. With the advent of Node.js, JavaScript can now also be run server-side, making it a full-stack development language.",
]

# One unique ID per document, derived from its position: doc_0, doc_1, ...
document_ids = [f"doc_{position}" for position, _ in enumerate(documents)]

# Step 3: Generate Embeddings and Create a Vector Database

# Start from a clean slate: delete the on-disk database if one is left over
# from a previous run, so the index always reflects the current documents.
db_path = "./chroma_db"
if os.path.exists(db_path):
    print(f"Deleting old database at {db_path}...")
    shutil.rmtree(db_path)  # This is the key command - it deletes the folder

# Initialize a persistent Chroma client. This will create a `chroma_db` directory.
chroma_client = chromadb.PersistentClient(path=db_path)

# Create a collection. This is like a table in a database.
collection = chroma_client.get_or_create_collection(
    name="my_knowledge_base",
    metadata={"hnsw:space": "cosine"},  # Cosine similarity is often a good choice
)

# Check if the collection is empty to avoid re-adding the same data
if collection.count() == 0:
    print("Indexing documents...")

    # Create the embeddings in bulk.
    # The passages are embedded as-is: the BGE retrieval instruction is meant
    # for the (short) query side only, so prefixing every passage with it
    # would embed the documents differently from how the model expects
    # passages to be represented.
    document_embeddings = model.encode(documents, normalize_embeddings=True)

    # Add the documents, their IDs, and their embeddings to the collection.
    collection.add(
        documents=documents,
        ids=document_ids,
        embeddings=document_embeddings.tolist(),  # Chroma expects a list of lists
    )
    print("Documents indexed successfully!")
else:
    print("Collection already populated.")



In [None]:
# Step 4: Query the System
def retrieve_documents(query, top_k=2):
    """
    Look up the passages in the vector database that best match a query.

    Args:
        query (str): The user's question or search term.
        top_k (int): How many results to return.

    Returns:
        The raw result of ``collection.query`` for the encoded query.
    """
    # Prefix the query with the retrieval instruction, then embed it as a
    # normalized vector.
    prefixed_query = instruction + query
    query_vector = model.encode(prefixed_query, normalize_embeddings=True)

    # Ask the collection for the `top_k` nearest entries to that vector.
    return collection.query(
        query_embeddings=query_vector.tolist(),
        n_results=top_k,
    )

# Example Queries
queries = [
    "What materials are used in smartphone manufacturing?",
    "Tell me about animals that live in water",
    "How does baking work chemically?",
    "What were the consequences of World War I treaties?",
    "What languages are used for web development?",
    "Fish",
]

# Run every example through the retriever and print the ranked matches.
for query in queries:
    print(f"\nQuery: '{query}'")
    results = retrieve_documents(query)

    # `results` contains 'ids', 'documents', 'distances'; index 0 selects the
    # matches for the single query that was submitted.
    for rank, doc in enumerate(results['documents'][0], start=1):
        print(f"Result {rank}: {doc}")

In [None]:

import os
from dotenv import load_dotenv
# First, install the openai package: pip install openai
from openai import OpenAI

# Read the API key from the TOGETHER_API_KEY environment variable after
# load_dotenv() has run (presumably picking up a local .env file -- confirm).
# The key is used against the Together AI endpoint below, so it must be a
# Together AI key rather than an OpenAI one. Falls back to "" when unset.
load_dotenv()
TOGETHER_AI_API_KEY = os.environ.get("TOGETHER_API_KEY", "")

def rag_with_openai(user_query, top_k=2):
    """
    Answer a question using retrieved passages as context (RAG).

    The passages returned by ``retrieve_documents`` are joined into a context
    block, placed in a prompt together with the question, and sent to a chat
    model through the OpenAI-compatible client pointed at Together AI's
    endpoint.

    Args:
        user_query (str): The user's question.
        top_k (int): How many passages to retrieve as context.

    Returns:
        The message content of the first choice in the chat completion.

    Raises:
        RuntimeError: If ``TOGETHER_AI_API_KEY`` is empty.
    """
    # Fail fast with an actionable message instead of doing the retrieval work
    # and only then hitting an authentication failure on the remote call.
    if not TOGETHER_AI_API_KEY:
        raise RuntimeError(
            "TOGETHER_API_KEY is not set; add it to your environment or .env file."
        )

    # 1. Retrieve relevant context
    results = retrieve_documents(user_query, top_k=top_k)
    # Index 0 holds the matches for the single query that was submitted.
    context = "\n\n".join(results['documents'][0])
    
    # 2. Create a prompt for the LLM
    prompt = f"""Based on the following information, 
    answer the user's question. If the answer isn't in the context, say you don't know.

Context: ```{context}```

User Question: ```{user_query}```

Answer:"""
    
    # 3. Call the LLM through Together AI's OpenAI-compatible API
    client = OpenAI(
        base_url="https://api.together.xyz/v1",  # Together AI's API endpoint
        api_key=TOGETHER_AI_API_KEY,  # API key for authentication
    )
    response = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
            {"role": "user", "content": prompt},
        ],
    )

    return response.choices[0].message.content

# Test the full RAG pipeline: retrieve context, ask the model, show the result.
user_question = "Tell me more about fish"
answer = rag_with_openai(user_question)
print(f"\nQuestion: {user_question}\nAnswer: {answer}")

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# Load the embedding model (the same BAAI/bge-large-en-v1.5 checkpoint as in
# the first cell; no device is specified here).
# NOTE: `model` and `collection` below rebind the module-level names that
# `retrieve_documents` reads, so after this cell runs that function will
# search the "tech_docs" collection instead of "my_knowledge_base".
model = SentenceTransformer("BAAI/bge-large-en-v1.5")

# Initialize ChromaDB client backed by its own on-disk directory.
client = chromadb.PersistentClient(path="./chroma_metadata_db")

# Create (or reopen) the collection, configured for cosine similarity.
collection = client.get_or_create_collection(
    metadata={"hnsw:space": "cosine"},
    name="tech_docs",
)

# Define our documents with METADATA
documents = [
    "The iPhone 15 Pro features a new titanium chassis.",
    "The MacBook Pro is powered by the M3 chip for incredible performance.",
    "The iPad Pro has a stunning Liquid Retina XDR display.",
    "Apple Watch Series 9 introduces a new double-tap gesture.",
]

# Define metadata for each document (same order as `documents`)
metadatas = [
    {"category": "phone", "release_year": 2023},
    {"category": "laptop", "release_year": 2023},
    {"category": "tablet", "release_year": 2022},
    {"category": "wearable", "release_year": 2023},
]

ids = ["doc1", "doc2", "doc3", "doc4"]

# Chroma can generate embeddings for you, but we provide our own for consistency.
embeddings = model.encode(documents).tolist()

# Write everything to the collection.
# The client is persistent and the IDs are fixed, so this code can run against
# a store that already holds these records from an earlier run. upsert()
# inserts missing IDs and overwrites existing ones, which keeps reruns
# idempotent and lets edits to the documents or metadata take effect.
collection.upsert(
    documents=documents,
    embeddings=embeddings,  # We provide the embeddings
    metadatas=metadatas,    # We provide the metadata
    ids=ids,
)

# Query 1: Basic Semantic Search
# Nearest neighbours across the whole collection, no filtering.
print("=== Basic Semantic Search ===")
phone_query_vector = model.encode("new Apple phone").tolist()
results = collection.query(query_embeddings=phone_query_vector, n_results=2)
# Index 0 selects the hits for the single query that was submitted.
for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
    print(f"Document: {doc}\nMetadata: {meta}\n")

# Query 2: Semantic Search WITH Metadata Filtering (Powerful!)
# Same kind of search, but `where` restricts candidates by metadata first.
print("=== Search Filtered to Laptops Only ===")
laptop_query_vector = model.encode("powerful device").tolist()
results = collection.query(
    query_embeddings=laptop_query_vector,
    n_results=2,
    where={"category": "laptop"},  # <-- THE KEY DIFFERENCE!
)
for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
    print(f"Document: {doc}\nMetadata: {meta}\n")