In [1]:
# 📦 Install packages
%pip install openai chromadb python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# 🔍 RAG using ChromaDB + OpenAI (Manual Version)
This notebook loads documents, creates embeddings manually, and retrieves chunks from ChromaDB.

In [2]:
import os
from dotenv import load_dotenv
import chromadb
from openai import OpenAI
from chromadb.utils import embedding_functions

# Load environment variables from .env file
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key, model_name="text-embedding-3-small"
)

# Initialize the Chroma client with persistence
chroma_client = chromadb.PersistentClient(path="./db/chroma_persistent_storage")
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(
    name=collection_name, embedding_function=openai_ef
)

client = OpenAI(api_key=openai_key)


In [3]:
# Load documents
def load_documents_from_directory(directory_path):
    print("==== Loading documents from directory ====")
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            with open(os.path.join(directory_path, filename), "r", encoding="utf-8") as file:
                documents.append({"id": filename, "text": file.read()})
    return documents

# Split documents
def split_text(text, chunk_size=1000, chunk_overlap=20):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

directory_path = "./data/new_articles"
documents = load_documents_from_directory(directory_path)

chunked_documents = []
for doc in documents:
    chunks = split_text(doc["text"])
    for i, chunk in enumerate(chunks):
        chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})


==== Loading documents from directory ====


In [None]:
# Generate embeddings, this can take 1 min+
def get_openai_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-3-small")
    return response.data[0].embedding

for doc in chunked_documents:
    doc["embedding"] = get_openai_embedding(doc["text"])
    collection.upsert(ids=[doc["id"]], documents=[doc["text"]], embeddings=[doc["embedding"]])

# comment above, and execute below to see how it processing
#for idx, doc in enumerate(chunked_documents):
#    print(f"Embedding chunk {idx+1}/{len(chunked_documents)}: {doc['id']}")
#    doc["embedding"] = get_openai_embedding(doc["text"])
#    collection.upsert(ids=[doc["id"]], documents=[doc["text"]], embeddings=[doc["embedding"]])



Embedding chunk 1/184: 05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk1
Embedding chunk 2/184: 05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk2
Embedding chunk 3/184: 05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk3
Embedding chunk 4/184: 05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk4
Embedding chunk 5/184: 05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk5
Embedding chunk 6/184: 05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk6
Embedding chunk 7/184: 05-03-ai-replace-tv-writers-strike.txt_chunk1
Embedding chunk 8/184: 05-03-ai-replace-tv-writers-strike.txt_chunk2
Embedding chunk 9/184: 05-03-ai-replace-tv-writers-strike.txt_chunk3
Embedding chunk 10/184: 05-03-ai-replace-tv-writers-strike.txt_chunk4
Embedding chunk 11/184: 05-03-ai-replace-tv-writers-strike.txt_chunk5
Embedding chunk 12/184: 05-03-chatgpt-everything-you-need-t

In [5]:
# RAG Logic
def query_documents(question, n_results=2):
    results = collection.query(query_texts=question, n_results=n_results)
    return [doc for sublist in results["documents"] for doc in sublist]

def generate_response(question, relevant_chunks):
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an assistant for question-answering tasks. Use the following pieces of "
        "retrieved context to answer the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the answer concise."
        "\n\nContext:\n" + context + "\n\nQuestion:\n" + question
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": prompt}, {"role": "user", "content": question}],
    )
    return response.choices[0].message.content

question = "give me a brief overview of the articles. Be concise, please."
relevant_chunks = query_documents(question)
answer = generate_response(question, relevant_chunks)

print("==== Answer ====")
print(answer)


==== Answer ====
The articles in the TechCrunch newsletter cover topics such as Lyft's equity struggles and new strategic direction, as well as the impact of down rounds on late-stage companies in the current funding market. Additionally, there are mentions of upcoming events like TC City Spotlight: Atlanta, TechCrunch Live with Persona and Index Ventures, and the Disrupt conference in San Francisco. TechCrunch+ subscribers gain access to detailed commentary, analysis, and surveys.
