<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif;
          text-align: center;">
          Basic Implementation of RAG</p>

### <span style="color:#C738BD; font-weight: bold;">Required Packages</span>

In [1]:
import os
import chromadb
from openai import OpenAI
from chromadb.utils import embedding_functions
from tqdm import tqdm
import pickle

### <span style="color:#C738BD; font-weight: bold;">Setting up the Environment</span>

In [2]:
# Read the OpenAI API key from the environment (None when the variable is not set)
openai_key = os.environ.get("OPENAI_API_KEY")
# OpenAI SDK client constructed with that key
client = OpenAI(api_key=openai_key)

In [3]:
# Chroma embedding function backed by OpenAI's "text-embedding-3-small" model;
# it is attached to the collection when the collection is created
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_key=openai_key,
)

### <span style="color:#C738BD; font-weight: bold;">Setting up Chromadb</span>

In [4]:
# Chroma client created with a local storage path.
# NOTE: the directory name is spelled "persistant"; it is kept as-is so that a
# store already created under that name is still found.
chromadb_client = chromadb.PersistentClient(path="chroma_persistant_path")

In [5]:
# Name of the Chroma collection that holds the document chunks
collection_name = "document_qa_collection"

In [6]:
# Fetch the collection by name, creating it when it does not exist yet;
# the OpenAI embedding function is passed along with it
collection = chromadb_client.get_or_create_collection(
    embedding_function=openai_ef,
    name=collection_name,
)

### <span style="color:#C738BD; font-weight: bold;">Loading Documents</span>

In [7]:
def load_documents_from_directory(dir_path: str) -> list[dict]:
    """
    Read every ``.txt`` file in a directory into a list of dictionaries.

    Entries whose name does not end in ``.txt`` are skipped. Files are read as
    UTF-8 text.

    :param dir_path: Directory containing the text documents
    :type  dir_path: str

    :returns: One ``{"id": <filename>, "text": <file contents>}`` dictionary per
              ``.txt`` file, ordered by filename
    :rtype: list[dict]
    """
    print("Loading documents...")
    documents = []
    # Sort so the document order does not depend on the OS directory listing order.
    for filename in sorted(os.listdir(dir_path)):
        if not filename.endswith(".txt"):
            continue
        # Build the path from the dir_path argument, not from a notebook-level global.
        file_path = os.path.join(dir_path, filename)
        with open(file=file_path, mode="r", encoding="utf-8") as file:
            documents.append({"id": filename, "text": file.read()})
    # Report only the files that were actually loaded; skipped entries are not counted.
    print(f"Loading Completed. Loaded {len(documents)} documents")
    return documents

In [8]:
def split_text(text: str, chunk_size: int=1000, chunk_overlap: int=20) -> list[str]:
    """
    Split a text into consecutive, overlapping chunks.

    Each chunk starts ``chunk_size - chunk_overlap`` characters after the
    previous one. Splitting stops as soon as a chunk reaches the end of the
    text, so no empty chunk and no chunk consisting only of already-seen
    overlap is produced.

    :param text: Text to split
    :type  text: str
    :param chunk_size: Maximum number of characters per chunk
    :type  chunk_size: int
    :param chunk_overlap: Number of characters shared between one chunk and the
                          next, which helps keep context across chunk borders
    :type  chunk_overlap: int

    :returns: List of chunks, each at most ``chunk_size`` characters long;
              an empty list for empty text
    :rtype: list[str]

    :raises ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap``
                        is negative or not smaller than ``chunk_size``
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # An overlap >= chunk_size would never advance the window (endless loop).
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")

    step = chunk_size - chunk_overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start: end])
        # The text is fully covered; a further chunk would only repeat the overlap.
        if end >= text_length:
            break
        start += step
    return chunks

In [9]:
# Load documents from the directory
# (each .txt file found there becomes one {"id": filename, "text": contents} record)
directory_path = "./news_articles"
documents = load_documents_from_directory(directory_path)

Loading documents...
Loading Completed. Loaded 21 documents


In [10]:
# Split every loaded document into chunks and collect one record per chunk
print("Creating chunks data...")
chunked_documents = []
for doc in documents:
    # Chunk ids have the form "<document id>_chunk<N>", with N starting at 1
    chunked_documents.extend(
        {"id": f"{doc['id']}_chunk{number}", "text": piece}
        for number, piece in enumerate(split_text(doc["text"]), start=1)
    )
print(f"Creating chunks data is completed. Total chunks: {len(chunked_documents)}")

Creating chunks data...
Creating chunks data is completed. Total chunks: 184


### <span style="color:#C738BD; font-weight: bold;">Creating Embeddings and Inserting into Chromadb</span>

In [11]:
# Function to generate embeddings using OpenAI API
def get_openai_embedding(text: str):
    """
    Request an embedding for the given text from the OpenAI embeddings API.

    The request is made with the notebook-level ``client`` and the
    "text-embedding-3-small" model.

    :param text: Text to embed
    :type  text: str

    :returns: The ``embedding`` attribute of the first entry in the response data
    """
    # Only one input is sent, so the result is read from the first response entry
    return client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    ).data[0].embedding

In [13]:
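# Generate embeddings for the document chunks.
# Disabled by default: the cached embeddings in gpt-embedding.pkl are loaded instead; uncomment to regenerate.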
# for doc in tqdm(chunked_documents):
#     doc["embedding"] = get_openai_embedding(doc["text"])

100%|██████████| 184/184 [01:30<00:00,  2.04it/s]


In [None]:
# Save the embeddings to a pickle file to avoid paying for repeated OpenAI API calls
# with open("gpt-embedding.pkl", "wb") as file:
#     pickle.dump(chunked_documents, file)

In [12]:
# Load the cached chunk embeddings when the cache file exists; otherwise compute them
# once with the OpenAI API and write the cache, so the notebook also runs from a fresh
# checkout that has no gpt-embedding.pkl yet.
# NOTE: the cache is not invalidated automatically -- delete gpt-embedding.pkl after
# changing the source documents or the chunking parameters.
if os.path.exists("gpt-embedding.pkl"):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file that was
    # produced by this notebook, never one obtained from an untrusted source.
    with open("gpt-embedding.pkl", "rb") as file:
        chunked_documents = pickle.load(file)
else:
    print("No cached embeddings found. Generating embeddings with the OpenAI API...")
    for doc in tqdm(chunked_documents):
        doc["embedding"] = get_openai_embedding(doc["text"])
    with open("gpt-embedding.pkl", "wb") as file:
        pickle.dump(chunked_documents, file)

In [13]:
# Write every chunk (id, text and precomputed embedding) into the Chroma collection.
# One upsert call is made per chunk so tqdm can show per-chunk progress.
print("Upsert documents with embeddings into Chroma")
for doc in tqdm(chunked_documents):
    collection.upsert(
        ids=[doc["id"]], documents=[doc["text"]], embeddings=[doc["embedding"]]
    )
# Report completion together with the number of chunks that were processed.
print(f"Upsert completed. Upserted {len(chunked_documents)} documents into Chroma")

Upsert documents with embeddings into Chroma


  0%|          | 0/184 [00:00<?, ?it/s]

: 