# Retrieval Augmented Generation

## Dataset


## Chosen Embedding models:

- GloVe: Global Vectors for Word Representation by Jeffrey Pennington, Richard Socher, Christopher D. Manning
    - github: https://github.com/stanfordnlp/GloVe
    - research paper:
        - https://nlp.stanford.edu/pubs/glove.pdf
        - https://nlp.stanford.edu/projects/glove/
- 
- 


## 2.0 Implement Embedding Generation



In [1]:
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import numpy as np

def preprocess_text(text):
    """Trim surrounding whitespace and flatten newlines into spaces.

    The text is stripped first, so only newlines that sit between other
    characters are converted; each one becomes a single space.
    """
    trimmed = text.strip()
    return trimmed.replace('\n', ' ')

def sbert_embed(text):
    """Encode text with the 'all-MiniLM-L6-v2' SentenceTransformer model.

    The model is built on the first call and cached on the function object,
    so embedding many chunks (or queries) does not reconstruct it each time.

    Args:
        text: Input passed straight through to ``model.encode``.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``text``.
    """
    model = getattr(sbert_embed, "_model", None)
    if model is None:
        # Constructing the model is the expensive step; do it only once.
        model = SentenceTransformer('all-MiniLM-L6-v2')
        sbert_embed._model = model
    return model.encode(text)

def openai_embed(text):
    """Fetch an embedding for text using the "text-embedding-ada-002" model.

    A fresh ``OpenAI`` client is created on every call.
    NOTE(review): credentials presumably come from the environment, since
    none are passed here — confirm.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    api = OpenAI()
    result = api.embeddings.create(model="text-embedding-ada-002", input=text)
    first_item = result.data[0]
    return first_item.embedding

# def glove_embed(text):
def glove_embed(text, glove_embeddings, vector_size=300):
    """Average the GloVe vectors of the known words in text.

    Args:
        text: String that is split on whitespace; each token is lowercased
            before lookup.
        glove_embeddings: Mapping from word to vector.
        vector_size: Length of the zero vector returned when no token of
            ``text`` is present in ``glove_embeddings``.

    Returns:
        The element-wise mean of the matched vectors, or
        ``np.zeros(vector_size)`` if nothing matched.
    """
    lowered_tokens = (token.lower() for token in text.split())
    matched = [
        glove_embeddings[token]
        for token in lowered_tokens
        if token in glove_embeddings
    ]

    if not matched:
        return np.zeros(vector_size)
    return np.mean(matched, axis=0)



### 3.0 Generate Embeddings

In [None]:
from pathlib import Path
from PyPDF2 import PdfReader

def chunk_text(text, max_tokens=500):
    """Yield successive pieces of text holding at most max_tokens words each.

    Words are obtained by whitespace splitting (used here as a rough stand-in
    for tokens) and re-joined with single spaces, so the original whitespace
    is not preserved.
    """
    tokens = text.split()
    yield from (
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    )


def load_pdfs_from_folder(folder_path):
    """Read every ``*.pdf`` directly inside folder_path and extract its text.

    Returns:
        A list of ``(path_as_str, full_text)`` tuples, one per PDF, in the
        order the glob yields them. The text of all pages is concatenated
        with no separator.
    """
    results = []

    for pdf_file in Path(folder_path).glob("*.pdf"):
        pdf_name = str(pdf_file)
        document = PdfReader(pdf_name)
        # extract_text() may give back nothing for a page; treat that as "".
        page_texts = [(page.extract_text() or "") for page in document.pages]
        results.append((pdf_name, "".join(page_texts)))

    return results

# load embeddings
def load_embeddings(glove_file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-empty line is expected to hold a word followed by its float
    components. Blank lines (for example a trailing empty line) are skipped
    instead of raising ``IndexError``.

    Args:
        glove_file_path: Path to the UTF-8 encoded vectors file.

    Returns:
        Dict mapping each word to its vector as a list of floats.

    Raises:
        ValueError: If a component on a line cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file_path, "r", encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Nothing on this line; there is no word to index.
                continue
            word, *components = parts
            embeddings[word] = [float(value) for value in components]
    return embeddings


def test_embedding(embedding_model):
    """Embed every 500-word chunk of the PDFs in ./data/v1/docs.

    Args:
        embedding_model: One of "glove", "sbert" or "open_ai".

    Returns:
        For "glove": a tuple ``(embedded_pdfs, glove_embeddings)`` where the
        second item is the loaded word-vector dict.
        For "sbert" / "open_ai": just ``embedded_pdfs``.
        For any other value a message is printed and an empty list returned.
        ``embedded_pdfs`` is a list of ``(filename, embedding, chunk)``.
    """
    pdfs = load_pdfs_from_folder("./data/v1/docs")

    def embed_all(embed_chunk):
        # Same chunking for every model so the collections are comparable.
        return [
            (filename, embed_chunk(chunk), chunk)
            for filename, text in pdfs
            for chunk in chunk_text(text, max_tokens=500)
        ]

    if embedding_model == "glove":
        glove_embeddings = load_embeddings("glove.6B/glove.6B.300d.txt")
        embedded = embed_all(
            lambda chunk: glove_embed(chunk, glove_embeddings, vector_size=300)
        )
        # The vector table is handed back too so callers can embed queries.
        return embedded, glove_embeddings

    if embedding_model == "sbert":
        return embed_all(sbert_embed)

    if embedding_model == "open_ai":
        return embed_all(openai_embed)

    print(f"embedding model {embedding_model} is not in this testing code")
    return []

# Run each embedding

In [8]:
# Embed the PDF chunks with GloVe. The "glove" branch also returns the loaded
# word-vector dict, which search_qdrant later reads as the global glove_embeddings.
glove_embedded_pdfs, glove_embeddings = test_embedding("glove")

In [24]:
# Embed the PDF chunks with SBERT; this branch returns only the list of
# (filename, embedding, chunk) tuples.
sbert_embedded_pdfs = test_embedding("sbert")

In [10]:
# Embed the PDF chunks through the OpenAI embeddings API (one request per chunk).
openai_embedded_pdfs = test_embedding("open_ai")

# Save to JSONL files

In [5]:
import json

def save_embeddings_to_jsonl(data, output_path="openai_embedded_pdfs.jsonl"):
    """Write embedded chunks to a JSON Lines file, one record per line.

    Args:
        data: Iterable of ``(filename, embedding, text)`` tuples. The
            embedding may be a NumPy array, a plain list (as returned by
            ``openai_embed``) or any other iterable of numbers.
        output_path: Destination file; it is overwritten if it exists.

    Each line is a JSON object with the keys "filename", "embedding" and
    "text".
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for filename, embedding, text in data:
            if hasattr(embedding, "tolist"):
                # NumPy arrays: tolist() yields JSON-serialisable Python floats.
                vector = embedding.tolist()
            else:
                # Plain lists/tuples have no tolist(); coerce each item so
                # NumPy scalars inside a list are serialisable as well.
                vector = [float(value) for value in embedding]
            record = {
                "filename": filename,
                "embedding": vector,
                "text": text
            }
            f.write(json.dumps(record) + "\n")


In [6]:
# Write the GloVe embeddings to their own JSONL file.
save_embeddings_to_jsonl(glove_embedded_pdfs, "glove_embedded_pdfs.jsonl")

In [25]:
# Write the SBERT embeddings to their own JSONL file.
save_embeddings_to_jsonl(sbert_embedded_pdfs, "sbert_embedded_pdfs.jsonl")

In [None]:
# No path is given, so this writes to the default "openai_embedded_pdfs.jsonl".
save_embeddings_to_jsonl(openai_embedded_pdfs)

# Retrieve saved data from JSONL

In [None]:
def load_embeddings_from_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Blank lines (such as a trailing empty line) are skipped instead of
    raising ``json.JSONDecodeError``.

    Args:
        path: UTF-8 encoded file with one JSON document per line.

    Returns:
        List of the decoded objects, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]



### 4.0 Set Up Vector Database

* Command to type in terminal: ./qdrant

In [None]:
# from qdrant_client import QdrantClient
# from qdrant_client.models import VectorParams, Distance

# client = QdrantClient("localhost", port=6333)

# client.create_collection(
#     collection_name="glove_embedding_collection",
#     vectors_config=VectorParams(size=384, distance=Distance.COSINE)
# )

In [None]:
# # Suppose you have:
# glove_embedded_pdfs = [vector1, vector2, vector3, ...]
# texts = ["Text for doc 1", "Text for doc 2", "Text for doc 3", ...]

# batch_size = 64

# for i in range(0, len(glove_embedded_pdfs), batch_size):
#     batch = glove_embedded_pdfs[i:i+batch_size]
    
#     points = [
#         PointStruct(
#             id=i+j,
#             vector=embedding,
#             payload={"filename": filename}  # you can also store the filename if you want
#         )
#         for j, (filename, embedding) in enumerate(batch)
#     ]

#     client.upsert(
#         collection_name="glove_embedding_collection",
#         points=points
#     )

# 4.0 and 5.0 Set Up Vector Database and Index Embeddings

Store the embeddings from each model in separate collections or with different
naming conventions.

## Remember to start the Qdrant server (`./qdrant` in a terminal) before running the cells below

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import numpy as np
import json

# Shared client used by upsert_to_qdrant and search_qdrant; it targets
# localhost on port 6333.
client = QdrantClient("localhost", port=6333)

def upsert_to_qdrant(embedded_data, model_name, vector_size):
    """Store embedded chunks in the "<model_name>_embedding_collection".

    Args:
        embedded_data: Sequence of ``(filename, embedding, text)`` tuples.
        model_name: Prefix used to build the collection name.
        vector_size: Dimensionality configured for the collection
            (cosine distance).

    Points are uploaded in batches of 64; each point's id is its position in
    ``embedded_data`` and its payload carries "filename" and "text".

    NOTE(review): ``recreate_collection`` presumably drops an existing
    collection of the same name before creating it, so every call starts
    from an empty collection — confirm against the qdrant-client docs.
    """
    collection_name = f"{model_name}_embedding_collection"
    vector_settings = VectorParams(size=vector_size, distance=Distance.COSINE)

    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )

    batch_size = 64
    total = len(embedded_data)
    for start in range(0, total, batch_size):
        points = []
        # Numbering from `start` keeps ids unique across batches.
        for point_id, (filename, embedding, text) in enumerate(
            embedded_data[start:start + batch_size], start=start
        ):
            if hasattr(embedding, "tolist"):
                vector = embedding.tolist()
            else:
                vector = list(embedding)
            points.append(
                PointStruct(
                    id=point_id,
                    vector=vector,
                    payload={"filename": filename, "text": text},
                )
            )

        client.upsert(collection_name=collection_name, points=points)

    print(f"Upserted {total} vectors to {collection_name}.")


# RAG Search
def search_qdrant(query_text, model_name, vector_size, limit=3):
    """Return the stored texts closest to query_text for one embedding model.

    Args:
        query_text: The question to embed and search with.
        model_name: "glove", "sbert" or "open_ai"; selects both the embedder
            and the "<model_name>_embedding_collection" to query.
        vector_size: Only used for "glove", where it is forwarded to
            ``glove_embed`` (which reads the global ``glove_embeddings``).
        limit: Maximum number of hits requested.

    Returns:
        List of the "text" payload values of the hits. Hits whose payload has
        no "text" key are reported with ``print`` and left out.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    collection_name = f"{model_name}_embedding_collection"

    if model_name not in ("glove", "sbert", "open_ai"):
        raise ValueError("Invalid model name")

    if model_name == "glove":
        query_vector = glove_embed(query_text, glove_embeddings, vector_size=vector_size)
    elif model_name == "sbert":
        query_vector = sbert_embed(query_text)
    else:
        query_vector = openai_embed(query_text)

    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=limit,
    )

    docs = []
    for hit in hits:
        payload = hit.payload
        if 'text' not in payload:
            print("Missing 'text' in payload:", payload)
            continue
        docs.append(payload['text'])

    return docs


# 6.0 Implement RAG
Create a simple RAG pipeline using each set of embeddings

In [None]:
# Minimal RAG example against the GloVe collection.
question = "What is photosynthesis?"

# The query must be embedded with the same model as the collection being
# searched; glove_embed and the global glove_embeddings (set when the GloVe
# embeddings were generated) replace the previously undefined embed_query.
query_embedding = glove_embed(question, glove_embeddings, vector_size=300)

search_result = client.search(
    collection_name="glove_embedding_collection",
    query_vector=query_embedding,
    limit=3
)

# Extract relevant documents
docs = [hit.payload['text'] for hit in search_result]

# Combine docs + user query to create a context-rich prompt for LLM
rag_prompt = (
    "Use the following documents to answer:\n\n"
    + "\n\n".join(docs)
    + f"\n\nQuestion: {question}"
)


# 7.0 Evaluate Performance

Test the RAG system with a set of queries and evaluate based on metrics like
- relevance, 
- accuracy, and 
- response quality