# Retrieval Augmented Generation

## Dataset


## Chosen Embedding models:

- GloVe: Global Vectors for Word Representation, by Jeffrey Pennington, Richard Socher, and Christopher D. Manning
    - github: https://github.com/stanfordnlp/GloVe
    - research paper:
        - https://nlp.stanford.edu/pubs/glove.pdf
        - https://nlp.stanford.edu/projects/glove/
- OpenAI
- SBert


## 2.0 Implement Embedding Generation



In [4]:
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import numpy as np

def preprocess_text(text):
    """Trim surrounding whitespace, then turn every newline into a space."""
    trimmed = text.strip()
    return " ".join(trimmed.split("\n"))

# Loaded SentenceTransformer instances, keyed by model name, so the model is
# constructed once instead of on every call.
_SBERT_MODEL_CACHE = {}


def sbert_embed(text):
    """Encode *text* with the 'all-MiniLM-L6-v2' SentenceTransformer model.

    The model instance is created on first use and reused afterwards; this
    function is called once per chunk, so rebuilding the model each time
    would repeat the load for every chunk.

    Args:
        text: Input passed straight to ``model.encode``.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for *text*.
    """
    model_name = 'all-MiniLM-L6-v2'
    model = _SBERT_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SBERT_MODEL_CACHE[model_name] = model
    return model.encode(text)

def openai_embed(text):
    """Request a "text-embedding-ada-002" embedding for *text* from OpenAI.

    A new ``OpenAI`` client is created for each call. The embedding of the
    first item in the response is returned.
    """
    api = OpenAI()
    result = api.embeddings.create(model="text-embedding-ada-002", input=text)
    first_item = result.data[0]
    return first_item.embedding

# def glove_embed(text):
def glove_embed(text, glove_embeddings, vector_size=300):
    """Embed *text* as the mean of the GloVe vectors of its known words.

    The text is split on whitespace and each token is lower-cased before the
    lookup; tokens missing from *glove_embeddings* are ignored.

    Args:
        text: String to embed.
        glove_embeddings: Mapping from word to its vector.
        vector_size: Length of the zero vector returned when no token is found.

    Returns:
        The element-wise mean of the matched vectors, or ``np.zeros(vector_size)``
        when nothing matched.
    """
    lowered = (token.lower() for token in text.split())
    known = [glove_embeddings[token] for token in lowered if token in glove_embeddings]

    if not known:
        return np.zeros(vector_size)
    return np.mean(known, axis=0)



In [None]:
from qdrant_client import QdrantClient

# Connect to your Qdrant instance (adjust host/port if needed)
client = QdrantClient(host="localhost", port=6333)

# List all collections
collections = client.get_collections()
print(collections)


collections=[CollectionDescription(name='sbert_embedding_collection'), CollectionDescription(name='my_collection'), CollectionDescription(name='glove_embedding_collection')]


  client = QdrantClient(host="localhost", port=6333)


### 3.0 Generate Embeddings

In [13]:
from pathlib import Path
from PyPDF2 import PdfReader

def chunk_text(text, max_tokens=500):
    """Yield consecutive pieces of *text*, each holding at most max_tokens words.

    Words are whitespace-separated and are re-joined with single spaces, so a
    word count stands in for a token count.
    """
    tokens = text.split()
    yield from (
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    )


def load_pdfs_from_folder(folder_path):
    """Read every ``*.pdf`` directly inside *folder_path* and extract its text.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``(path_as_str, full_text)`` tuples, ordered by path so that
        repeated runs see the documents in the same order.
    """
    pdf_texts = []

    # glob() gives no ordering guarantee; sorting keeps the document order
    # (and anything derived from it, such as positional ids) reproducible.
    for pdf_path in sorted(Path(folder_path).glob("*.pdf")):
        reader = PdfReader(str(pdf_path))
        # extract_text() can come back empty/None for a page, hence `or ""`.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        pdf_texts.append((str(pdf_path), text))

    return pdf_texts

# load embeddings
def load_embeddings(glove_file_path):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are converted to floats. Blank lines are skipped
    instead of raising ``IndexError``.

    Args:
        glove_file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each word to a list of floats.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file_path, "r", encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # blank or whitespace-only line
            word, *values = parts
            embeddings[word] = [float(value) for value in values]
    return embeddings


def test_embedding(embedding_model):
    """Embed every 500-word chunk of the PDFs under ./data/v1/docs.

    Args:
        embedding_model: One of "glove", "sbert" or "open_ai".

    Returns:
        For "glove": a tuple ``(embedded_pdfs, glove_embeddings)``.
        For "sbert" / "open_ai": the ``embedded_pdfs`` list only.
        For any other value: a message is printed and an empty list returned.
        ``embedded_pdfs`` holds ``(filename, embedding, chunk)`` tuples.
    """
    documents = load_pdfs_from_folder("./data/v1/docs")

    def embed_all(embed_one):
        # One (filename, embedding, chunk) tuple per chunk of every document.
        return [
            (filename, embed_one(piece), piece)
            for filename, text in documents
            for piece in chunk_text(text, max_tokens=500)
        ]

    if embedding_model == "glove":
        # The GloVe table is returned as well so callers can embed queries later.
        glove_embeddings = load_embeddings("glove.6B/glove.6B.300d.txt")
        embedded = embed_all(
            lambda piece: glove_embed(piece, glove_embeddings, vector_size=300)
        )
        return embedded, glove_embeddings

    if embedding_model == "sbert":
        return embed_all(sbert_embed)

    if embedding_model == "open_ai":
        return embed_all(openai_embed)

    print(f"embedding model {embedding_model} is not in this testing code")
    return []

# Run each embedding

In [14]:
glove_embedded_pdfs, glove_embeddings = test_embedding("glove")

In [None]:
sbert_embedded_pdfs = test_embedding("sbert")

In [None]:
openai_embedded_pdfs = test_embedding("open_ai")

# Save to Json files

In [12]:
import json

def save_embeddings_to_jsonl(data, output_path="openai_embedded_pdfs.jsonl"):
    """Write ``(filename, embedding, text)`` records to a JSON Lines file.

    Each record becomes one line holding a JSON object with the keys
    "filename", "embedding" and "text". The file is overwritten.

    Args:
        data: Iterable of ``(filename, embedding, text)`` tuples. The embedding
            may be a NumPy array or a plain sequence of numbers.
        output_path: Destination file path.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for filename, embedding, text in data:
            # Only array-like objects expose tolist(); a plain list does not,
            # so calling it unconditionally on lists raised AttributeError.
            if hasattr(embedding, "tolist"):
                vector = embedding.tolist()
            else:
                vector = list(embedding)
            record = {
                "filename": filename,
                "embedding": vector,
                "text": text
            }
            f.write(json.dumps(record) + "\n")


In [None]:
save_embeddings_to_jsonl(glove_embedded_pdfs, "glove_embedded_pdfs.jsonl")

In [None]:
save_embeddings_to_jsonl(sbert_embedded_pdfs, "sbert_embedded_pdfs.jsonl")

In [None]:
save_embeddings_to_jsonl(openai_embedded_pdfs, "openai_embedded_pdfs.jsonl")

# Retrieve saved data from Json

In [None]:
import json
def load_embeddings_from_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Whitespace-only lines (for example a trailing empty line) are skipped
    rather than being handed to ``json.loads``, which would raise.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# NOTE: these assignments rebind the three names to lists of parsed JSON
# records (one per file line), not the (filename, embedding, chunk) tuples
# built by test_embedding().
# Presumably each record is a dict with "filename" / "embedding" / "text"
# keys, as written by save_embeddings_to_jsonl — confirm against the files.
glove_embedded_pdfs = load_embeddings_from_jsonl("./glove_embedded_pdfs.jsonl")
sbert_embedded_pdfs = load_embeddings_from_jsonl("./sbert_embedded_pdfs.jsonl")
openai_embedded_pdfs = load_embeddings_from_jsonl("./openai_embedded_pdfs.jsonl")

### 4.0 Set Up Vector Database

* Command to type in terminal: ./qdrant

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

client = QdrantClient("localhost", port=6333)

# The GloVe vectors in this notebook come from glove.6B.300d.txt and are
# embedded with vector_size=300, so the collection must be 300-dimensional
# (384 is the size used for the SBERT collection).
client.create_collection(
    collection_name="glove_embedding_collection",
    vectors_config=VectorParams(size=300, distance=Distance.COSINE)
)

True

In [None]:
# # Suppose you have:
# glove_embedded_pdfs = [vector1, vector2, vector3, ...]
# texts = ["Text for doc 1", "Text for doc 2", "Text for doc 3", ...]

# batch_size = 64

# for i in range(0, len(glove_embedded_pdfs), batch_size):
#     batch = glove_embedded_pdfs[i:i+batch_size]
    
#     points = [
#         PointStruct(
#             id=i+j,
#             vector=embedding,
#             payload={"filename": filename}  # you can also store the filename if you want
#         )
#         for j, (filename, embedding) in enumerate(batch)
#     ]

#     client.upsert(
#         collection_name="glove_embedding_collection",
#         points=points
#     )

# 4.0 and 5.0 Set Up Vector Database and Index Embeddings

Store the embeddings from each model in separate collections or with different
naming conventions.

## Remember to start Qdrant first (run `./qdrant` in a terminal) so the cells below can connect

In [4]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import numpy as np
import json

client = QdrantClient("localhost", port=6333)

def _record_fields(record):
    """Return ``(filename, vector, text)`` from a tuple or dict record."""
    if isinstance(record, dict):
        filename = record["filename"]
        embedding = record["embedding"]
        text = record["text"]
    else:
        filename, embedding, text = record

    # Array-like embeddings are converted with tolist(); other sequences are
    # copied into a plain list.
    vector = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
    return filename, vector, text


def upsert_to_qdrant(embedded_data, model_name, vector_size):
    """(Re)create ``<model_name>_embedding_collection`` and upload all records.

    Args:
        embedded_data: Sequence of records, each either a
            ``(filename, embedding, text)`` tuple or a dict with the keys
            "filename", "embedding" and "text".
        model_name: Prefix used to build the collection name.
        vector_size: Dimensionality configured for the collection.

    Point ids are the record positions in *embedded_data*; the payload stores
    the filename and the chunk text.
    """
    collection_name = f"{model_name}_embedding_collection"

    # NOTE: recreate_collection replaces an existing collection of this name,
    # so every call starts from an empty collection.
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size,
            distance=Distance.COSINE
        )
    )

    batch_size = 64
    for start in range(0, len(embedded_data), batch_size):
        points = []
        for offset, record in enumerate(embedded_data[start:start + batch_size]):
            filename, vector, text = _record_fields(record)
            points.append(
                PointStruct(
                    id=start + offset,
                    vector=vector,
                    payload={
                        "filename": filename,
                        "text": text
                    }
                )
            )

        client.upsert(collection_name=collection_name, points=points)

    print(f"Upserted {len(embedded_data)} vectors to {collection_name}.")


def upsert_to_qdrant_AI(embedded_data, model_name, vector_size):
    """Upload dict-shaped records (as reloaded from JSONL) to Qdrant.

    Kept for existing callers; it shares the implementation of
    ``upsert_to_qdrant``, which handles both dict and tuple records.
    """
    upsert_to_qdrant(embedded_data, model_name, vector_size)

# RAG Search
def search_qdrant(query_text, model_name, vector_size, limit=3):
    """Embed *query_text* and return the text of the closest stored chunks.

    Args:
        query_text: The user query.
        model_name: "glove", "sbert", or "open_ai" / "openai". Both OpenAI
            spellings are accepted because the indexing code in this notebook
            uses "openai" as the model name. The collection searched is
            always ``f"{model_name}_embedding_collection"``.
        vector_size: Only used for the GloVe zero-vector fallback.
        limit: Maximum number of hits requested.

    Returns:
        List of the "text" payload values of the hits (hits without a "text"
        entry are reported with a print and skipped).

    Raises:
        ValueError: If *model_name* is not one of the supported names.
    """
    collection_name = f"{model_name}_embedding_collection"

    if model_name == "glove":
        # Relies on the module-level glove_embeddings table being loaded.
        query_vector = glove_embed(query_text, glove_embeddings, vector_size=vector_size)
    elif model_name == "sbert":
        query_vector = sbert_embed(query_text)
    elif model_name in ("open_ai", "openai"):
        query_vector = openai_embed(query_text)
    else:
        raise ValueError("Invalid model name")

    results = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=limit
    )

    docs = []
    for hit in results:
        payload = hit.payload
        if 'text' in payload:
            docs.append(payload['text'])
        else:
            print("Missing 'text' in payload:", payload)

    return docs


In [5]:
import openai
print(openai.__version__)  # NOTE(review): openai_embed() uses the `OpenAI` client class, which needs openai >= 1.0 rather than 0.28.0 — confirm the intended version


1.78.1


In [9]:
from sentence_transformers import SentenceTransformer
import openai

# sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
pdfs = load_pdfs_from_folder("./data/v1/docs")
# sbert_embedded_pdfs = []

# for filename, text in pdfs:  # Assuming `pdfs` = List[(filename, full_text)]
#     for chunk in chunk_text(text, max_tokens=500):  # Define your chunk_text() earlier
#         embedding = sbert_model.encode(chunk)
#         sbert_embedded_pdfs.append((filename, embedding, chunk))

# upsert_to_qdrant(sbert_embedded_pdfs, model_name="sbert", vector_size=384)

embedded_openai = []

for filename, text in pdfs:
    for chunk in chunk_text(text, max_tokens=500):
        embedding = openai_embed(chunk)
        embedded_openai.append((filename, embedding, chunk))




In [10]:
print(openai_embedded_pdfs[0])


{'filename': 'data/v1/docs/2014-annual-performance-report.pdf', 'embedding': [-0.0009930044179782271, 0.03040623851120472, -0.05109354108572006, -0.017868496477603912, -0.05982931703329086, 0.03523042052984238, -0.009483269415795803, 0.03301677107810974, -0.06632290780544281, 0.04859337583184242, 0.008469829335808754, 0.02169446460902691, 0.03288279473781586, 0.013537784107029438, -0.035120826214551926, 0.07195242494344711, 0.08960121124982834, -0.006811867468059063, -0.048532407730817795, -0.016011826694011688, 0.042757805436849594, -0.009645757265388966, 0.04629439488053322, 0.0009178844629786909, -0.06630199402570724, 0.018715113401412964, -0.015667200088500977, -0.024834593757987022, -0.07042499631643295, -0.05278418958187103, -0.03252672404050827, 0.09703652560710907, 0.059942156076431274, 0.030862312763929367, 0.025270432233810425, 0.11491391807794571, 0.05684321001172066, -0.012333068065345287, 0.0303468219935894, -0.0031784474849700928, -0.06252648681402206, -0.1085655093193054

In [None]:
print(embedded_openai)

In [None]:
save_embeddings_to_jsonl(embedded_openai, "openai_embedded_pdfs.jsonl")

In [None]:
upsert_to_qdrant_AI(openai_embedded_pdfs, model_name="openai", vector_size=1536)

# 6.0 Implement RAG
Create a simple RAG pipeline using each set of embeddings

In [23]:
def simple_rag(query, embed_fn, collection_name, llm_fn, top_k=5, **embed_args):
    """Answer *query* with retrieval-augmented generation.

    Steps: connect to the local Qdrant server, embed the query with
    ``embed_fn(query, **embed_args)``, fetch the ``top_k`` nearest points from
    *collection_name*, join their "text" payloads into a context, and pass the
    resulting prompt to *llm_fn*.

    Returns:
        Whatever ``llm_fn(prompt)`` returns.
    """
    from qdrant_client import QdrantClient

    qdrant = QdrantClient(host="localhost", port=6333)

    # The query is embedded as the argument is evaluated, i.e. after the
    # client has been created and before the search request is sent.
    hits = qdrant.search(
        collection_name=collection_name,
        query_vector=embed_fn(query, **embed_args),
        limit=top_k,
    )

    # Retrieved chunks are separated by a blank line inside the prompt.
    context = "\n\n".join(hit.payload['text'] for hit in hits)
    prompt = f"Use the context to answer:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    return llm_fn(prompt)


In [6]:
import openai

def openai_generate(prompt):
    """Send *prompt* to gpt-3.5-turbo and return the reply text.

    Uses the client-based interface of openai >= 1.0 (the same generation of
    the library that ``from openai import OpenAI`` elsewhere in this notebook
    requires); the module-level ``openai.ChatCompletion`` call is not
    available there.

    Args:
        prompt: Full prompt, sent as a single user message.

    Returns:
        The content of the first choice's message.
    """
    llm_client = openai.OpenAI()
    response = llm_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.2
    )
    return response.choices[0].message.content


In [10]:
collections = client.get_collections()
print(collections)


NameError: name 'client' is not defined

In [None]:
search_result = client.search(
    collection_name=f"{embed_type}_collection",
    query_vector=query_embedding,
    limit=5
)

In [26]:
query = "What was Microsoft's net cash from operating activities in Q3 2022?"

# glove_answer = simple_rag(query, glove_embed, "glove_embedding_collection", openai_generate, glove_embeddings=glove_embeddings)
# print("Glove RAG Answer:\n", glove_answer)

sbert_answer = simple_rag(query, sbert_embed, "sbert_embedding_collection", openai_generate)
print("SBERT RAG Answer:\n", sbert_answer)

# openai_answer = simple_rag(query, openai_embed, "openai_embedding_collection", openai_generate)
# print("OpenAI RAG Answer:\n", openai_answer)


SafetensorError: Error while deserializing header: HeaderTooLarge

In [None]:
expected = "23.4 billion"
from difflib import SequenceMatcher

def compare(a, b):
    """Return the case-insensitive SequenceMatcher similarity ratio of a and b."""
    matcher = SequenceMatcher(a=a.lower(), b=b.lower())
    return matcher.ratio()

print("🔍 SBERT score:", compare(sbert_answer, expected))
print("🔍 OpenAI score:", compare(openai_answer, expected))

In [None]:
# query_embedding = embed_query("What is photosynthesis?")  # Get embedding of query

# search_result = client.search(
#     collection_name="glove_embedding_collection",
#     query_vector=query_embedding,
#     limit=3
# )

# # Extract relevant documents
# docs = [hit.payload['text'] for hit in search_result]

# # Combine docs + user query to create a context-rich prompt for LLM
# rag_prompt = "Use the following documents to answer:\n\n" + "\n\n".join(docs) + "\n\nQuestion: What is photosynthesis?"

# 7.0 Evaluate Performance

Test the RAG system with a set of queries and evaluate based on metrics like
- relevance, 
- accuracy, and 
- response quality

This project ended up not working, so I couldn't evaluate anything.

In [3]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("paraphrase-albert-small-v2")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# (3, 768) in the recorded run of this cell

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# Recorded output of this cell:
# tensor([[1.0000, 0.5926, 0.1182],
#         [0.5926, 1.0000, 0.1695],
#         [0.1182, 0.1695, 1.0000]])

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(3, 768)
tensor([[1.0000, 0.5926, 0.1182],
        [0.5926, 1.0000, 0.1695],
        [0.1182, 0.1695, 1.0000]])
