# Retrieval-Augmented Generation (RAG)

Install the libraries needed to run this notebook (Hugging Face `transformers` plus supporting packages such as `wikipedia`).

In [None]:
!pip install transformers wikipedia

In [None]:
import torch
import torch.nn.functional as F

## Document ingestion

In [None]:
import wikipedia

def extract_wikipedia_pages(page_titles):
    """
    Downloads Wikipedia pages and stores their text in a dictionary.

    Titles that cannot be resolved (missing page or ambiguous title) are
    reported on stdout and skipped, so the result may contain fewer entries
    than `page_titles`.

    Args:
        page_titles: A list of Wikipedia page titles to extract.

    Returns:
        A dictionary mapping `page.title` (the title reported by the
        `wikipedia` package, not necessarily the requested string) to the
        page text, with every run of whitespace collapsed to a single space.
    """

    page_data = {}
    for title in page_titles:
        try:
            page = wikipedia.page(title)
            # Collapse whitespace instead of deleting "\n": removing the
            # newline outright glues the last word of a line to the first
            # word of the next one.
            content = " ".join(page.content.split())
            page_data[page.title] = content
        except wikipedia.exceptions.PageError:
            print(f"Page '{title}' not found.")
        except wikipedia.exceptions.DisambiguationError as e:
            print(f"Disambiguation error for '{title}': {e.options}")

    return page_data

In [None]:
# Titles of the Wikipedia pages that make up the corpus.
page_titles = [
    "Roger Apéry",
    "Owen Willans Richardson",
    "Otto Sackur",
    "Ludvig Lorenz",
    "Klaus von Klitzing",
    "Henri Victor Regnault",
    "Erwin Madelung",
]

# Uncomment the next line to scrape the pages from Wikipedia
# wikipedia_data = extract_wikipedia_pages(page_titles)

Save the dictionary using `json.dump()`:

In [None]:
import json

# with open('wikipedia_data.json', 'w') as f:
#     json.dump(wikipedia_data, f, indent=4)

Load the dictionary using `json.load()`:

In [None]:
# Prefer the cached copy; if it does not exist yet (fresh checkout), scrape the
# pages once and write the cache so the notebook survives Restart & Run All.
try:
    with open('wikipedia_data.json', 'r', encoding='utf-8') as f:
        wikipedia_data = json.load(f)
except FileNotFoundError:
    wikipedia_data = extract_wikipedia_pages(page_titles)
    with open('wikipedia_data.json', 'w', encoding='utf-8') as f:
        json.dump(wikipedia_data, f, indent=4)

In [None]:
# Length of each loaded page, one line per page.
for page_text in wikipedia_data.values():
    print(len(page_text))

## Document pre-processing

We load just the tokenizer:

In [None]:
from transformers import AutoTokenizer

# Load only the tokenizer of the "nomic-ai/modernbert-embed-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/modernbert-embed-base")
# Maximum sequence length reported by the tokenizer, displayed as the cell output.
model_max_length = tokenizer.model_max_length
model_max_length

In [None]:
# Round trip: encode to token ids, then decode back to text to see what the
# tokenizer produces.
# NOTE(review): encode() is given a two-element list; presumably it is treated
# as a (text, text_pair) input rather than a batch — confirm.
encoded_text = tokenizer.encode(["hello", "how are you?"])
tokenizer.decode(encoded_text)

In [None]:
def text_splitting(text, chunk_length = 300, chunk_overlap = 100):
    """
    Splits a text into overlapping chunks whose size is measured in tokens.

    The text is tokenized with the notebook-level `tokenizer`, cut into
    windows of at most `chunk_length` tokens in which two consecutive windows
    share `chunk_overlap` tokens, and every window is decoded back to a string.

    Args:
        text: The text to split.
        chunk_length: Maximum number of tokens per chunk (must be > 0).
        chunk_overlap: Number of tokens shared by two consecutive chunks
            (must satisfy 0 <= chunk_overlap < chunk_length).

    Returns:
        A list of strings (empty if `text` is empty).

    Raises:
        ValueError: If the window would not advance through the text.
    """
    if chunk_length <= 0 or not 0 <= chunk_overlap < chunk_length:
        raise ValueError("need chunk_length > 0 and 0 <= chunk_overlap < chunk_length")
    if not text:
        return []

    # Skip special tokens here: they would otherwise be decoded into the chunks.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    stride = chunk_length - chunk_overlap

    splits = []
    for start in range(0, len(token_ids), stride):
        window = token_ids[start:start + chunk_length]
        splits.append(tokenizer.decode(window))
        if start + chunk_length >= len(token_ids):
            # This window already reaches the end; a further one would only
            # repeat the overlap.
            break
    return splits

In [None]:
# Split every page into chunks, keyed by page title.
wikipedia_data_splits = {
    title: text_splitting(page_text) for title, page_text in wikipedia_data.items()
}

# Preview the first page that was actually loaded. Indexing with
# page_titles[0] raises KeyError when that page was skipped during download or
# is stored under a different title than the one requested.
first_key = next(iter(wikipedia_data_splits))
wikipedia_data_splits[first_key][:2]

In [None]:
# Number of chunks per page: smallest, largest and mean.
chunk_counts = [len(splits) for splits in wikipedia_data_splits.values()]
min_doc = min(chunk_counts)
max_doc = max(chunk_counts)
av_doc = sum(chunk_counts) / len(chunk_counts)

min_doc, max_doc, av_doc

## Generating embeddings

Now we load the embedder:

In [None]:
from transformers import AutoModel

# Load the embedding model from the same checkpoint name as the tokenizer;
# the bare `model` expression displays its repr as the cell output.
model = AutoModel.from_pretrained("nomic-ai/modernbert-embed-base")
model

In [None]:
# Run one short sentence through the model to read off the embedding width.
inputs = tokenizer("Hello, world!", return_tensors="pt")
outputs = model(**inputs)

# Size of axis 2 of last_hidden_state; reused below to allocate the database.
output_dim = outputs.last_hidden_state.size(2)
output_dim

In [None]:
def embed(chunk_list, doc_type="document"):
    """
    Embeds a list of texts into unit-length vectors.

    Each text is prefixed with "search_<doc_type>: " before tokenization. The
    token embeddings of the model are then averaged over the real
    (non-padding) tokens and L2-normalized.

    Args:
        chunk_list: A list of strings to embed.
        doc_type: Suffix of the prefix added to each text, e.g. "document"
            or "query".

    Returns:
        A tensor of shape (batch, output_dim) that does not track gradients.
    """
    encoded_docs = tokenizer(["search_{}: {}".format(doc_type, chunk) for chunk in chunk_list],
                                 padding = True,
                                 return_tensors="pt")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_docs) # (batch, input_length, output_dim)
    token_embeddings = output.last_hidden_state

    # Mask out padding positions before pooling; otherwise the embedding of a
    # text depends on how much padding the rest of the batch forces on it.
    mask = encoded_docs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = torch.sum(token_embeddings * mask, 1)
    n_tokens = mask.sum(1).clamp(min=1e-9)  # guard against division by zero

    output_embeddings = F.normalize(summed / n_tokens, p=2, dim=1)
    return output_embeddings # (batch, output_dim)

In [None]:
# Smoke test: embed three short strings and inspect the shape of the result.
embed(["hello", "another document", "and another one"]).shape

**Exercise**: chunks may lack context. The idea of `contextual embeddings` is to ask an LLM to write some context about the chunk (given the full document and the chunk), and to embed the chunk together with the context.
Implement this idea here (choose a simple enough model and the appropriate task!).

In [None]:
def populate_database(dic_splits, batch_size = 1):
    """
    Embeds every chunk of every document and stacks the vectors in a matrix.

    Args:
        dic_splits: A dictionary mapping a document title to its list of chunks.
        batch_size: Number of chunks passed to `embed` per call (at least 1).

    Returns:
        chunk_list: All chunks as one flat list, documents in dictionary order.
        vectorial_database: A tensor of shape (n_chunks, output_dim) whose
            row i is the embedding of chunk_list[i].

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunk_list = [chunk for doc in dic_splits for chunk in dic_splits[doc]]
    n_chunks = len(chunk_list)
    vectorial_database = torch.zeros([n_chunks, output_dim], requires_grad = False)

    # The database is only read afterwards, so no gradients are needed.
    with torch.no_grad():
        for start in range(0, n_chunks, batch_size):
            stop = start + batch_size  # slicing clamps the last, shorter batch
            vectorial_database[start:stop] = embed(chunk_list[start:stop], "document")

    return chunk_list, vectorial_database

# Uncomment this to populate the database
# chunk_list, vectorial_database = populate_database(wikipedia_data_splits)

Save the vectorial database using `torch.save()`:

In [None]:
# torch.save(vectorial_database, 'vectorial_database.pth')

# with open('chunk_list.json', 'w') as f:
#     json.dump(chunk_list, f, indent=4)

Load the database using `torch.load()`:

In [None]:
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered .pth file cannot run arbitrary code when it is loaded.
vectorial_database = torch.load('vectorial_database.pth', weights_only=True)
vectorial_database.requires_grad_(False)

with open('chunk_list.json', 'r', encoding='utf-8') as f:
    chunk_list = json.load(f)

# Retrieval indexes chunk_list with row numbers of the database, so the two
# must have the same length.
assert len(chunk_list) == vectorial_database.size(0), "database and chunk list are out of sync"

In [None]:
# Number of chunks and shape of the embedding matrix, shown side by side.
len(chunk_list), vectorial_database.shape

In [None]:
# Peek at the first 20 rows: the leading 5 coordinates of each embedding next
# to the first 50 characters of the matching chunk.
preview_rows = vectorial_database[:20]
for row in range(len(preview_rows)):
    print(preview_rows[row][:5], chunk_list[row][:50])

## Retrieval

In [None]:
def similarity(query_embeddings, doc_embeddings):
    """
    Cosine similarity between every query and every document embedding.

    Args:
        query_embeddings: Tensor of shape (n_queries, dim).
        doc_embeddings: Tensor of shape (n_docs, dim).

    Returns:
        Tensor of shape (n_queries, n_docs); entry [i, j] is the cosine
        similarity between query i and document j.
    """
    # Normalizing here makes the result a true cosine even when the inputs are
    # not unit length; for unit-length inputs it changes nothing.
    queries = F.normalize(query_embeddings, p=2, dim=1)
    docs = F.normalize(doc_embeddings, p=2, dim=1)
    return queries @ docs.T

In [None]:
# Toy example: embed two queries (with the "query" prefix) and one document
# (with the "document" prefix), then compare them with `similarity`.
query_embeddings = embed([
    "What is TSNE?",
    "Who is Laurens van der Maaten?",
], "query")

doc_embeddings = embed([
    "TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten",
], "document")

similarity(query_embeddings, doc_embeddings)

In [None]:
def retrieve(query, 
             vectorial_database = vectorial_database, 
             chunk_list = chunk_list, 
             batch_size = 5, 
             topk = 5):
    """
    Retrieves the chunks whose embeddings are most similar to the query.

    The query is embedded with the "query" prefix and scored against every
    row of the database with a dot product. This equals cosine similarity
    as long as the rows are unit-length vectors, which `embed` produces.
    The selected chunks are printed with their score and returned as one
    string, in the same format as `retrieve_SVM`.

    Args:
        query: The question to search for.
        vectorial_database: Tensor with one embedding per row; row i must
            correspond to chunk_list[i].
        chunk_list: The chunk texts.
        batch_size: Number of database rows scored per matrix product
            (at least 1).
        topk: Maximum number of chunks to return.

    Returns:
        The selected chunks joined with newlines, best match first (an empty
        string if the database is empty or `topk` is not positive).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    n_rows = vectorial_database.size(0)
    if n_rows == 0 or topk <= 0:
        return ""

    # Inference only: keep autograd out of the scoring.
    with torch.no_grad():
        query_embedding = embed([query], "query")[0]  # (output_dim,)
        scores = torch.cat([
            vectorial_database[start:start + batch_size] @ query_embedding
            for start in range(0, n_rows, batch_size)
        ])
        best = torch.topk(scores, min(topk, n_rows))

    selected = []
    for score, row in zip(best.values.tolist(), best.indices.tolist()):
        print(f"Score: {score:.4f}\nText:\n", chunk_list[row], "\n")
        selected.append(chunk_list[row])
    return "\n".join(selected)

In [None]:
# Example query against the embedding database.
retrieve("When was Erwin Madelung born?")

**Exercise**: The similarity measure is based on embeddings. A completely different approach is `lexical matching`, i.e. matching keywords from the query against the documents. It is based on `TF-IDF (Term Frequency-Inverse Document Frequency)`, as follows:
* Compute TF-IDF for each chunk
* Score every chunk against the query with BM25, a ranking function built on TF-IDF, and keep the 25 highest-scoring chunks (the "25" in BM25 is part of the algorithm's name, not a result count)

A `reranker` is (yet another) LLM which looks at the query and some chunks and ranks them by relevance. 

Claude combines BM25 with similarity measures as follows:
* Use BM25 to retrieve 25 chunks
* independently, use similarity measure on embeddings to retrieve 25 chunks
* Use a reranker to combine and deduplicate the obtained 50 chunks

Implement this approach.

### Alternative retrieval: SVM

In [None]:
import numpy as np
from sklearn import svm

def retrieve_SVM(query, 
             vectorial_database = vectorial_database, 
             chunk_list = chunk_list, 
             topk = 5):
    """
    Retrieves chunks by training a linear SVM with the query as the only positive.

    The query embedding is stacked on top of the database, a LinearSVC is
    fitted to separate the query (label 1) from all chunks (label 0), and the
    chunks are ranked by their decision-function value. The selected chunks
    are printed with their score and returned as one string.

    Args:
        query: The question to search for.
        vectorial_database: Tensor with one embedding per row; row i must
            correspond to chunk_list[i].
        chunk_list: The chunk texts.
        topk: Maximum number of chunks to return.

    Returns:
        The selected chunks joined with newlines, best match first (an empty
        string if the database is empty).
    """
    if vectorial_database.size(0) == 0:
        return ""  # nothing to rank, and the SVM needs both classes to fit

    query_embedding = embed([query], "query")
    x = np.concatenate([query_embedding.detach().numpy(), vectorial_database.detach().numpy()])
    y = np.zeros(vectorial_database.size(0) + 1)
    y[0] = 1 # we have a single positive example

    clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1, dual="auto")
    clf.fit(x, y)
    similarities = clf.decision_function(x)
    sorted_ix = np.argsort(-similarities)

    # Row 0 of x is the query itself. Drop it by index rather than assuming it
    # is ranked first: if it is not, `sorted_ix[1:]` loses the best chunk and
    # `k-1 == -1` silently returns the last chunk instead.
    top_rows = [k for k in sorted_ix if k != 0][:topk]
    for k in top_rows:
        print(f"Score: {similarities[k]:.4f}\nText:\n", chunk_list[k-1], "\n")
    return "\n".join([chunk_list[k-1] for k in top_rows])

In [None]:
# Same example query, answered with the SVM-based retriever.
retrieve_SVM("When was Erwin Madelung born?")

## Full pipeline

This model only does extractive question answering!

In [None]:
from transformers import AutoModelForQuestionAnswering, pipeline

# Checkpoint name used for both the QA model and its tokenizer.
model_name = "deepset/tinyroberta-squad2"

# Question-answering pipeline, called below as QA(question=..., context=...).
QA = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [None]:
def query(prompt, retriever=None):
    """
    Answers a question by retrieving context and running the QA pipeline on it.

    Args:
        prompt: The question to answer.
        retriever: Optional callable that maps the question to a context
            string. Defaults to `retrieve`; pass `retrieve_SVM` to use the
            SVM-based retriever instead.

    Returns:
        The output of the `QA` pipeline for the question and retrieved context.
    """
    if retriever is None:
        # Looked up at call time so that a re-defined `retrieve` is picked up.
        retriever = retrieve
    topk_chunks = retriever(prompt)
    return QA(question=prompt, context=topk_chunks)

In [None]:
# End-to-end example: retrieve context for the question, then run the QA pipeline on it.
query("When was Erwin Madelung born?")