In [None]:
import json
import os 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from rag_fine_tuning import create_vector_store, convert_context_to_langchain_docs
from rag_fine_tuning import query_vector_store, save_llm_answers, fine_tune_rag
from data_utils import convert_json_to_dataframe, create_json_subset, collect_all_results, merge_results
sns.set_style("whitegrid")
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

In [11]:
%load_ext autoreload

# Useful material 
# SQuAD Evaluation guidelines: 
# https://worksheets.codalab.org/worksheets/0x8212d84ca41c4150b555a075b19ccc05/
# https://rajpurkar.github.io/SQuAD-explorer/

# Convert json data to pandas dataframe 

In [None]:
# One-off preprocessing step implemented in data_utils.
# NOTE(review): presumably this writes the "dataset.csv" read by the next cell — confirm.
convert_json_to_dataframe()

In [None]:
# Load dataset.csv into a DataFrame and display its (rows, columns) shape.
df_all_data = pd.read_csv("dataset.csv")
df_all_data.shape

In [14]:
# Work on a copy so that the derived columns added in the next cell do not modify df_all_data.
df_selected = df_all_data.copy()

In [15]:
# Rough statistics for the context length.
# Characters: raw string length of each context.
df_selected["context_chars"] = df_selected["context"].str.len()
# Words: split on any run of whitespace, so that repeated spaces, tabs and
# newlines do not create empty tokens that would inflate the word count.
df_selected["context_words"] = df_selected["context"].str.split().str.len()

In [None]:
# Preview the first two rows, including the context_chars / context_words columns added above.
df_selected.head(2)

In [None]:
# Distribution of the context length in characters. Rows sharing the same
# context are dropped first so that every distinct context is counted once.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_selected.drop_duplicates(subset="context")["context_chars"], ax=ax)
# Title and axis labels let the figure stand on its own; the trailing ";" hides the repr.
ax.set(title="Context length (characters)", xlabel="Characters per context", ylabel="Number of contexts");

In [None]:
# Distribution of the context length in words. Rows sharing the same
# context are dropped first so that every distinct context is counted once.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_selected.drop_duplicates(subset="context")["context_words"], ax=ax)
# Title and axis labels let the figure stand on its own; the trailing ";" hides the repr.
ax.set(title="Context length (words)", xlabel="Words per context", ylabel="Number of contexts");

In [None]:
# Create the original JSON structure for only a subset of questions, used for tests and fine-tuning.
# This file will be used by the evaluation.py file.

df = pd.read_csv("dataset.csv")
# Copy the slice so the subset is an independent frame rather than a view of df.
df_sel = df.iloc[:500].copy()
# A bare expression in the middle of a cell is never rendered, so the preview is shown explicitly.
display(df_sel.head(2))

create_json_subset(df_sel)

# RAG architecture

Steps:

**Data Indexing**

Converting text data into a searchable database of vector embeddings, which represent the meaning of the text in a format that computers can easily understand.
- **Documents Chunking**: The collection of documents is split into smaller chunks of text. This allows for more precise and relevant pieces of information to be fed into the language model when needed, avoiding information overload.
- **Vector Embeddings**: The chunks of text are then transformed into vector embeddings. These embeddings encode the meaning of natural language text into numerical representations.
- **Vector Database**: Finally, the vector embeddings are stored in a vector database, making them easily searchable.

**Documents -> Text chunks -> Vector Embeddings -> Vector DB**

**Load -> Split -> Embed -> Store**

## Convert the pandas context to Langchain documents 

In [2]:
# Reload the full dataset and convert its contexts to LangChain documents
# (the conversion itself lives in rag_fine_tuning.convert_context_to_langchain_docs).
df = pd.read_csv("dataset.csv")

langchain_docs = convert_context_to_langchain_docs(df)

In [None]:
# Sanity check: number of documents produced, followed by the first two documents.
print(len(langchain_docs))
print(langchain_docs[0])
print(langchain_docs[1])

## Vector database

In [3]:
# OpenAI embedding model used to build and query the vector store.
# NOTE(review): OpenAIEmbeddings presumably picks up the API key from the environment
# populated by load_dotenv() in the import cell — confirm.
embeddings_model = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embeddings_model)

# Directory under the current working directory that is handed to the vector-store helpers as db_dir.
db_dir = os.path.join(os.getcwd(), "vector_databases")


In [None]:
# Build the store named "test_vector_store" inside db_dir from the LangChain documents.
# chunk_size / chunk_overlap are forwarded to the helper; their unit is not visible here
# (presumably characters — TODO confirm in rag_fine_tuning).
create_vector_store(langchain_docs, embeddings, store_name="test_vector_store", db_dir=db_dir, chunk_size=200, chunk_overlap=15)

## Querying the vector database 

In [None]:
# Query on a topic that is presumably absent from the dataset, to see what the store
# returns for an unrelated question (k and score_threshold are forwarded to the helper).
query = "How is the weather today in Milan?"
store_name =  "test_vector_store"
relevant_docs = query_vector_store(store_name, query, embeddings, db_dir, k=3, score_threshold=0.1)

print(relevant_docs)

In [None]:
# Same retrieval call with a second query; this time the number of hits and the
# text of every retrieved chunk are printed.
query = "Who were the normans?"
store_name =  "test_vector_store"
relevant_docs = query_vector_store(store_name, query, embeddings, db_dir, k=3, score_threshold=0.1)

print(len(relevant_docs))

# Each returned item exposes its text through .page_content.
for doc in relevant_docs:
    print(doc.page_content)

## Run the RAG over a subset of questions and save the answers 

In [5]:
# Evaluation subset: the first 500 rows of dataset.csv.
df_to_test = pd.read_csv("dataset.csv").iloc[:500]

In [None]:
# Embedding configuration for this run (same model name as in the vector-store section).
embeddings_model = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embeddings_model)

# Answer the questions in df_to_test and save them under the file name "test_predictions.json".
# Where the file is written, and how chunk_size is applied, is decided inside save_llm_answers.
save_llm_answers(df_to_test, langchain_docs, embeddings, embeddings_model, chunk_size=150, filename="test_predictions.json")

## RAG Fine-tuning 

Comments
The distribution of the number of characters per document peaks at ~600 characters.
- smaller chunks: reduced noise from irrelevant content - works well with dense embeddings*
- larger chunks: preserves context better; ideal if queries require full document context, works well with hybrid search*

TEXT CHUNKING 

1. CHARACTER SPLITTING : divide the text into N-character sized chunks. Can split words in the middle. 
2. RECURSIVE CHARACTER SPLITTING: tries to preserve sentences and avoids splitting in the middle of a word (this is what LangChain's RecursiveCharacterTextSplitter does with its list of separators). The document is first split
where a double new line is present; then, if the chunk size is still exceeded, it is split at single new lines, and so on.
3. SEMANTIC SPLITTING: keeps related content together. Use embeddings to split based on meaning.
+ other techniques

EMBEDDINGS 
Create fixed-length vector representations of text, focusing on semantic meaning, for tasks like similarity comparison.
Most up to date embedding models, both proprietary and open source, with performance metrics across different tasks: https://huggingface.co/spaces/mteb/leaderboard 
The leaderboard also contains a "retrieval" column with performance metrics. Click on the column to sort the models.
Interesting article: https://www.mongodb.com/developer/products/atlas/choose-embedding-model-rag/


In [None]:
# Run fine_tune_rag with its default settings on the first 500 questions.
# The documents are built from the full dataset, not only from the 500-row subset.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_context_to_langchain_docs(df)

fine_tune_rag(df_to_test, langchain_docs)

In [2]:
# Collect the evaluation results found in the initial results folder, rank them by
# the HasAns_f1 score (best first) and persist the ranking next to the results.
# The folder is resolved from the working directory, like the other cells that
# touch eval_results/, instead of through a machine-specific absolute path.
initial_results_dir = os.path.join(os.getcwd(), "eval_results/initial_eval_results")
df_all_res = collect_all_results(initial_results_dir).sort_values(by="HasAns_f1", ascending=False)
df_all_res.to_csv(os.path.join(initial_results_dir, "df_all_results.csv"), index=False)

In [None]:
# Display the ranked results table (sorted by HasAns_f1, descending, in the previous cell).
df_all_res

# Investigate the results

In [None]:
# Pick the best results and merge the scores by question id to the original df in order to inspect the errors.
# The idea is to understand why the results are so poor for the NoAns questions, when the HasAns questions have 
# a high f1 score, in order to understand how the workflow can be optimized

In [None]:
# Path derived from the best experiment, i.e. the first row of df_all_res
# (sorted by HasAns_f1 in an earlier cell).
best_result_path = os.path.join(os.getcwd(), "eval_results/initial_eval_results", df_all_res.experiment.iloc[0])
# Strip the "eval_" marker from the file name only. os.path.split is used because
# splitting on a literal "/" picks the wrong component on platforms where
# os.path.join emits a different separator.
results_dir, result_name = os.path.split(best_result_path)
best_result_path = os.path.join(results_dir, result_name.replace("eval_", ""))
best_result_path

In [5]:
# Run the RAG with the best parameters, and also save the context.
# Single-configuration grid handed to fine_tune_rag in a later cell: one chunk size,
# one embedding option (key -> value; presumably model name -> provider, confirm in
# fine_tune_rag) and one LLM.
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": { 
        "text-embedding-3-large": "OpenAI", 
        },
    "models": ["gpt-3.5-turbo"]
}

In [6]:
# Folder that receives the outputs of the debugging run.
results_folder = os.path.join(os.getcwd(), "eval_results/debugging_eval_results")

# makedirs also creates a missing "eval_results" parent and does nothing when the
# folder already exists, so the cell can be re-run safely on a fresh checkout.
os.makedirs(results_folder, exist_ok=True)

# Last expression of the cell, so the resolved path is actually displayed.
results_folder

In [None]:
# Re-run the pipeline with the single configuration in parameters_dict on the first 500
# questions, writing into results_folder. save_context=True is passed so that the
# context is saved too (see the comment on parameters_dict).
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_context_to_langchain_docs(df)

fine_tune_rag(df_to_test, langchain_docs, parameters_dict, results_folder=results_folder, save_context=True)

In [None]:
! python eval_results/evaluation.py "/Users/mariadancianu/Desktop/Git Projects/SQuAD_RAG_experiments/eval_results/data_updated_500.json" "/Users/mariadancianu/Desktop/Git Projects/SQuAD_RAG_experiments/eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"

In [None]:
# Combine the per-question F1 / exact scores, the predictions and the saved context with the
# questions from dataset.csv (the merge itself is implemented in data_utils.merge_results).
# NOTE(review): filepath_500 points at the same predictions file as pred_filepath — confirm that
# this is intended and that it should not be the 500-question data file instead.
df_merged = merge_results(f1_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/f1_thresh_by_qid.json"), 
                          exact_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/exact_thresh_by_qid.json"), 
                          pred_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"), 
                          filepath_500=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"),
                          context_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/context_500_400_text-embedding-3-large_gpt-3.5-turbo.json"), 
                          df_questions_filepath="dataset.csv", 
                          filter_500=True)

df_merged.shape

In [None]:
# List the columns available after the merge.
df_merged.columns

In [None]:
# Last 10 rows flagged is_impossible, with their scores, question and prediction.
df_merged[df_merged.is_impossible][["id", "is_impossible", "f1_score", "exact_score", "question", "pred"]].tail(10)

In [None]:
# Print the context of the row with index label 479, breaking the text after every ". ".
# NOTE(review): 479 is hard-coded — presumably one of the rows inspected in the previous cell.
print(df_merged.loc[479, "context"].replace(". ", ".\n"))

In [None]:
# Print the rag_retrieved_context value of the same row (index label 479) for comparison with its context.
print(df_merged.loc[479, "rag_retrieved_context"])

# Evaluate RAG SOTA embeddings: snowflake-arctic-embed-l-v2.0

In [2]:
# Run the RAG with SOTA embeddings.
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": {
        # ranked 6th, 568M params, released in December 2024
        "Snowflake/snowflake-arctic-embed-l-v2.0": "HuggingFace_SentenceTransformers",
    },
    "models": ["gpt-3.5-turbo"],
}

# Folder that receives the outputs of this run.
results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")

# makedirs also creates a missing "eval_results" parent and does nothing when the
# folder already exists, so the cell can be re-run safely on a fresh checkout.
os.makedirs(results_folder, exist_ok=True)

# Last expression of the cell, so the resolved path is actually displayed.
results_folder

In [None]:
# Run the pipeline with the configuration in parameters_dict on the first 500 questions,
# writing into results_folder; save_context=False is passed for this run.
# The documents are built from the full dataset, not only from the 500-row subset.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_context_to_langchain_docs(df)

fine_tune_rag(df_to_test, langchain_docs, parameters_dict, results_folder=results_folder, save_context=False)