In [1]:
# Standard library
import json
import os 
# Third-party: data handling, plotting and progress bars
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
# Project-local helpers for the RAG pipeline and for dataset/result handling
from rag_fine_tuning import convert_knowledge_base_to_langchain_docs, fine_tune_rag
from data_utils import convert_json_to_dataframe, create_json_subset, collect_all_results, merge_results
# Use seaborn's "whitegrid" style for every plot in this notebook
sns.set_style("whitegrid")
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
# Load environment variables from a .env file.
# NOTE(review): presumably this provides the API keys for the OpenAI / Mistral clients — confirm.
load_dotenv()

True

In [None]:
%load_ext autoreload

# Useful material 
# SQuAD Evaluation guidelines: 
# https://worksheets.codalab.org/worksheets/0x8212d84ca41c4150b555a075b19ccc05/
# https://rajpurkar.github.io/SQuAD-explorer/

# Convert json data to pandas dataframe 

In [None]:
# Convert the raw JSON data into tabular form.
# NOTE(review): presumably this writes "dataset.csv", which the following cells read — confirm in data_utils.
convert_json_to_dataframe()

In [None]:
# Load the tabular dataset and display its (rows, columns) shape.
df_all_data = pd.read_csv("dataset.csv")
df_all_data.shape

In [None]:
# Work on a copy so that the derived columns added to df_selected do not modify df_all_data.
df_selected = df_all_data.copy()

In [None]:
# Rough statistics for the context length: number of characters and number of
# space-separated tokens per context.
# The vectorised .str accessor replaces the row-wise lambdas and yields NaN
# (instead of raising a TypeError) if a context is missing.
df_selected["context_chars"] = df_selected["context"].str.len()
df_selected["context_words"] = df_selected["context"].str.split(" ").str.len()

In [None]:
# Preview the first two rows of the working copy.
df_selected.head(2)

In [None]:
# Distribution of the context length in characters. Rows sharing the same context
# are counted once so that repeated contexts do not skew the histogram.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_selected.drop_duplicates(subset="context")["context_chars"], ax=ax)
# Title and axis labels let the figure stand on its own; the trailing ";" hides the repr.
ax.set(title="Context length (characters)", xlabel="Characters per context", ylabel="Number of contexts");

In [None]:
# Distribution of the context length in space-separated words. Rows sharing the same
# context are counted once so that repeated contexts do not skew the histogram.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_selected.drop_duplicates(subset="context")["context_words"], ax=ax)
# Title and axis labels let the figure stand on its own; the trailing ";" hides the repr.
ax.set(title="Context length (words)", xlabel="Words per context", ylabel="Number of contexts");

In [None]:
# Create the original JSON structure for only a subset of questions, used for tests and fine-tuning.
# The resulting file will be used by the evaluation.py script.
df = pd.read_csv("dataset.csv")

# First 500 rows (positional slice). The former mid-cell `df_sel.head(2)` was dropped:
# only the last expression of a cell is displayed, so it had no effect.
df_sel = df.iloc[:500]

create_json_subset(df_sel)

# RAG architecture

Steps:

**Data Indexing**

Converting text data into a searchable database of vector embeddings, which represent the meaning of the text in a format that computers can easily understand.
- **Documents Chunking**: The collection of documents is split into smaller chunks of text. This allows for more precise and relevant pieces of information to be fed into the language model when needed, avoiding information overload.
- **Vector Embeddings**: The chunks of text are then transformed into vector embeddings. These embeddings encode the meaning of natural language text into numerical representations.
- **Vector Database**: Finally, the vector embeddings are stored in a vector database, making them easily searchable.

**Documents -> Text chunks -> Vector Embeddings -> Vector DB**

**Load -> Split -> Embed -> Store**

## Convert the pandas contexts to LangChain documents

In [None]:
# Reload the full dataset and convert it into the documents used as the RAG knowledge base
# (langchain_docs is later passed as `knowledge_base` to CustomRAG).
df = pd.read_csv("dataset.csv")

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

In [None]:
# Show how many documents were created, followed by the first two of them (one item per line).
print(len(langchain_docs), langchain_docs[0], langchain_docs[1], sep="\n")

## Vector database

In [None]:
# convert_knowledge_base_to_langchain_docs is already imported in the first cell,
# so only the names that are new here are imported.
from rag_fine_tuning import CustomRAG, prompt_message

# Configuration of a single RAG pipeline: chunking, vector store, embedding model and LLM.
parameters_dict = {
    "chunk_size": 400,
    "chunk_overlap": 15,
    "vector_database": "chromadb",
    "embeddings_function": {
        "model_name": "text-embedding-3-large",
        "platform": "OpenAI"
        },
    "llm": {
        "model_name": "gpt-3.5-turbo",
        "client": "OpenAI"
        }
}

# Output locations are resolved against the working directory (the project root, as
# assumed by the relative "dataset.csv" reads) instead of a user-specific absolute
# path, so the notebook runs on any machine.
rag_results_folder = os.path.join(os.getcwd(), "eval_results", "test_new_class")
rag_vector_db_folder = os.path.join(os.getcwd(), "vector_databases", "test_new_class")

rag = CustomRAG(knowledge_base=langchain_docs,
                prompt_message=prompt_message,
                config=parameters_dict,
                results_folder=rag_results_folder,
                vector_db_folder=rag_vector_db_folder)

In [None]:
# Set up the embeddings function of the RAG object.
# NOTE(review): presumably the model named under "embeddings_function" in the config — confirm in CustomRAG.
rag.initialize_embeddings_function()

In [None]:
# Build the vector store for the `rag` object created earlier in this notebook.
# create_vector_database() takes no arguments here, so the previously defined
# `embeddings_model`, `embeddings` and `db_dir` were never used (and named a different
# embedding model than the one in rag's config); they have been removed.
rag.create_vector_database()

## Querying the vector database 

In [None]:
# Query the vector store with a question about today's weather, asking for at most
# 3 results with a relevance score threshold of 0.1.
# NOTE(review): presumably an off-topic probe to check that unrelated questions retrieve nothing — confirm.
query = "How is the weather today in Milan?"
relevant_docs = rag.query_vector_store(query, n_results=3, score_threshold=0.1)

print(relevant_docs)

In [None]:
# Query the vector store with a question about the Normans (same n_results / threshold
# as the previous query), then print how many chunks came back and their text.
query = "Who were the normans?"
relevant_docs = rag.query_vector_store(query, n_results=3, score_threshold=0.1)

print(len(relevant_docs))

# Each retrieved item exposes its text through `page_content`.
for doc in relevant_docs:
    print(doc.page_content)

## Run the RAG over a subset of questions and save the answers 

In [None]:
# Small smoke-test subset: the first five rows of the dataset.
df_to_test = pd.read_csv("dataset.csv").head(5)

In [None]:

# Run the RAG pipeline over every question in df_to_test.
# NOTE(review): per the section title the answers are saved — presumably under rag's results_folder; confirm.
rag.get_llm_multiple_questions_answers(df_to_test)

## RAG Fine-tuning 

TEXT CHUNKING 

1. CHARACTER SPLITTING : divide the text into N-character sized chunks. Can split words in the middle. 
2. RECURSIVE CHARACTER SPLITTING: tries to keep paragraphs, sentences and words intact instead of cutting at arbitrary positions (this is what LangChain's `RecursiveCharacterTextSplitter` does with its list of separators). The document is first split
where a double new line is present; then, if the chunk size is still exceeded, it is split at single new lines, and so on.
3. SEMANTIC SPLITTING: keeps related content together. Use embeddings to split based on meaning.
+ other techniques

EMBEDDINGS 
Create fixed-length vector representations of text that capture its semantic meaning, for tasks like similarity comparison. 
Most up to date embedding models, both proprietary and open source, with performance metrics across different tasks: https://huggingface.co/spaces/mteb/leaderboard.

The leaderboard also contains a "retrieval" column with performance metrics. 


In [2]:
# Fine-tuning run with the default parameters (no parameters_dict is passed): the first
# 500 questions are evaluated, while the knowledge base is built from the full dataset.
# NOTE(review): the output recorded below stops with
# "AttributeError: 'Chat' object has no attribute 'completions'" at the first
# mistral-large-latest configuration — the Mistral client handling in rag_fine_tuning
# presumably needs fixing before the Mistral settings can complete.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

fine_tune_rag(df_to_test, 
              langchain_docs, 
              results_folder="eval_results/test_new_class", 
              vector_db_folder="vector_databases/test_new_class",)

Fine tuning RAG with the following parameters: 
{'chunk_sizes': [100, 200, 400, 500, 600],
 'embed_options': {'text-embedding-3-large': 'OpenAI',
                   'text-embedding-3-small': 'OpenAI',
                   'text-embedding-ada-002': 'OpenAI'},
 'models': {'gpt-3.5-turbo': 'OpenAI', 'mistral-large-latest': 'Mistral'}}
Running gpt-3.5-turbo - 100 - text-embedding-3-small
Results already exist for these settings: skipping!
Running gpt-3.5-turbo - 200 - text-embedding-3-small
Results already exist for these settings: skipping!
Running gpt-3.5-turbo - 400 - text-embedding-3-small
("CustomRAG config: {'chunk_size': 400, 'chunk_overlap': 15, "
 "'vector_database': 'chromadb', 'embeddings_function': {'model_name': "
 "'text-embedding-3-small', 'platform': 'OpenAI'}, 'llm': {'model_name': "
 "'gpt-3.5-turbo', 'client': 'OpenAI'}}")
Vector store 400_text-embedding-3-small already exists. No need to initialize.


  db = Chroma(
 13%|█▎        | 67/500 [01:18<06:05,  1.18it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 14%|█▍        | 71/500 [01:23<07:44,  1.08s/it]No relevant docs were retrieved using the relevance score threshold 0.1
 14%|█▍        | 72/500 [01:24<07:18,  1.02s/it]No relevant docs were retrieved using the relevance score threshold 0.1
  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.1
  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.1
 21%|██        | 106/500 [02:02<09:24,  1.43s/it]No relevant docs were retrieved using the relevance score threshold 0.1
 23%|██▎       | 113/500 [02:08<05:03,  1.27it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 23%|██▎       | 114/500 [02:08<04:59,  1.29it/s]No relevant docs were retrieved using the relevance score threshold 0.

Running gpt-3.5-turbo - 500 - text-embedding-3-small
("CustomRAG config: {'chunk_size': 500, 'chunk_overlap': 15, "
 "'vector_database': 'chromadb', 'embeddings_function': {'model_name': "
 "'text-embedding-3-small', 'platform': 'OpenAI'}, 'llm': {'model_name': "
 "'gpt-3.5-turbo', 'client': 'OpenAI'}}")
Creating vector store 500_text-embedding-3-small
Finished creating vector store 500_text-embedding-3-small


 13%|█▎        | 67/500 [01:04<06:18,  1.14it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 14%|█▍        | 71/500 [01:08<06:32,  1.09it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 14%|█▍        | 72/500 [01:09<06:21,  1.12it/s]No relevant docs were retrieved using the relevance score threshold 0.1
  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.1
  self.vectorstore.similarity_search_with_relevance_scores(
 21%|██        | 106/500 [01:39<06:55,  1.06s/it]No relevant docs were retrieved using the relevance score threshold 0.1
 23%|██▎       | 113/500 [01:46<06:27,  1.00s/it]No relevant docs were retrieved using the relevance score threshold 0.1
 23%|██▎       | 114/500 [01:47<05:51,  1.10it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 23%|██▎       | 116/500 [01:48<05:24,  1.18it/s]No relevant docs were retrieved usin

Running gpt-3.5-turbo - 600 - text-embedding-3-small
("CustomRAG config: {'chunk_size': 600, 'chunk_overlap': 15, "
 "'vector_database': 'chromadb', 'embeddings_function': {'model_name': "
 "'text-embedding-3-small', 'platform': 'OpenAI'}, 'llm': {'model_name': "
 "'gpt-3.5-turbo', 'client': 'OpenAI'}}")
Creating vector store 600_text-embedding-3-small
Finished creating vector store 600_text-embedding-3-small


 13%|█▎        | 65/500 [01:04<06:38,  1.09it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 13%|█▎        | 67/500 [01:06<06:11,  1.17it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 14%|█▍        | 69/500 [01:07<05:37,  1.28it/s]No relevant docs were retrieved using the relevance score threshold 0.1
 14%|█▍        | 71/500 [01:10<07:46,  1.09s/it]No relevant docs were retrieved using the relevance score threshold 0.1
 14%|█▍        | 72/500 [01:11<06:52,  1.04it/s]No relevant docs were retrieved using the relevance score threshold 0.1
  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.1
  self.vectorstore.similarity_search_with_relevance_scores(
 21%|██        | 106/500 [01:42<05:35,  1.17it/s]No relevant docs were retrieved using the relevance score threshold 0.1
  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retri

Running mistral-large-latest - 100 - text-embedding-3-small
("CustomRAG config: {'chunk_size': 100, 'chunk_overlap': 15, "
 "'vector_database': 'chromadb', 'embeddings_function': {'model_name': "
 "'text-embedding-3-small', 'platform': 'OpenAI'}, 'llm': {'model_name': "
 "'mistral-large-latest', 'client': 'Mistral'}}")
Vector store 100_text-embedding-3-small already exists. No need to initialize.


  0%|          | 0/500 [00:00<?, ?it/s]


AttributeError: 'Chat' object has no attribute 'completions'

In [None]:
# Collect the evaluation results of all experiments, rank them by HasAns_f1 (best first)
# and store the ranking in the same folder.
# The folder is resolved against the working directory for both reading and writing;
# previously the read used a user-specific absolute path while the write was relative.
initial_results_dir = os.path.join(os.getcwd(), "eval_results", "initial_eval_results")

# sort_values returns a new frame, so the cell gives the same result when re-run.
df_all_res = collect_all_results(initial_results_dir).sort_values(by="HasAns_f1", ascending=False)
df_all_res.to_csv(os.path.join(initial_results_dir, "df_all_results.csv"), index=False)

In [None]:
# Display the collected results table (one row per experiment).
df_all_res

# Investigate the results

In [None]:
# Pick the best results and merge the scores by question id to the original df in order to inspect the errors.
# The idea is to understand why the results are so poor for the NoAns questions, when the HasAns questions have 
# a high f1 score, in order to understand how the workflow can be optimized

In [None]:
# Build the path of the result file belonging to the first experiment in df_all_res
# (the best one when the frame is sorted by HasAns_f1 in descending order).
best_experiment = df_all_res.experiment.iloc[0]
results_dir, eval_filename = os.path.split(
    os.path.join(os.getcwd(), "eval_results/initial_eval_results", best_experiment)
)

# Strip the "eval_" marker from the file name only. os.path.split / os.path.join
# replace the manual "/" splitting, which breaks when the OS uses another separator.
best_result_path = os.path.join(results_dir, eval_filename.replace("eval_", ""))
best_result_path

In [None]:
! python eval_results/evaluation.py "/Users/mariadancianu/Desktop/Git Projects/SQuAD_RAG_experiments/eval_results/data_updated_500.json" "/Users/mariadancianu/Desktop/Git Projects/SQuAD_RAG_experiments/eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"

In [None]:
def _debug_result_path(filename):
    """Return the path of `filename` inside eval_results/debugging_eval_results under the working directory."""
    return os.path.join(os.getcwd(), f"eval_results/debugging_eval_results/{filename}")


# Suffix shared by the prediction and context files of the inspected run.
_run_tag = "500_400_text-embedding-3-large_gpt-3.5-turbo"
_pred_path = _debug_result_path(f"pred_{_run_tag}.json")

# Merge per-question scores, predictions and retrieved context with the questions.
# NOTE(review): filepath_500 receives the same file as pred_filepath (unchanged from the
# original call) — confirm this is intended and not meant to be the 500-question data file.
df_merged = merge_results(
    f1_filepath=_debug_result_path("f1_thresh_by_qid.json"),
    exact_filepath=_debug_result_path("exact_thresh_by_qid.json"),
    pred_filepath=_pred_path,
    filepath_500=_pred_path,
    context_filepath=_debug_result_path(f"context_{_run_tag}.json"),
    df_questions_filepath="dataset.csv",
    filter_500=True,
)

df_merged.shape

In [None]:
# List the columns available in the merged frame.
df_merged.columns

In [None]:
# Last ten rows flagged `is_impossible`, with their scores, question and prediction.
inspect_columns = ["id", "is_impossible", "f1_score", "exact_score", "question", "pred"]
impossible_mask = df_merged["is_impossible"]
df_merged.loc[impossible_mask, inspect_columns].tail(10)

In [None]:
# Print the original context of the row labelled 479, with a line break after each ". "
# so that it is easier to read.
print(df_merged.loc[479, "context"].replace(". ", ".\n"))

In [None]:
# Print the `rag_retrieved_context` value of the row labelled 479, for comparison with its original context.
print(df_merged.loc[479, "rag_retrieved_context"])

# Evaluate with other LLMs

In [None]:
# Run the RAG with the best parameters, and also save the context.
# NOTE(review): the fine-tuning output recorded earlier in this notebook aborted with
# "AttributeError: 'Chat' object has no attribute 'completions'" on its first
# mistral-large-latest configuration — confirm the Mistral client works before running this cell.
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": {
        "text-embedding-3-large": "OpenAI",
        },
    "models": {"mistral-large-latest": "Mistral"}
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
# Folder name typo fixed ("vector_dabases" -> "vector_databases") so the vector stores
# live under the same top-level folder as in the other runs of this notebook.
vector_db_folder = os.path.join(os.getcwd(), "vector_databases/optimize_results")

# makedirs also creates missing parent folders and does nothing if the folder already exists.
os.makedirs(results_folder, exist_ok=True)

df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

# The keyword is `results_folder` (as in the other fine_tune_rag call in this notebook);
# the former spelling `results_fodler` did not match that parameter name.
fine_tune_rag(df_to_test,
              langchain_docs,
              parameters_dict,
              results_folder=results_folder,
              vector_db_folder=vector_db_folder)

# Evaluate RAG with SOTA embeddings: snowflake-arctic-embed-l-v2.0

In [None]:
# very slow - likely due to GPU/CPU memory issues

In [None]:
# Run the RAG with SOTA embeddings.
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": {
        "Snowflake/snowflake-arctic-embed-l-v2.0": "SentenceTransformers" # ranked 6th, 568M params, released in december 2024
        },
    "models": {"gpt-3.5-turbo": "OpenAI"}
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
# Folder name typo fixed ("vector_dabases" -> "vector_databases") so the vector stores
# live under the same top-level folder as in the other runs of this notebook.
vector_db_folder = os.path.join(os.getcwd(), "vector_databases/optimize_results")

# makedirs also creates missing parent folders and does nothing if the folder already exists.
os.makedirs(results_folder, exist_ok=True)

df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

# The keyword is `results_folder` (as in the other fine_tune_rag call in this notebook);
# the former spelling `results_fodler` did not match that parameter name.
fine_tune_rag(df_to_test,
              langchain_docs,
              parameters_dict,
              results_folder=results_folder,
              vector_db_folder=vector_db_folder)