In [1]:
import json
import os 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from rag_optimization import convert_knowledge_base_to_langchain_docs, optimize_rag_parameters
from data_utils import convert_json_to_dataframe, create_json_subset, collect_all_results, merge_results
sns.set_style("whitegrid")
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

%load_ext autoreload

In [None]:
# Useful material 
# SQuAD Evaluation guidelines: 
# https://worksheets.codalab.org/worksheets/0x8212d84ca41c4150b555a075b19ccc05/
# https://rajpurkar.github.io/SQuAD-explorer/

# Convert json data to pandas dataframe 

In [None]:
# Run the data_utils helper that converts the JSON data to a pandas dataframe.
# NOTE(review): presumably this produces the "dataset.csv" read by the next cells — confirm in data_utils.
convert_json_to_dataframe()

In [None]:
# Load dataset.csv and display its (rows, columns) shape.
df_all_data = pd.read_csv("dataset.csv")
df_all_data.shape

In [None]:
# Work on a copy so that columns added to df_selected do not modify df_all_data.
df_selected = df_all_data.copy()

In [None]:
# Rough context-length statistics: number of characters and number of
# single-space-separated tokens in each row's "context" text.
df_selected["context_chars"] = df_selected["context"].map(len)
df_selected["context_words"] = df_selected["context"].map(lambda text: len(text.split(" ")))

In [None]:
# Preview the first two rows of df_selected.
df_selected.head(2)

In [None]:
# Distribution of context length in characters, counting each distinct context once.
# An explicit Axes is used so the title and labels are attached to this figure only.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_selected.drop_duplicates(subset="context")["context_chars"], ax=ax)
ax.set(title="Context length (characters)", xlabel="Characters per context", ylabel="Number of contexts");

In [None]:
# Distribution of context length in words, counting each distinct context once.
# An explicit Axes is used so the title and labels are attached to this figure only.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_selected.drop_duplicates(subset="context")["context_words"], ax=ax)
ax.set(title="Context length (words)", xlabel="Words per context", ylabel="Number of contexts");

In [None]:
# Create the original JSON structure for only a subset of questions, used for
# tests and fine-tuning. The resulting file is used by the evaluation.py script.
df = pd.read_csv("dataset.csv")
df_sel = df[0:500]  # first 500 rows of the dataset

create_json_subset(df_sel)

# RAG architecture

Steps:

**Data Indexing**

Converting text data into a searchable database of vector embeddings, which represent the meaning of the text in a format that computers can easily understand.
- **Documents Chunking**: The collection of documents is split into smaller chunks of text. This allows for more precise and relevant pieces of information to be fed into the language model when needed, avoiding information overload.
- **Vector Embeddings**: The chunks of text are then transformed into vector embeddings. These embeddings encode the meaning of natural language text into numerical representations.
- **Vector Database**: Finally, the vector embeddings are stored in a vector database, making them easily searchable.

**Documents -> Text chunks -> Vector Embeddings -> Vector DB**

**Load -> Split -> Embed -> Store**

## Convert the pandas context to Langchain documents 

In [None]:
# Reload the dataset and convert it to LangChain documents with the
# rag_optimization helper.
df = pd.read_csv("dataset.csv")

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

In [None]:
# Sanity check: number of documents produced, then the first two documents.
print(len(langchain_docs))
print(langchain_docs[0])
print(langchain_docs[1])

## Vector database

In [None]:
# Build a single CustomRAG instance from an explicit configuration.
from rag_optimization import CustomRAG, prompt_message, convert_knowledge_base_to_langchain_docs

# Settings passed to CustomRAG through `config`: chunking, vector database,
# embeddings model and LLM.
parameters_dict = {
    "chunk_size": 400,
    "chunk_overlap": 15,
    "vector_database": "chromadb",
    "embeddings_function": {
        "model_name": "text-embedding-3-large",    
        "platform": "OpenAI"
        }, 
    "llm": {
        "model_name": "gpt-3.5-turbo",
        "client": "OpenAI"
        }
}

# NOTE(review): df_to_test is reassigned from dataset.csv before it is first
# used in this notebook, so this 10-row slice looks unused — confirm and remove.
df_to_test = df[0:10]

# NOTE(review): both folder arguments start with "/", while other cells pass
# relative or cwd-joined paths. Confirm how CustomRAG resolves them; if they are
# used as-is they point at the filesystem root.
rag = CustomRAG(knowledge_base=langchain_docs, 
                prompt_message=prompt_message,
                config=parameters_dict, 
                results_folder='/eval_results/test_new_class', 
                vector_db_folder='/vector_databases/test_new_class')

In [None]:
# Call CustomRAG.initialize_embeddings_function on the instance created above;
# it takes no arguments here, so it can only rely on what CustomRAG was given.
rag.initialize_embeddings_function()

In [None]:
# Create the vector database through the CustomRAG instance.
# create_vector_database() takes no arguments, so it can only rely on what was
# passed to CustomRAG; no separate embeddings object or directory path is
# needed in this cell.
rag.create_vector_database()

## Querying the vector database 

In [None]:
# Ask for up to 3 results with a score threshold of 0.1.
# NOTE(review): this question is presumably unrelated to the knowledge base and
# serves as a negative check on the threshold — confirm.
query = "How is the weather today in Milan?"
relevant_docs = rag.query_vector_store(query, n_results=3, score_threshold=0.1)

print(relevant_docs)

In [None]:
# Same retrieval settings (up to 3 results, score threshold 0.1) for a second question.
query = "Who were the normans?"
relevant_docs = rag.query_vector_store(query, n_results=3, score_threshold=0.1)

# Number of documents returned, followed by the text of each one.
print(len(relevant_docs))

for doc in relevant_docs:
    print(doc.page_content)

## Run the RAG over a subset of questions and save the answers 

In [None]:
# Keep only the first five rows of the dataset for a quick end-to-end run.
df_to_test = pd.read_csv("dataset.csv").iloc[:5]

In [None]:

# Pass the question subset to the RAG instance.
# NOTE(review): presumably this answers every question in df_to_test and saves
# the answers under the configured results folder — confirm in CustomRAG.
rag.get_llm_multiple_questions_answers(df_to_test)

## RAG Fine-tuning 

TEXT CHUNKING 

1. CHARACTER SPLITTING : divide the text into N-character sized chunks. Can split words in the middle. 
2. RECURSIVE CHARACTER SPLITTING: preserves sentences. Avoids splitting sentences midword (note that RecursiveCharacterTextSplitter with separator does exactly that). Split the
document where a double new line is present, then, if the chunk size is still exceeded, split at new lines, and so on.
3. SEMANTIC SPLITTING: keeps related content together. Use embeddings to split based on meaning.
+ other techniques

EMBEDDINGS 
Create fixed-length vector representations of text, focusing on semantic meaning for tasks like similarity comparison. 
Most up to date embedding models, both proprietary and open source, with performance metrics across different tasks: https://huggingface.co/spaces/mteb/leaderboard.

This contains also a "retrieval" column with performance metrics. 


In [None]:
# Evaluate on the first 500 rows, while the documents are built from the whole dataset.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

# No parameters dict is passed here (unlike later cells).
# NOTE(review): presumably optimize_rag_parameters then uses its own default
# parameter grid — confirm in rag_optimization.
optimize_rag_parameters(
    df_to_test, 
    langchain_docs, 
    results_folder="eval_results/test_new_class", 
    vector_db_folder="vector_databases/test_new_class",
)

In [None]:
path = "eval_results/test_new_class"
# Collect the results found under `path`, order them by the HasAns_f1 column
# (highest first) and save the ordered table as CSV in the same folder.
df_all_res = collect_all_results(path).sort_values(by="HasAns_f1", ascending=False)
df_all_res.to_csv(f"{path}/df_all_results.csv", index=False)

In [None]:
# Display the collected results table.
df_all_res

Unnamed: 0,exact,f1,total,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,experiment
3,49.6,54.188264,500,61.603376,71.283257,237,38.78327,38.78327,263,eval_pred_400_text-embedding-3-small_gpt-3.5-t...
2,50.0,55.196172,500,59.493671,70.45606,237,41.444867,41.444867,263,eval_pred_200_text-embedding-3-small_gpt-3.5-t...
1,44.6,49.323149,500,59.493671,69.45812,237,31.178707,31.178707,263,eval_pred_500_text-embedding-3-small_gpt-3.5-t...
0,43.4,48.797472,500,56.962025,68.349097,237,31.178707,31.178707,263,eval_pred_600_text-embedding-3-small_gpt-3.5-t...
4,50.6,54.960261,500,46.413502,55.612365,237,54.372624,54.372624,263,eval_pred_100_text-embedding-3-small_gpt-3.5-t...


# Investigate the results

In [None]:
# Pick the best results and merge the scores by question id to the original df in order to inspect the errors.
# The idea is to understand why the results are so poor for the NoAns questions, when the HasAns questions have 
# a high f1 score, in order to understand how the workflow can be optimized

In [None]:
# Path of the prediction file that belongs to the top-ranked experiment.
best_experiment = df_all_res.experiment.iloc[0]

# Strip only the leading "eval_" marker. A blanket str.replace would also remove
# "eval_" from inside the name (e.g. "...retrieval_...") and, after splitting on
# "/", could even alter the "initial_eval_results" folder on Windows.
if best_experiment.startswith("eval_"):
    pred_filename = best_experiment[len("eval_"):]
else:
    pred_filename = best_experiment

# NOTE(review): the results collected earlier live under
# "eval_results/test_new_class" — confirm that "initial_eval_results" is the
# intended folder here.
best_result_path = os.path.join(os.getcwd(), "eval_results/initial_eval_results", pred_filename)
best_result_path

In [None]:
! python $(pwd)/eval_results/evaluation.py "$(pwd)/eval_results/data_updated_500.json" "$(pwd)/eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"

In [None]:
# All inputs of the merge live in one folder and share one run tag, so the
# paths are assembled from those two pieces.
debug_dir = os.path.join(os.getcwd(), "eval_results/debugging_eval_results")
run_tag = "500_400_text-embedding-3-large_gpt-3.5-turbo"
pred_file = f"{debug_dir}/pred_{run_tag}.json"

# NOTE(review): filepath_500 receives the same file as pred_filepath — confirm
# this is intended and that it should not point to the 500-question dataset JSON.
df_merged = merge_results(
    f1_filepath=f"{debug_dir}/f1_thresh_by_qid.json",
    exact_filepath=f"{debug_dir}/exact_thresh_by_qid.json",
    pred_filepath=pred_file,
    filepath_500=pred_file,
    context_filepath=f"{debug_dir}/context_{run_tag}.json",
    df_questions_filepath="dataset.csv",
    filter_500=True,
)

df_merged.shape

In [None]:
# List the columns available in the merged frame.
df_merged.columns

In [None]:
# Last 10 rows among those flagged is_impossible, with their scores, question and prediction.
df_merged[df_merged.is_impossible][["id", "is_impossible", "f1_score", "exact_score", "question", "pred"]].tail(10)

In [None]:
# Print the context of the row labelled 479, breaking the line after each ". ".
print(df_merged.loc[479, "context"].replace(". ", ".\n"))

In [None]:
# Print the rag_retrieved_context value of the row labelled 479.
print(df_merged.loc[479, "rag_retrieved_context"])

# Evaluate with other LLMs

In [None]:
# Run the RAG with the best parameters, and also save the context.
# A single chunk size and embeddings model are combined with a Replicate-hosted LLM.
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": {
        "text-embedding-3-small": "OpenAI",
    },
    "models": {"meta/meta-llama-3-70b-instruct": "Replicate"},
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
vector_db_folder = os.path.join(os.getcwd(), "vector_databases")

# results_folder is nested: makedirs also creates a missing "eval_results"
# parent (os.mkdir would raise) and does nothing when the folder already exists.
os.makedirs(results_folder, exist_ok=True)

# Evaluate on the first 500 rows; the documents are built from the whole dataset.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

optimize_rag_parameters(
    df_to_test,
    langchain_docs,
    parameters_dict,
    results_folder=results_folder,
    vector_db_folder=vector_db_folder,
)

Fine tuning RAG with the following parameters: 
{'chunk_sizes': [400],
 'embed_options': {'text-embedding-3-small': 'OpenAI'},
 'models': {'meta/meta-llama-3-70b-instruct': 'Replicate'}}
Running meta/meta-llama-3-70b-instruct - 400 - text-embedding-3-small
("CustomRAG config: {'chunk_size': 400, 'chunk_overlap': 15, "
 "'vector_database': 'chromadb', 'embeddings_function': {'model_name': "
 "'text-embedding-3-small', 'platform': 'OpenAI'}, 'llm': {'model_name': "
 "'meta/meta-llama-3-70b-instruct', 'client': 'Replicate'}}")
Creating vector store 400_text-embedding-3-small
Finished creating vector store 400_text-embedding-3-small


  1%|          | 3/500 [00:03<10:24,  1.26s/it]

# Evaluate RAG SOTA embeddings: snowflake-arctic-embed-l-v2.0

In [None]:
# very slow - likely due to GPU/CPU memory issues

In [None]:
# Run the RAG with SOTA embeddings.
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": {
        "Snowflake/snowflake-arctic-embed-l-v2.0": "SentenceTransformers"  # ranked 6th, 568M params, released in december 2024
    },
    "models": {"gpt-3.5-turbo": "OpenAI"},
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
vector_db_folder = os.path.join(os.getcwd(), "vector_databases/optimize_results")

# results_folder is nested: makedirs also creates a missing "eval_results"
# parent (os.mkdir would raise) and does nothing when the folder already exists.
os.makedirs(results_folder, exist_ok=True)

# Evaluate on the first 500 rows; the documents are built from the whole dataset.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

# The keyword is `results_folder`, as in the other optimize_rag_parameters
# calls of this notebook; it was previously misspelled as `results_fodler`.
optimize_rag_parameters(
    df_to_test,
    langchain_docs,
    parameters_dict,
    results_folder=results_folder,
    vector_db_folder=vector_db_folder,
)

# Explore replicate 

In [None]:
# allows to run generative AI models in the Cloud 
# Claude-3-5-sonnet is the best llm for short context (less than 5k tokens) according to this research: https://www.galileo.ai/blog/best-llms-for-rag
# the second one is llama-3-70b-instruct

In [3]:
# System prompt for the Replicate experiment. It is a plain string: it has no
# f-string fields, so no `f` prefix is needed.
# NOTE(review): the two "%s" slots (context and question) are never filled in
# this notebook, so the model receives a literal "%s" as its context — confirm
# whether retrieved context should be interpolated before the call.
custom_system_prompt = """
You are a highly accurate and reliable assistant. Answer the user's question using **only** the provided context.
If the answer is not in the context, return an empty response (**""**) without making up information.

Context:
%s

Instructions:
- Answer concisely and precisely.
- If the answer is explicitly stated in the context, extract it as-is.
- If the answer is not in the context, return **""** (empty string).
- Do **not** infer, assume, or add external information.

Example:
    **Question:** What is the capital of Italy?
    **Answer:** Rome

Question: %s
Answer (just the answer, no extra words, or "" if unknown):
"""

query = "Who were the normans?"


In [33]:
import replicate

# Llama 3 chat template with the system prompt already substituted; the
# "{prompt}" field is re-inserted literally so it stays a placeholder.
llama3_template = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
).format(system_prompt=custom_system_prompt, prompt="{prompt}")

# Alternative model tried here: "anthropic/claude-3.5-sonnet"
model_name = "meta/meta-llama-3-70b-instruct"

output = replicate.run(
    model_name,
    input={
        "prompt": query,
        "system_prompt": custom_system_prompt,
        "max_tokens": 512,
        "prompt_template": llama3_template,
    },
)

# Join the pieces yielded by `output`, skipping any piece that is exactly a
# newline, tab, carriage return or a pair of double quotes.
dropped_chunks = ['\n', '\t', '\r', '""']
kept_chunks = [chunk for chunk in output if chunk not in dropped_chunks]
output_merged = "".join(kept_chunks)
output_merged

''