In [1]:
import json
import os 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from rag_optimization import convert_knowledge_base_to_langchain_docs, optimize_rag_parameters
from data_utils import convert_json_to_dataframe, create_json_subset, collect_all_results, merge_results
sns.set_style("whitegrid")
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

%load_ext autoreload

In [None]:
# Useful material 
# SQuAD Evaluation guidelines: 
# https://worksheets.codalab.org/worksheets/0x8212d84ca41c4150b555a075b19ccc05/
# https://rajpurkar.github.io/SQuAD-explorer/

# Convert json data to pandas dataframe 

In [None]:
convert_json_to_dataframe()

In [None]:
df_all_data = pd.read_csv("dataset.csv")
df_all_data.shape

In [None]:
df_selected = df_all_data.copy()

In [None]:
# some rough statistics for the context length 
df_selected.loc[:, "context_chars"] = df_selected["context"].apply(lambda x: len(x))
df_selected.loc[:, "context_words"] = df_selected.loc[:, "context"].apply(lambda x: len(x.split(" ")))

In [None]:
df_selected.head(2)

In [None]:
plt.figure(figsize=[8, 5])
sns.histplot(df_selected.drop_duplicates(subset="context")["context_chars"])

In [None]:
plt.figure(figsize=[8, 5])
sns.histplot(df_selected.drop_duplicates(subset="context")["context_words"])

In [None]:
# create original json structure for only a subset of questions, used for tests and fine-tuning 
# this file will be used by the evaluation.py file 

df = pd.read_csv("dataset.csv")
df_sel = df[0:500]
df_sel.head(2)

create_json_subset(df_sel)

# RAG architecture

Steps:

**Data Indexing**

Converting text data into a searchable database of vector embeddings, which represent the meaning of the text in a format that computers can easily understand.
- **Documents Chunking**: The collection of documents is split into smaller chunks of text. This allows for more precise and relevant pieces of information to be fed into the language model when needed, avoiding information overload.
- **Vector Embeddings**: The chunks of text are then transformed into vector embeddings. These embeddings encode the meaning of natural language text into numerical representations.
- **Vector Database**: Finally, the vector embeddings are stored in a vector database, making them easily searchable.

**Documents -> Text chunks -> Vector Embeddings -> Vector DB**

**Load -> Split -> Embed -> Store**

## Convert the pandas context to Langchain documents 

In [None]:
df = pd.read_csv("dataset.csv")

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

In [None]:
print(len(langchain_docs))
print(langchain_docs[0])
print(langchain_docs[1])

## Vector database

In [None]:
from rag_optimization import CustomRAG, prompt_message, convert_knowledge_base_to_langchain_docs

parameters_dict = {
    "chunk_size": 400,
    "chunk_overlap": 15,
    "vector_database": "chromadb",
    "embeddings_function": {
        "model_name": "text-embedding-3-large",    
        "platform": "OpenAI"
        }, 
    "llm": {
        "model_name": "gpt-3.5-turbo",
        "client": "OpenAI"
        }
}

df_to_test = df[0:10]

rag = CustomRAG(knowledge_base=langchain_docs, 
                prompt_message=prompt_message,
                config_dict=parameters_dict, 
                results_folder='./eval_results/test_new_class', 
                vector_db_folder='./vector_databases/')

In [None]:
rag.vector_store.initialize_embeddings_function()

In [None]:
embeddings_model = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embeddings_model)

db_dir = os.path.join(os.getcwd(), "vector_databases")

rag.vector_store.create_vector_database()

## Querying the vector database 

In [None]:
query = "How is the weather today in Milan?"
relevant_docs = rag.vector_store.query_vector_store(query, n_results=3, score_threshold=0.1)

print(relevant_docs)

In [None]:
query = "Who were the normans?"
context = rag.vector_store.query_vector_store(query, n_results=3, score_threshold=0.1)

print(context)

## Run the RAG over a subset of questions and save the answers 

In [None]:
df_to_test = pd.read_csv("dataset.csv")
df_to_test = df_to_test[0:5]

In [None]:

rag.get_llm_multiple_questions_answers(df_to_test)

## RAG Fine-tuning 

TEXT CHUNKING 

1. CHARACTER SPLITTING : divide the text into N-character sized chunks. Can split words in the middle. 
2. RECURSIVE CHARACTER SPLITTING: preserves sentences. Avoids splitting sentences midword (note that RecursiveCharacterTextSplitter with separator does exactly that). Split the
document where a double new line is present, then, if the chunk size is still exceeded, split at new lines, and so on.
3. SEMANTIC SPLITTING: keeps related content together. Use embeddings to split based on meaning.
+ other techniques

EMBEDDINGS 
Create fixed-length vector representation of text, focusing on semanting meaning for tasks like similarity comparison. 
Most up to date embedding models, both proprietary and open source, with performance metrics across different tasks: https://huggingface.co/spaces/mteb/leaderboard.

This contains also a "retrieval" column with performance metrics. 


In [None]:
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

optimize_rag_parameters(
    df_to_test, 
    langchain_docs, 
    results_folder="eval_results/test_new_class", 
    vector_db_folder="vector_databases/test_new_class",
)

In [None]:
path = "eval_results/optimize_results"
df_all_res = collect_all_results(path)
df_all_res.sort_values(by="HasAns_f1", ascending=False, inplace=True)
df_all_res.to_csv(f"{path}/df_all_results.csv", index=False)

In [None]:
df_all_res

# Investigate the results

In [None]:
# Pick the best results and merge the scores by question id to the original df in order to inspect the errors.
# The idea is to understand why the results are so poor for the NoAns questions, when the HasAns questions have 
# a high f1 score, in order to understand how the workflow can be optimized

In [None]:
best_result_path = os.path.join(os.getcwd(), "eval_results/initial_eval_results", df_all_res.experiment.iloc[0])
split_path = best_result_path.split("/")
split_path[-1] = split_path[-1].replace("eval_", "")
best_result_path = "/".join(split_path)
best_result_path

In [None]:
! python "$(pwd)/eval_results/evaluation.py" "$(pwd)/eval_results/data_updated_500.json" "$(pwd)/eval_results/optimize_results/pred_400_all-MiniLM-L6-v2_gpt-3.5-turbo.json" --out-file "$(pwd)/eval_results/optimize_results/eval_pred_400_all-MiniLM-L6-v2_gpt-3.5-turbo.json"

In [None]:
df_merged = merge_results(f1_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/f1_thresh_by_qid.json"), 
                          exact_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/exact_thresh_by_qid.json"), 
                          pred_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"), 
                          filepath_500=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"),
                          context_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/context_500_400_text-embedding-3-large_gpt-3.5-turbo.json"), 
                          df_questions_filepath="dataset.csv", 
                          filter_500=True)

df_merged.shape

In [None]:
df_merged.columns

In [None]:
df_merged[df_merged.is_impossible][["id", "is_impossible", "f1_score", "exact_score", "question", "pred"]].tail(10)

In [None]:
print(df_merged.loc[479, "context"].replace(". ", ".\n"))

In [None]:
print(df_merged.loc[479, "rag_retrieved_context"])

# Evaluate with other LLMS

In [None]:
# run the RAG with best parameters, and save also the context
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": { 
        "text-embedding-3-small": "OpenAI", 
        },
    "models": {
       # "meta/meta-llama-3-70b-instruct":
       "anthropic/claude-3.5-sonnet":
        "Replicate"}
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
vector_db_folder = os.path.join(os.getcwd(), "vector_databases")

if not os.path.exists(results_folder):
    os.mkdir(results_folder)

df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

optimize_rag_parameters(
    df_to_test, 
    langchain_docs, 
    parameters_dict,
    results_folder=results_folder, 
    vector_db_folder=vector_db_folder
)

# Evaluate RAG SOTA embeddings: snowflake-artic-embed-l-v2.0

In [None]:
# very slow - likely due to GPU/CPU memory issues
# TODO: use cloud computing

In [None]:
# run the RAG with SOTA embeddings 
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": { 
      #  "Snowflake/snowflake-arctic-embed-l-v2.0": "SentenceTransformers" # ranked 6th, 568M params, released in december 2024 
        "all-MiniLM-L6-v2": "SentenceTransformers"
        },
    "models": {"gpt-3.5-turbo": "OpenAI"}
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
vector_db_folder = os.path.join(os.getcwd(), "vector_databases")

if not os.path.exists(results_folder):
    os.mkdir(results_folder)

df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

optimize_rag_parameters(
    df_to_test, 
    langchain_docs, 
    parameters_dict,
    results_folder=results_folder, 
    vector_db_folder=vector_db_folder
)

# Explore replicate 

In [None]:
# allows to run generative AI models in the Cloud 
# Claude-3-5-sonnet is the best llm for short context (less than 5k tokens) according to this research: https://www.galileo.ai/blog/best-llms-for-rag
# the second one is llama-3-70b-instruct
# TODO: try/compare other cloud providers. Try also HuggingFace inference API

In [None]:
custom_system_prompt = f"""
You are a highly accurate and reliable assistant. Answer the user's question using **only** the provided context.
If the answer is not in the context, return an empty response (**""**) without making up information.

Context:
%s

Instructions:
- Answer concisely and precisely.
- If the answer is explicitly stated in the context, extract it as-is.
- If the answer is not in the context, return **""** (empty string).
- Do **not** infer, assume, or add external information.

Example:
    **Question:** What is the capital of Italy?
    **Answer:** Rome

Question: %s
Answer (just the answer, no extra words, or "" if unknown):
"""

query = "Who were the normans?"


In [None]:
import replicate

output = replicate.run(
   # "anthropic/claude-3.5-sonnet", 
   "meta/meta-llama-3-70b-instruct",
    input={
    "prompt": query,
    "system_prompt": custom_system_prompt,
    "max_tokens": 512,
    "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format(system_prompt=custom_system_prompt, prompt="{prompt}"),})

output_merged = "".join(s for s in output if s not in ['\n', '\t', '\r', '""'])
output_merged

# Trye Qdrant vector database

In [None]:
# supports hybrid search: combine sparse (ie TF-IDF, BM25) and dense vectors 
# supports reranking and more advanced search strategies for RAG

In [13]:
from vector_store import QdrantVectorStore

df = pd.read_csv("dataset.csv")

print(df.shape)

(11873, 14)


In [15]:
df = df.loc[5937:]
df.index.max()


11872

In [17]:

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

parameters_dict = {
    "chunk_size": 400,
    "chunk_overlap": 15,
    "vector_database": "qdrant",
    "sparse_text_model": "Qdrant/bm25",
    "embeddings_function": {
        "model_name": "text-embedding-3-small",    
        "platform": "OpenAI"
        }, 
}


qdrant_vs = QdrantVectorStore(knowledge_base=langchain_docs, config_dict=parameters_dict)
qdrant_vs.create_vector_store()

Collection exists! Skipping creation!
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=588 points_count=588 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors={'text-dense': VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'text-sparse': SparseVectorParams(index=SparseIndexParams(full_scan_threshold=None, on_disk=False, datatype=None), modifier=None)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_

In [24]:
# check the content of the vector datbae
res = qdrant_vs.client.scroll(
    collection_name=qdrant_vs.vector_database_name,
    limit=1,
    with_payload=True,
    with_vectors=True,
)

res[0][0].vector

{'text-dense': [-0.009786591,
  0.060062587,
  0.05024297,
  0.007111516,
  -0.022457419,
  0.008718763,
  -0.0025773742,
  -0.026398476,
  -0.014861527,
  0.04557535,
  0.03888216,
  -0.01507069,
  -0.010204915,
  -0.009621463,
  -0.00065294397,
  0.015323886,
  0.001717332,
  0.016380705,
  0.029216662,
  0.028468082,
  0.022039095,
  0.023117932,
  -0.010204915,
  -0.02787362,
  0.0049951244,
  0.02930473,
  0.013892775,
  0.01303411,
  0.012373597,
  -0.02644251,
  -0.03434664,
  -0.007518832,
  0.030713823,
  0.053985875,
  0.0036603392,
  -0.050595243,
  0.030933993,
  -0.00204346,
  -0.030911976,
  0.06411373,
  -0.027213108,
  -0.05609951,
  0.060150657,
  0.043968104,
  -0.018560397,
  0.014663373,
  0.02606822,
  -0.018593421,
  -0.037979458,
  0.009990249,
  -0.0073481994,
  -0.008581156,
  -0.00964348,
  0.023095913,
  -0.0010939735,
  0.053325363,
  0.00990218,
  -0.009252677,
  7.340459e-05,
  -0.058433324,
  -0.0025842544,
  -0.04308742,
  0.020156635,
  -0.030405585,
  

In [8]:
qdrant_vs.delete_vector_store()

Deleting the collection
Collections before deletion:  collections=[CollectionDescription(name='my_documents'), CollectionDescription(name='400_text-embedding-3-small'), CollectionDescription(name='qdrant_test'), CollectionDescription(name='400_text-embedding-3-large')]
Collections after deletion:  collections=[CollectionDescription(name='my_documents'), CollectionDescription(name='qdrant_test'), CollectionDescription(name='400_text-embedding-3-large')]


In [3]:
context = qdrant_vs.query_vector_store("normans")
context

id='33415122-da0c-4aae-88d9-1d1caded5d48' version=0 score=0.8333334 payload={'context': 'The Normans thereafter adopted the growing feudal doctrines of the rest of France, and worked them into a functional hierarchical system in both Normandy and in England. The new Norman rulers were culturally and ethnically distinct from the old French aristocracy, most of whom traced their lineage to Franks of the Carolingian dynasty. Most Norman knights remained poor and land-hungry, and by 1066 Normandy had been exporting fighting horsemen for more than a generation. Many Normans of Italy, France and England eventually served as avid Crusaders under the Italo-Norman prince Bohemund I and the Anglo-Norman king Richard the Lion-Heart.'} vector=None shard_key=None order_value=None

id='4ce66c78-165a-4a08-b8d2-0af8ead65cb1' version=0 score=0.6666667 payload={'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their nam

'The Normans thereafter adopted the growing feudal doctrines of the rest of France, and worked them into a functional hierarchical system in both Normandy and in England. The new Norman rulers were culturally and ethnically distinct from the old French aristocracy, most of whom traced their lineage to Franks of the Carolingian dynasty. Most Norman knights remained poor and land-hungry, and by 1066 Normandy had been exporting fighting horsemen for more than a generation. Many Normans of Italy, France and England eventually served as avid Crusaders under the Italo-Norman prince Bohemund I and the Anglo-Norman king Richard the Lion-Heart.\n\nThe Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles