In [1]:
import json
import os 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from rag_optimization import convert_knowledge_base_to_langchain_docs, optimize_rag_parameters
from data_utils import convert_json_to_dataframe, create_json_subset, collect_all_results, merge_results
sns.set_style("whitegrid")
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

%load_ext autoreload

In [None]:
# Useful material 
# SQuAD Evaluation guidelines: 
# https://worksheets.codalab.org/worksheets/0x8212d84ca41c4150b555a075b19ccc05/
# https://rajpurkar.github.io/SQuAD-explorer/

# Convert json data to pandas dataframe 

In [None]:
convert_json_to_dataframe()

In [None]:
df_all_data = pd.read_csv("dataset.csv")
df_all_data.shape

In [None]:
df_selected = df_all_data.copy()

In [None]:
# Rough size statistics per context passage: number of characters and number of
# single-space-separated tokens.
context_text = df_selected["context"]
df_selected.loc[:, "context_chars"] = context_text.apply(len)
df_selected.loc[:, "context_words"] = context_text.apply(lambda text: len(text.split(" ")))

In [None]:
df_selected.head(2)

In [None]:
# Distribution of context length in characters; each distinct context is counted once.
fig, ax = plt.subplots(figsize=[8, 5])
sns.histplot(df_selected.drop_duplicates(subset="context")["context_chars"], ax=ax)
ax.set(
    title="Context length (characters)",
    xlabel="Characters per context",
    ylabel="Number of contexts",
);

In [None]:
# Distribution of context length in words; each distinct context is counted once.
fig, ax = plt.subplots(figsize=[8, 5])
sns.histplot(df_selected.drop_duplicates(subset="context")["context_words"], ax=ax)
ax.set(
    title="Context length (words)",
    xlabel="Words per context",
    ylabel="Number of contexts",
);

In [None]:
# create original json structure for only a subset of questions, used for tests and fine-tuning 
# this file will be used by the evaluation.py file 

df = pd.read_csv("dataset.csv")
# Only the first 500 rows are exported.
df_sel = df[0:500]

# (A bare `df_sel.head(2)` used to sit here; as a mid-cell expression it was never
# displayed, so it has been dropped.)
create_json_subset(df_sel)

# RAG architecture

Steps:

**Data Indexing**

Converting text data into a searchable database of vector embeddings, which represent the meaning of the text in a format that computers can easily understand.
- **Documents Chunking**: The collection of documents is split into smaller chunks of text. This allows for more precise and relevant pieces of information to be fed into the language model when needed, avoiding information overload.
- **Vector Embeddings**: The chunks of text are then transformed into vector embeddings. These embeddings encode the meaning of natural language text into numerical representations.
- **Vector Database**: Finally, the vector embeddings are stored in a vector database, making them easily searchable.

**Documents -> Text chunks -> Vector Embeddings -> Vector DB**

**Load -> Split -> Embed -> Store**

## Convert the pandas context to Langchain documents 

In [None]:
# Reload the dataset from disk and turn it into the document list used as the
# RAG knowledge base (conversion logic lives in rag_optimization).
df = pd.read_csv("dataset.csv")

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

In [None]:
print(len(langchain_docs))
print(langchain_docs[0])
print(langchain_docs[1])

## Vector database

In [None]:
# NOTE(review): convert_knowledge_base_to_langchain_docs is already imported in the first
# cell; only CustomRAG and prompt_message are new names here.
from rag_optimization import CustomRAG, prompt_message, convert_knowledge_base_to_langchain_docs

# Configuration handed to CustomRAG as `config_dict`.
# NOTE(review): the unit of chunk_size / chunk_overlap (characters vs tokens) is decided
# inside CustomRAG and is not visible from this notebook — confirm.
parameters_dict = {
    "chunk_size": 400,
    "chunk_overlap": 15,
    "vector_database": "chromadb",
    "embeddings_function": {
        "model_name": "text-embedding-3-large",    
        "platform": "OpenAI"
        }, 
    "llm": {
        "model_name": "gpt-3.5-turbo",
        "client": "OpenAI"
        }
}

# NOTE(review): df_to_test is not passed to CustomRAG below and is reassigned in a later
# cell before it is used.
df_to_test = df[0:10]

# `rag` is the object all following cells in this section operate on
# (rag.vector_store.*, rag.get_llm_multiple_questions_answers).
rag = CustomRAG(knowledge_base=langchain_docs, 
                prompt_message=prompt_message,
                config_dict=parameters_dict, 
                results_folder='./eval_results/test_new_class', 
                vector_db_folder='./vector_databases/')

In [None]:
rag.vector_store.initialize_embeddings_function()

In [None]:
# NOTE(review): `embeddings_model`, `embeddings` and `db_dir` are assigned here but are not
# passed to the call at the end of this cell. `rag` was configured with
# "text-embedding-3-large", so this "text-embedding-3-small" instance looks unused — confirm
# and remove if so.
embeddings_model = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embeddings_model)

db_dir = os.path.join(os.getcwd(), "vector_databases")

# Presumably builds/persists the vector database from the knowledge base and config
# given to `rag` — see CustomRAG for the details.
rag.vector_store.create_vector_database()

## Querying the vector database 

In [None]:
# Query on a topic unrelated to the "normans" query used elsewhere in this notebook, to see
# what the store returns with score_threshold=0.1 (at most 3 results requested).
query = "How is the weather today in Milan?"
relevant_docs = rag.vector_store.query_vector_store(query, n_results=3, score_threshold=0.1)

print(relevant_docs)

In [None]:
# Same retrieval call with a different question (at most 3 results requested).
# Note: this rebinds the notebook-global names `query` and `context`.
query = "Who were the normans?"
context = rag.vector_store.query_vector_store(query, n_results=3, score_threshold=0.1)

print(context)

## Run the RAG over a subset of questions and save the answers 

In [None]:
df_to_test = pd.read_csv("dataset.csv")
df_to_test = df_to_test[0:5]

In [None]:

rag.get_llm_multiple_questions_answers(df_to_test)

## RAG Fine-tuning 

TEXT CHUNKING 

1. CHARACTER SPLITTING : divide the text into N-character sized chunks. Can split words in the middle. 
2. RECURSIVE CHARACTER SPLITTING: preserves sentences and avoids splitting in the middle of a word (this is what `RecursiveCharacterTextSplitter` does with its list of separators). The
document is first split where a double new line is present; then, if the chunk size is still exceeded, it is split at single new lines, and so on.
3. SEMANTIC SPLITTING: keeps related content together. Use embeddings to split based on meaning.
+ other techniques

EMBEDDINGS 
Create a fixed-length vector representation of text that captures its semantic meaning, for tasks like similarity comparison. 
Most up to date embedding models, both proprietary and open source, with performance metrics across different tasks: https://huggingface.co/spaces/mteb/leaderboard.

The leaderboard also contains a "retrieval" column with performance metrics. 


In [None]:
# Reload the full dataset: the first 500 rows are passed as the questions to test, while
# the LangChain documents are built from the whole dataframe.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

# NOTE(review): no parameters dict is passed here, whereas the other calls in this notebook
# pass one as the third positional argument — presumably this relies on defaults inside
# optimize_rag_parameters; confirm.
# NOTE(review): results are written under "eval_results/test_new_class", but the next cell
# collects results from "eval_results/optimize_results".
optimize_rag_parameters(
    df_to_test, 
    langchain_docs, 
    results_folder="eval_results/test_new_class", 
    vector_db_folder="vector_databases/test_new_class",
)

In [None]:
# Gather every experiment's scores, rank them by F1 on answerable questions (best first),
# and persist the ranked table next to the raw results.
path = "eval_results/optimize_results"
df_all_res = collect_all_results(path).sort_values(by="HasAns_f1", ascending=False)
df_all_res.to_csv(f"{path}/df_all_results.csv", index=False)

In [None]:
df_all_res

# Investigate the results

In [None]:
# Pick the best results and merge the scores by question id to the original df in order to inspect the errors.
# The idea is to understand why the results are so poor for the NoAns questions, when the HasAns questions have 
# a high f1 score, in order to understand how the workflow can be optimized

In [None]:
# Path of the best-scoring run. df_all_res is sorted by HasAns_f1 in descending order, so
# row 0 is the best experiment. The "eval_" marker is removed from the file name only.
# os.path.split is used instead of splitting the full path on "/": the directory part also
# contains "eval_" ("eval_results", "initial_eval_results") and must never be rewritten,
# which the "/"-split could do when the OS path separator is not "/".
# NOTE(review): results are collected from "eval_results/optimize_results" in the cell above,
# but this path points at "eval_results/initial_eval_results" — confirm the folder.
experiment_dir, experiment_file = os.path.split(df_all_res.experiment.iloc[0])
best_result_path = os.path.join(
    os.getcwd(),
    "eval_results/initial_eval_results",
    experiment_dir,
    experiment_file.replace("eval_", ""),
)
best_result_path

In [None]:
! python "$(pwd)/eval_results/evaluation.py" "$(pwd)/eval_results/data_updated_500.json" "$(pwd)/eval_results/optimize_results/pred_400_all-MiniLM-L6-v2_gpt-3.5-turbo.json" --out-file "$(pwd)/eval_results/optimize_results/eval_pred_400_all-MiniLM-L6-v2_gpt-3.5-turbo.json"

In [None]:
# Merge the per-question F1 / exact-match scores, the predictions and the retrieved context
# with the original questions so individual errors can be inspected.
# NOTE(review): `pred_filepath` and `filepath_500` are given the very same file
# (pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json) — confirm that `filepath_500`
# is not meant to point at a different file.
df_merged = merge_results(f1_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/f1_thresh_by_qid.json"), 
                          exact_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/exact_thresh_by_qid.json"), 
                          pred_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"), 
                          filepath_500=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/pred_500_400_text-embedding-3-large_gpt-3.5-turbo.json"),
                          context_filepath=os.path.join(os.getcwd(), "eval_results/debugging_eval_results/context_500_400_text-embedding-3-large_gpt-3.5-turbo.json"), 
                          df_questions_filepath="dataset.csv", 
                          filter_500=True)

df_merged.shape

In [None]:
df_merged.columns

In [None]:
df_merged[df_merged.is_impossible][["id", "is_impossible", "f1_score", "exact_score", "question", "pred"]].tail(10)

In [None]:
print(df_merged.loc[479, "context"].replace(". ", ".\n"))

In [None]:
print(df_merged.loc[479, "rag_retrieved_context"])

# Evaluate with other LLMs

In [None]:
# run the RAG with best parameters, and save also the context
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": {
        "text-embedding-3-small": "OpenAI",
    },
    "models": {
        # "meta/meta-llama-3-70b-instruct":
        "anthropic/claude-3.5-sonnet": "Replicate",
    },
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
vector_db_folder = os.path.join(os.getcwd(), "vector_databases")

# makedirs(exist_ok=True) also creates the "eval_results" parent when it is missing
# (os.mkdir would raise FileNotFoundError) and is a no-op when the folder already exists.
os.makedirs(results_folder, exist_ok=True)

# Questions to test: the first 500 rows. The knowledge base is built from the full dataset.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

optimize_rag_parameters(
    df_to_test,
    langchain_docs,
    parameters_dict,
    results_folder=results_folder,
    vector_db_folder=vector_db_folder,
)

# Evaluate RAG with SOTA embeddings: snowflake-arctic-embed-l-v2.0

In [None]:
# very slow - likely due to GPU/CPU memory issues
# TODO: use cloud computing

In [None]:
# run the RAG with SOTA embeddings 
parameters_dict = {
    "chunk_sizes": [400],
    "embed_options": {
        # "Snowflake/snowflake-arctic-embed-l-v2.0": "SentenceTransformers" # ranked 6th, 568M params, released in december 2024
        "all-MiniLM-L6-v2": "SentenceTransformers",
    },
    "models": {"gpt-3.5-turbo": "OpenAI"},
}

results_folder = os.path.join(os.getcwd(), "eval_results/optimize_results")
vector_db_folder = os.path.join(os.getcwd(), "vector_databases")

# makedirs(exist_ok=True) also creates the "eval_results" parent when it is missing
# (os.mkdir would raise FileNotFoundError) and is a no-op when the folder already exists.
os.makedirs(results_folder, exist_ok=True)

# Questions to test: the first 500 rows. The knowledge base is built from the full dataset.
df = pd.read_csv("dataset.csv")
df_to_test = df[0:500]

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

optimize_rag_parameters(
    df_to_test,
    langchain_docs,
    parameters_dict,
    results_folder=results_folder,
    vector_db_folder=vector_db_folder,
)

# Explore replicate 

In [None]:
# allows to run generative AI models in the Cloud 
# Claude-3-5-sonnet is the best llm for short context (less than 5k tokens) according to this research: https://www.galileo.ai/blog/best-llms-for-rag
# the second one is llama-3-70b-instruct
# TODO: try/compare other cloud providers. Try also HuggingFace inference API

In [None]:
# Prompt template with two printf-style "%s" placeholders, in this order:
#   1. the retrieved context, 2. the user question.
# Fill it with `custom_system_prompt % (context, question)`.
# It is deliberately a plain string (not an f-string): there are no `{}` fields to
# interpolate, and an f-prefix would start evaluating any braces added to the text later.
custom_system_prompt = """
You are a highly accurate and reliable assistant. Answer the user's question using **only** the provided context.
If the answer is not in the context, return an empty response (**""**) without making up information.

Context:
%s

Instructions:
- Answer concisely and precisely.
- If the answer is explicitly stated in the context, extract it as-is.
- If the answer is not in the context, return **""** (empty string).
- Do **not** infer, assume, or add external information.

Example:
    **Question:** What is the capital of Italy?
    **Answer:** Rome

Question: %s
Answer (just the answer, no extra words, or "" if unknown):
"""

query = "Who were the normans?"


In [None]:
import replicate

# Send the question to a model hosted on Replicate.
# NOTE(review): `custom_system_prompt` is passed as-is, so its two "%s" placeholders
# (context and question) are never filled in and the model receives them literally.
# Confirm whether the retrieved context should be interpolated first.
# The system prompt is also baked into `prompt_template` via .format(); only the
# "{prompt}" placeholder is left in the template string that is sent.
output = replicate.run(
   # "anthropic/claude-3.5-sonnet", 
   "meta/meta-llama-3-70b-instruct",
    input={
    "prompt": query,
    "system_prompt": custom_system_prompt,
    "max_tokens": 512,
    "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format(system_prompt=custom_system_prompt, prompt="{prompt}"),})

# `output` is iterated piece by piece; pieces that are exactly a newline, tab, carriage
# return or '""' are dropped and the remaining pieces are concatenated.
output_merged = "".join(s for s in output if s not in ['\n', '\t', '\r', '""'])
output_merged

# Try Qdrant vector database

In [None]:
# supports hybrid search: combine sparse (ie TF-IDF, BM25) and dense vectors 
# supports reranking and more advanced search strategies for RAG

In [None]:
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_qdrant import QdrantVectorStore
from langchain_openai import OpenAIEmbeddings

In [None]:
# Rebuild the knowledge-base documents from the dataset on disk.
df = pd.read_csv("dataset.csv")

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

# Dense embeddings (OpenAI, library-default model) and sparse BM25 embeddings; both are
# passed to the Qdrant vector store in the hybrid-retrieval cell below.
embeddings = OpenAIEmbeddings()
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/BM25")

In [None]:
embeddings

In [None]:
# Note that if you’ve added documents with HYBRID mode, you can switch to any retrieval mode when searching. Since both the dense and sparse vectors are available in the collection.


# One-off creation of the local collection, kept as a disabled snippet (string literal).
# Un-quote and run it once if "./vector_databases_qdrant/" does not contain the collection yet.
"""
vector_store = QdrantVectorStore.from_documents(
    langchain_docs,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    #location=":memory:",  # Local mode with in-memory storage only
    path="./vector_databases_qdrant/",  # Local mode, without using the Qdrant server, may also store your vectors on disk so they’re persisted between runs.
    collection_name="my_documents",
    retrieval_mode=RetrievalMode.HYBRID, # RetrievalMode.DENSE, RetrievalMode.SPARSE
)"""


# Open the collection that already exists in the local on-disk store.
vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    collection_name="my_documents",
    path="./vector_databases_qdrant/",
    retrieval_mode=RetrievalMode.HYBRID
)


#You can also transform the vector store into a retriever for easier usage in your chains.
#retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 1})
#retriever.invoke(query)

query = "Who were the normans?"
found_docs = vector_store.similarity_search_with_score(query, k=3)


for doc, score in found_docs:
    # ".3f" = three decimals. The previous "3f" was a minimum *width* of 3 with the
    # default six decimals, which is almost certainly not what was intended.
    print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")

In [None]:
# metadata filtering
from qdrant_client import models

# Similarity search restricted by a Qdrant payload filter: a single `should` condition
# matching the payload key "page_content" against one exact value.
# NOTE(review): the match value ("The top 10 soccer players ...") looks like a leftover from
# an example and is unlikely to match any document of this dataset — confirm.
# `query` is whatever an earlier cell last assigned to it.
results = vector_store.similarity_search(
    query=query,
    k=1,
    filter=models.Filter(
        should=[
            models.FieldCondition(
                key="page_content",
                match=models.MatchValue(
                    value="The top 10 soccer players in the world right now."
                ),
            ),
        ]
    ),
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

In [None]:
from qdrant_client import QdrantClient
from qdrant_client import models


In [None]:
# Split the LangChain documents into two parallel lists: raw texts and their metadata.
print(len(langchain_docs))

docs = [document.page_content for document in langchain_docs]
metadata = [document.metadata for document in langchain_docs]

In [None]:
# Connect to a Qdrant server at http://localhost:6333 (this cell needs it running).
client = QdrantClient(url="http://localhost:6333")

collections = client.get_collections()

print(collections)

collection_name = "my_documents"

# WARNING: destructive — the collection is dropped on every run of this cell.
client.delete_collection(collection_name=collection_name)
collections = client.get_collections()

# NOTE(review): "here" looks like a leftover debugging print.
print("here")
print(collections)

# Dense embedding model used by the client's fastembed integration.
#client.set_model("sentence-transformers/all-MiniLM-L6-v2")
client.set_model("snowflake/snowflake-arctic-embed-s")
#client.set_model("openai/OpenAI text-embedding-3-small") # OpenAI Embeddings are supported but with a different workflow

# comment this line to use dense vectors only
client.set_sparse_model("prithivida/Splade_PP_en_v1")#"Qdrant/BM25") #"prithivida/Splade_PP_en_v1")


# Methods get_fastembed_vector_params and get_fastembed_sparse_vector_params help you to get the corresponding parameters for the models you are using. These parameters include vector size, distance function, etc.

# Without fastembed integration, you would need to specify the vector size and distance function manually. 
# NOTE(review): the collection was deleted a few lines above, so this guard is expected to be
# true on every run and the documents are re-encoded each time.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=client.get_fastembed_vector_params(),
        # comment this line to use dense vectors only
        sparse_vectors_config=client.get_fastembed_sparse_vector_params(),  
    )
    # `docs` and `metadata` are the parallel lists built in the previous cell.
    client.add(
        collection_name=collection_name,
        documents=docs,
        metadata=metadata,
        parallel=0,  # Use all available CPU cores to encode data. 
    )



"""

if not client.collection_exists(collection_name):
    
    client.create_collection(
        collection_name=collection_name,
        # TODO: review vectors_config and sparse_vectors_config
        vectors_config={
        "text-dense": models.VectorParams(
            size=1536,  # OpenAI Embeddings
            distance=models.Distance.COSINE,
            )
        },
        sparse_vectors_config={
        "text-sparse": models.SparseVectorParams(
            index=models.SparseIndexParams(
                on_disk=False,
            )
        )
        }
    )

    vector_store = QdrantVectorStore.from_documents(
        langchain_docs,
        embedding=embeddings,
        client=client,
        sparse_embedding=sparse_embeddings,
        collection_name=collection_name,
        retrieval_mode=RetrievalMode.HYBRID, # RetrievalMode.DENSE, RetrievalMode.SPARSE
    )
else:
    
    vector_store = QdrantVectorStore.from_existing_collection(
        embedding=embeddings,
        sparse_embedding=sparse_embeddings,
        client=client,
        collection_name=collection_name,
      #  path="./vector_databases_qdrant/",
        retrieval_mode=RetrievalMode.HYBRID
    )"""


In [None]:
docs[0]

In [None]:
import uuid
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct, CollectionStatus, UpdateStatus
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
from qdrant_client.http import models
from typing import List

import openai
#from openai.embeddings_utils import get_embedding

# Qdrant client for a server on localhost, port 6333. This rebinds the notebook-global `client`.
client = QdrantClient(
    url="localhost",
    port=6333,
)

# Settings for the manually managed test collection used by the functions below.
collection_name = "qdrant_test"
# 1536 is labelled "OpenAI Embeddings" elsewhere in this notebook; presumably the output
# size of the text-embedding-3-small model used in this cell — TODO confirm.
vector_size = 1536
vector_distance=Distance.COSINE

def set_up_collection(collection_name: str, vector_size: int, vector_distance: "Distance"):
    """(Re)create a Qdrant collection and return its description.

    A collection that already exists under ``collection_name`` is dropped first,
    so the collection is always freshly created when this function returns.
    Uses the notebook-level ``client``.

    Args:
        collection_name: Name of the collection to (re)create.
        vector_size: Dimension of the dense vectors stored in the collection.
        vector_distance: Distance metric, a ``qdrant_client`` ``Distance`` member
            (the previous ``str`` annotation did not match what callers pass).

    Returns:
        The value of ``client.get_collection`` for the new collection
        (previously this was fetched and then discarded).
    """
    # `recreate_collection` is deprecated in qdrant-client; perform the equivalent
    # "delete if present, then create" explicitly.
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name=collection_name)

    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=vector_distance),
    )

    return client.get_collection(collection_name=collection_name)

# Create the collection only when it is really missing. An explicit existence check
# replaces the former blanket `except Exception`, which reported *any* failure
# (e.g. the Qdrant server being unreachable) as "Collection does not exist".
if client.collection_exists(collection_name):
    collection_info = client.get_collection(collection_name=collection_name)
else:
    print("Collection does not exist, creating collection now")
    set_up_collection(collection_name, vector_size, vector_distance)

# Model name read by upsert_data() below.
embeddings_model_name = "text-embedding-3-small"

# NOTE(review): upsert_data() calls the OpenAI client directly and does not use this
# LangChain embeddings object — confirm whether it is still needed.
embeddings_function = OpenAIEmbeddings(
    model=embeddings_model_name
)

def upsert_data(data: List, client):
    """Embed each text in ``data`` with OpenAI and upsert the vectors into Qdrant.

    Reads the notebook-level ``collection_name`` and ``embeddings_model_name``.
    Every point gets a random UUID as id and carries its source text in the
    payload under ``"page_content"``, so that a search hit can be mapped back
    to the text it came from.

    Args:
        data: Texts to embed and store. One embeddings request is made per text.
        client: Qdrant client used for the upsert.

    Returns:
        None. A status line is printed instead.
    """
    # Nothing to embed: skip the OpenAI client set-up and do not send an empty upsert.
    if not data:
        print("No data to insert")
        return

    from openai import OpenAI

    openaiclient = OpenAI()
    points = []
    for doc in data:
        text_vector = openaiclient.embeddings.create(input=[doc], model=embeddings_model_name).data[0].embedding
        points.append(
            PointStruct(
                id=str(uuid.uuid4()),
                vector=text_vector,
                # Without a payload a hit is only an id and a score.
                payload={"page_content": doc},
            )
        )

    # wait=True: the call returns only once the server has applied the operation.
    operation_info = client.upsert(
        collection_name=collection_name,
        wait=True,
        points=points)

    if operation_info.status == UpdateStatus.COMPLETED:
        print("Data inserted successfully!")
    else:
        print("Failed to insert data")

In [None]:
upsert_data(docs[0:10], client)

In [None]:
docs[0]

In [None]:
def search(input_query: str, limit: int = 3):
    """Embed ``input_query`` with OpenAI and return the closest Qdrant points.

    Uses the notebook-level ``client`` and ``collection_name``. The raw search
    response is printed before the summary list is built.

    Args:
        input_query: Free-text query to embed.
        limit: Maximum number of hits to return.

    Returns:
        A list with one dict per hit, with keys ``"id"``, ``"similarity_score"``,
        ``"quote"`` and ``"person"``. The last two are ``None`` when the stored
        payload has no such keys.
    """
    from openai import OpenAI
    openaiclient = OpenAI()
    embeddings_model = "text-embedding-3-small"
    input_vector = openaiclient.embeddings.create(input=[input_query], model=embeddings_model).data[0].embedding
    search_result = client.search(
        collection_name=collection_name,
        query_vector=input_vector,
        limit=limit
    )

    print("search results:")
    print(search_result)

    result = []
    for item in search_result:
        # A point may come back without a payload; treat that as an empty one
        # instead of failing on `None.get`.
        payload = item.payload or {}
        result.append(
            {
                "id": item.id,
                "similarity_score": item.score,
                "quote": payload.get("quote"),
                "person": payload.get("person"),
            }
        )

    return result

result = search("who were the normans?")

In [2]:
# Imported under an alias: the name `QdrantVectorStore` is already bound to the
# langchain_qdrant class in an earlier cell of this notebook, and rebinding it here would
# make those cells fail when re-run in the same kernel.
from vector_store import QdrantVectorStore as CustomQdrantVectorStore

df = pd.read_csv("dataset.csv")

langchain_docs = convert_knowledge_base_to_langchain_docs(df)

# Configuration for the project's own Qdrant-backed vector store.
parameters_dict = {
    "chunk_size": 400,
    "chunk_overlap": 15,
    "vector_database": "qdrant",
    "embeddings_function": {
        "model_name": "text-embedding-3-small",
        "platform": "OpenAI",
    },
}


qdrant_vs = CustomQdrantVectorStore(knowledge_base=langchain_docs, config_dict=parameters_dict)
qdrant_vs.create_vector_store()

Collection exists! Skipping creation!
status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=10 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segme

In [3]:
qdrant_vs.delete_vector_store()

Deleting the collection
Collections before deletion:  collections=[CollectionDescription(name='my_documents'), CollectionDescription(name='qdrant_test'), CollectionDescription(name='400_text-embedding-3-small'), CollectionDescription(name='400_text-embedding-3-large')]
Collections after deletion:  collections=[CollectionDescription(name='my_documents'), CollectionDescription(name='qdrant_test'), CollectionDescription(name='400_text-embedding-3-large')]


In [4]:
qdrant_vs.create_vector_store()

Collection does not exist, creating collection now
Adding data in the database


ValidationError: 1 validation error for PointStruct
context
  Extra inputs are not permitted [type=extra_forbidden, input_value={'context': 'The Normans ... succeeding centuries.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/extra_forbidden

In [4]:
context = qdrant_vs.query_vector_store("normans")
context

[ScoredPoint(id='5a40b8dc-65ed-49e8-ba75-f05a8a57ac54', version=0, score=0.7553296, payload={}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='8eb1623c-4bd1-4117-91de-33f0577df8d6', version=0, score=0.70617324, payload={}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='22d0fbc8-db66-4344-b45d-9fd0c6275f23', version=0, score=0.6593559, payload={}, vector=None, shard_key=None, order_value=None)]