## Setup

In [1]:
import pandas as pd
import ast
import asyncio
import google.generativeai as genai
import numpy as np
from typing import List, Dict, Any
import torch
from tqdm import tqdm 
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import os
import yaml




In [35]:
from dotenv import load_dotenv
load_dotenv()  

True

In [33]:
df = pd.read_csv(r'C:\Users\admin\Desktop\AIO\ParetoRAG\data\validation_data.csv')
df = df.head(300)

## Embedding

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer('all-MiniLM-L6-v2')  
model = model.to(device)

In [12]:
BATCH_SIZE = 64 

In [13]:
async def get_embeddings_batch(texts: List[str], batch_size: int = 32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_emb = await asyncio.to_thread(model.encode, batch, convert_to_numpy=True)
        embeddings.extend(batch_emb)
    return np.array(embeddings)

In [14]:
async def process_row_async(row: pd.Series) -> List[Dict[str, Any]]:
    sentences_dict = ast.literal_eval(row.values[5])
    results = []
    question_id, question = row.values[0], row.values[1]
    
    for title, sentences in sentences_dict.items():
        contexts = [' '.join(s for s in sentences if s != sent) for sent in sentences]        
        core_embs = await get_embeddings_batch(sentences)
        context_embs = await get_embeddings_batch(contexts)
        weighted_embs = 0.8 * core_embs + 0.2 * context_embs
        
        results.extend([{
            'question_id': question_id,
            'question': question,
            'title': title,
            'sentence': sent,
            'weighted_embedding': w_emb
        } for sent, w_emb in zip(sentences, weighted_embs)])
    
    return results

In [15]:
async def process_all_rows(df: pd.DataFrame) -> pd.DataFrame:
    tasks = [process_row_async(row) for _, row in df.iterrows()]
    results = await tqdm.gather(*tasks, desc="Processing rows")
    all_results = [item for sublist in results for item in sublist]
    return pd.DataFrame(all_results)

In [19]:
embeddings_df = await process_all_rows(df)

Processing rows: 100%|██████████| 300/300 [01:47<00:00,  2.78it/s]


In [20]:
embeddings_df

Unnamed: 0,question_id,question,title,sentence,weighted_embedding
0,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same ...,Adam Collis,Adam Collis Adam Collis is an American filmmak...,"[-0.060843006, -0.07372631, 0.030862343, -0.04..."
1,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same ...,Adam Collis,Adam Collis He attended the Duke University f...,"[-0.0066994987, -0.06440425, 0.049298123, -0.0..."
2,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same ...,Adam Collis,Adam Collis He also studied cinema at the Uni...,"[-0.01357059, -0.09846686, 0.007369948, -0.025..."
3,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same ...,Adam Collis,Adam Collis Collis first work was the assista...,"[-0.12149089, -0.053364065, 0.029267848, -0.03..."
4,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same ...,Adam Collis,"Adam Collis In 1998, he played ""Crankshaft"" i...","[-0.08738548, -0.04439761, -0.032737877, -0.06..."
...,...,...,...,...,...
12803,5adfbca255429942ec259b2c,"When did the rock band that sang ""All Join Han...",Tom Morello discography,"Tom Morello discography After graduating """" f...","[-0.038539037, -0.06785561, -0.031276815, -0.0..."
12804,5adfbca255429942ec259b2c,"When did the rock band that sang ""All Join Han...",Tom Morello discography,Tom Morello discography Later Adam Jones move...,"[-0.105251186, -0.07850574, -0.06657188, -0.05..."
12805,5adfbca255429942ec259b2c,"When did the rock band that sang ""All Join Han...",Tom Morello discography,Tom Morello discography In the late 80's More...,"[-0.1281535, -0.02820184, -0.07985569, -0.0425..."
12806,5adfbca255429942ec259b2c,"When did the rock band that sang ""All Join Han...",Tom Morello discography,Tom Morello discography In 1991 Morello left ...,"[-0.110214725, -0.046852585, -0.07335095, -0.0..."


In [24]:
question_df = df[['question_id', 'question']]

async def add_question_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    questions = df['question'].tolist()
    question_embeddings = await get_embeddings_batch(questions, batch_size=BATCH_SIZE)
    df['question_embedding'] = question_embeddings.tolist()
    return df

In [None]:
question_df = await add_question_embeddings(question_df)
question_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_embedding'] = question_embeddings.tolist()


Unnamed: 0,question_id,question,question_embedding
0,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same ...,"[-0.07346513122320175, -0.01779698207974434, -..."
1,5a8c7595554299585d9e36b6,What government position was held by the woman...,"[-0.06768488883972168, -0.04196927696466446, -..."
2,5a85ea095542994775f606a8,"What science fantasy young adult series, told ...","[-0.011420877650380135, 0.02307860180735588, -..."
3,5adbf0a255429947ff17385a,Are the Laleli Mosque and Esma Sultan Mansion ...,"[0.09680207818746567, 0.031308747828006744, -0..."
4,5a8e3ea95542995a26add48d,"The director of the romantic comedy ""Big Stone...","[0.061934322118759155, -0.044251229614019394, ..."
...,...,...,...
295,5a835c9f5542992ef85e228a,What army did the namesake of the ship launche...,"[-0.02701864205300808, 0.03475065529346466, -0..."
296,5ae7edee554299540e5a56ad,The Church of the Guanche People was founded i...,"[0.0434095524251461, -0.0071853455156087875, -..."
297,5a8ee4315542990e94052ba7,What officially ended the first phase of the m...,"[-0.08006496727466583, 0.030587373301386833, 0..."
298,5a8cb288554299585d9e3726,"The mass killing that took place at Oakland, C...","[0.11990100890398026, 0.04099593311548233, 0.0..."


## Embedding Similarity a.k.a Top K selection

In [27]:
def consine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    vec1 = torch.tensor(vec1, dtype=torch.float32)
    vec2 = torch.tensor(vec2, dtype=torch.float32)
    return torch.nn.functional.cosine_similarity(vec1, vec2, dim=0).item()

In [None]:
for i, row in embeddings_df.iterrows():
    weighted_encoding = row['weighted_embedding']
    question_encoding = question_df.loc[question_df['question_id'] == row['question_id'], 'question_embedding'].values[0]
    similarity = consine_similarity(weighted_encoding, question_encoding)
    embeddings_df.at[i, 'similarity'] = similarity

In [28]:
def batch_cosine_similarity(embeddings_df: pd.DataFrame, question_df: pd.DataFrame) -> pd.DataFrame:
    weighted_encodings = np.stack(embeddings_df['weighted_embedding'].values)
    
    question_embeddings_dict = dict(zip(
        question_df['question_id'],
        question_df['question_embedding']
    ))
    
    question_encodings = np.stack([
        question_embeddings_dict[qid] 
        for qid in embeddings_df['question_id']
    ])
    
    weighted_tensor = torch.tensor(weighted_encodings, dtype=torch.float32)
    question_tensor = torch.tensor(question_encodings, dtype=torch.float32)
    
    similarities = torch.nn.functional.cosine_similarity(
        weighted_tensor, 
        question_tensor,
        dim=1
    ).numpy()
    
    embeddings_df['similarity'] = similarities
    return embeddings_df

In [29]:
embeddings_df = batch_cosine_similarity(embeddings_df, question_df)

In [52]:
# Sort within each question_id group and display relevant columns
grouped_results = (embeddings_df
    .groupby('question_id', group_keys=False)
    .apply(lambda x: x.sort_values('similarity', ascending=False))
    .reset_index(drop=True)
)[['question_id', 'question', 'title', 'sentence', 'similarity']]

# Display results
grouped_results

  .apply(lambda x: x.sort_values('similarity', ascending=False))


Unnamed: 0,question_id,question,title,sentence,similarity
0,5a713a5a5542994082a3e6a9,What football club plays in the area between t...,"Old Trafford, Greater Manchester","Old Trafford, Greater Manchester The crossroa...",0.708351
1,5a713a5a5542994082a3e6a9,What football club plays in the area between t...,"Old Trafford, Greater Manchester","Old Trafford, Greater Manchester Old Trafford ...",0.567476
2,5a713a5a5542994082a3e6a9,What football club plays in the area between t...,Trafford Bar tram stop,Trafford Bar tram stop Trafford Bar is a tram ...,0.561708
3,5a713a5a5542994082a3e6a9,What football club plays in the area between t...,Gorse Hill,Gorse Hill Gorse Hill shares a border with Ol...,0.513169
4,5a713a5a5542994082a3e6a9,What football club plays in the area between t...,Trafford Bar tram stop,Trafford Bar tram stop It opened on 15 June 1...,0.481306
...,...,...,...,...,...
12803,5ae82ae555429952e35eaa71,"Martin Patterson ""Pat"" Hingle was a close frie...",Dirás que estoy loco,Dirás que estoy loco The song was originally ...,0.058856
12804,5ae82ae555429952e35eaa71,"Martin Patterson ""Pat"" Hingle was a close frie...",Dirás que estoy loco,Dirás que estoy loco It was released two year...,0.011634
12805,5ae82ae555429952e35eaa71,"Martin Patterson ""Pat"" Hingle was a close frie...",Dirás que estoy loco,"Dirás que estoy loco ""Diras que estoy loco"" is...",0.009202
12806,5ae82ae555429952e35eaa71,"Martin Patterson ""Pat"" Hingle was a close frie...",Dirás que estoy loco,Dirás que estoy loco It was the lead single f...,-0.000019


In [None]:
top_10 = (grouped_results
    .groupby('question_id')
    .head(10)
    .reset_index(drop=True)
)

## LLM (generating answer)

In [8]:
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [9]:
model = genai.GenerativeModel("gemini-1.5-flash")

In [15]:
question_chunks = {}

In [16]:
for i, row in top_10.iterrows():
    question = row['question']
    if question not in question_chunks:
        question_chunks[question] = []
    question_chunks[question].append(row['sentence'])

In [27]:
with open(r'C:\Users\admin\Desktop\AIO\ParetoRAG\prompt\prompt_template.yaml', 'r', encoding='utf-8') as file:
    prompt_template = yaml.safe_load(file)

In [30]:
system_prompt = prompt_template['prompt']['pareto_rag'][0]['prompt_template']
user_prompt = prompt_template['prompt']['pareto_rag'][1]['prompt_template']

In [None]:
import time
from tqdm import tqdm

answers = {}

# Combine system and user prompts
for k, v in tqdm(question_chunks.items(), desc="Processing questions"):
    # Combine prompts into single user message
    combined_prompt = f"{system_prompt}\n\nQuestion: {k}\nContext: {' '.join(v)}"
    
    try:
        response = model.generate_content(
            combined_prompt,
            generation_config={
                "temperature": 0.3,
                "top_p": 0.8,
                "top_k": 40,
                "max_output_tokens": 1024,
            }
        )
        
        answers[k] = response.text
        # print(f"\nProcessed: {k[:50]}...")
        time.sleep(10)  # Rate limiting
        
    except Exception as e:
        print(f"\nError processing question '{k[:50]}...': {str(e)}")
        answers[k] = f"Error: {str(e)}"

# Create DataFrame with results
answers_df = pd.DataFrame(list(answers.items()), columns=['question', 'answer'])

In [34]:
final_df = answers_df.head(41)
final_df['ground_truth'] = df['answer'].head(41).values
final_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['ground_truth'] = df['answer'].head(41).values


Unnamed: 0,question,answer,ground_truth
0,What football club plays in the area between t...,Manchester United F.C.\n,yes
1,Alvaro Mexia had a diplomatic mission with whi...,Ais\n,Chief of Protocol
2,What is the name of the fight song of the univ...,The provided text does not contain the name of...,Animorphs
3,"Who has written more than 300 papers, Semyon A...",Patrick Corrigan\n,no
4,Who was the writer of These Boots Are Made for...,"Lee Hazlewood wrote ""These Boots Are Made for ...","Greenwich Village, New York City"
5,who is younger Keith Bostic or Jerry Glanville ?,"Keith Bostic was born on January 17, 1961. Th...",YG Entertainment
6,"Which board game was published most recently, ...","The provided text states that ""The Settlers of...",Eenasul Fateh
7,"Who was born first, Erika Jayne or Marco Da Silva",Erika Jayne\n,"3,677 seated"
8,What film was written and directed by Joby Har...,Robin Hood (2018 film)\n,Terry Richardson
9,What american actress/singer born in 1956 reco...,Paige O'Hara\n,yes
