# Install packages

# Run Contriever

In [None]:
from dexter.retriever.dense.Contriever import Contriever
from dexter.config.constants import Split
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
import torch
import json
import numpy as np

# Location of the dexter configuration file passed to the dataset loader below.
config_path = "/kaggle/input/contriever/config.ini"

# Configuration setup
# Query and document sides both point at the same pretrained Contriever checkpoint.
config_instance = DenseHyperParams(
    query_encoder_path="facebook/contriever",
    document_encoder_path="facebook/contriever",
    batch_size=32
)
# NOTE(review): `config` is not referenced again in the visible cells.
config = config_instance.get_all_params()

# Data loading
# Load the DEV split; `loader.qrels()` yields the three globals every later cell
# relies on: `queries`, `qrels` and `corpus`.
loader = RetrieverDataset("wikimultihopqa", "wikimultihopqa-corpus", config_path, Split.DEV)
queries, qrels, corpus = loader.qrels()
# Quick sanity output: container type, sizes, and the first query object.
print(type(corpus))
print("queries", len(queries), len(qrels), len(corpus), queries[0])

In [None]:
# Build dexter's Contriever retriever from the hyperparameters defined above.
# NOTE(review): `tasb_search` is not used by any later visible cell, and the name
# looks inherited from a TAS-B example — confirm whether this cell is still needed.
tasb_search = Contriever(config_instance)

In [None]:
from tqdm import tqdm
from dexter.data.datastructures.evidence import Evidence
from transformers import AutoTokenizer, AutoModel
from torch.cuda.amp import autocast
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

def compute_document_embeddings_batched(corpus, document_encoder, tokenizer, batch_size=1, device="cuda"):
    """Encode every text in ``corpus`` and stack the position-0 hidden states.

    Args:
        corpus: Sequence of document strings; must support ``len`` and slicing.
        document_encoder: Model whose output exposes ``last_hidden_state``.
            It must already live on ``device``.
        tokenizer: Callable accepting a list of strings plus the
            ``return_tensors``/``padding``/``truncation`` keyword arguments and
            returning a mapping of tensors.
        batch_size: Documents per forward pass. The default of 1 keeps the
            original one-document-at-a-time behaviour; raise it to actually
            batch the work.
        device: Device the tokenized inputs are moved to (default ``"cuda"``).

    Returns:
        A CPU tensor with one row per document, in corpus order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``corpus`` is empty
            (``torch.cat`` cannot concatenate an empty list).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(corpus) == 0:
        raise ValueError("corpus is empty; there is nothing to embed")

    embeddings = []

    for start in tqdm(range(0, len(corpus), batch_size)):
        batch = list(corpus[start:start + batch_size])
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Move to the target device

        with torch.no_grad():
            with autocast():  # Mixed precision
                outputs = document_encoder(**inputs)
        # Keep only the hidden state at token position 0 and park it on the CPU
        # so GPU memory does not grow with the corpus size.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())

        torch.cuda.empty_cache()
    return torch.cat(embeddings, dim=0)

# Extract texts from Evidence objects
# NOTE(review): this reads the private `_text` attribute through `__dict__`;
# presumably Evidence offers a public accessor — confirm and prefer it.
corpus_texts = [evidence.__dict__['_text'] for evidence in corpus]

# Initialize tokenizer and document encoder
document_tokenizer = AutoTokenizer.from_pretrained(config_instance.document_encoder_path)
document_encoder = AutoModel.from_pretrained(config_instance.document_encoder_path, output_hidden_states=True).to("cuda")
# NOTE(review): the embedding function in this cell runs under torch.no_grad(),
# so gradient checkpointing presumably has no effect here — confirm it is needed.
document_encoder.gradient_checkpointing_enable()


# Compute Embeddings

In [None]:
# Compute embeddings
# Encodes the full corpus with the pretrained document encoder (one row per document).
document_embeddings = compute_document_embeddings_batched(corpus_texts, document_encoder, document_tokenizer)

# Save embeddings
# Written to the current working directory so the encoding pass need not be repeated.
torch.save(document_embeddings, "document_embeddings.pt")

In [None]:
import torch
# Reload previously computed embeddings from the attached Kaggle input dataset
# instead of recomputing them.
document_embeddings = torch.load("/kaggle/input/contriever/document_embeddings.pt")
# Convert embeddings to NumPy
embeddings_np = document_embeddings.cpu().numpy()

# Set up index for documents

In [None]:
import faiss

# Create a FAISS index
# IndexFlatIP is sized by the embedding width (second dimension of the matrix).
faiss_index = faiss.IndexFlatIP(document_embeddings.shape[1])
faiss_index.add(embeddings_np)  # Add embeddings to the index

# Calculate Query Embeddings

In [None]:
# Initialize the query encoder
query_encoder = AutoModel.from_pretrained("facebook/contriever").to('cuda')
query_texts = [q.__dict__['_text'] for q in queries]
# All queries are tokenized and moved to the GPU as a single padded batch.
query_inputs = document_tokenizer(query_texts, return_tensors="pt", padding=True, truncation=True).to('cuda')

with torch.no_grad():
    query_outputs = query_encoder(
        input_ids=query_inputs["input_ids"],
        attention_mask=query_inputs["attention_mask"]
    )

    # The hidden state at token position 0 is used as the query embedding.
    query_embeddings = query_outputs.last_hidden_state[:, 0, :]
# The top-3 hits per query are kept as "hard negatives".
# NOTE(review): nothing in this cell removes a query's own relevant documents
# from its hits — confirm they are filtered before being used as negatives.
_, hard_negatives = faiss_index.search(query_embeddings.cpu().numpy(), k=3)
# NOTE(review): no training has happened in this cell; the weights saved under
# "trained_query_encoder" are the checkpoint loaded above.
query_encoder.save_pretrained("trained_query_encoder")
document_tokenizer.save_pretrained("trained_query_encoder")
# The embeddings are stored as a NumPy array through torch.save.
torch.save(query_embeddings.cpu().numpy(), "query_embeddings.pt")

# Prepare training data

In [None]:
training_data = []

# Prepare training data
# Each sample pairs one query with one positive document and that query's hard
# negatives. NOTE(review): this relies on `qrels` iterating in the same order as
# `queries` (idx indexes both `hard_negatives` and `query_texts`) and on document
# ids being integer positions into `corpus` — confirm against the loader.
for idx, (query_id, positive_docs) in enumerate(qrels.items()):
    # Extract positive document indices
    positive_indices = list(map(int, positive_docs.keys()))
    positive_set = set(positive_indices)

    # Retrieved hits are only valid negatives when they are not relevant to the
    # query; FAISS also pads missing neighbours with -1, which must not be used
    # as a (negative) Python index into the corpus.
    negative_indices = [
        int(neg_idx)
        for neg_idx in hard_negatives[idx]
        if int(neg_idx) >= 0 and int(neg_idx) not in positive_set
    ]
    if not negative_indices:
        # Every retrieved hit was a positive: nothing to contrast against.
        continue

    # Convert hard negative indices to document texts
    negative_texts = [corpus[neg_idx].__dict__['_text'] for neg_idx in negative_indices]

    # Add training samples
    # NOTE: the number of negatives may now differ between samples, which is
    # fine for a DataLoader with batch_size=1 (as used for fine-tuning) but
    # would need a custom collate_fn for larger batches.
    for positive_idx in positive_indices:
        training_data.append({
            "query": query_texts[idx],
            "positive": corpus[positive_idx].__dict__['_text'],     # Positive document
            "negatives": negative_texts           # Hard negatives
        })


In [None]:
import torch

def lambda_loss(scores_positive, scores_negative, sigma=1.0):
    """
    Pairwise logistic ranking loss between each positive and its negatives.

    Args:
        scores_positive (torch.Tensor): Similarity scores for positive pairs,
            shaped (batch_size,) or (batch_size, 1).
        scores_negative (torch.Tensor): Similarity scores for negative pairs,
            shaped (batch_size, num_negatives).
        sigma (float): Scale applied to the score difference (default=1.0).

    Returns:
        torch.Tensor: Scalar mean of ``-log(sigmoid(sigma * (pos - neg)))``.
    """
    # Force a (batch_size, 1) column so each positive is compared only with the
    # negatives on its own row. With unsqueeze(-1), a (batch_size, 1) input
    # would broadcast to (batch, batch, num_negatives) and mix queries.
    pairwise_diff = scores_positive.reshape(-1, 1) - scores_negative  # Shape: (batch_size, num_negatives)

    # logsigmoid is numerically stable for large negative differences, where
    # log(sigmoid(x) + eps) saturates at log(eps) instead of growing linearly.
    loss = -torch.nn.functional.logsigmoid(sigma * pairwise_diff).mean()

    return loss

In [None]:
import torch
import torch.nn.functional as F

def contrastive_loss(pos_score, neg_scores, margin=1.0):
    """Margin (hinge) ranking loss between each positive and its negatives.

    Args:
        pos_score (torch.Tensor): Positive scores, shaped (batch_size,) or
            (batch_size, 1).
        neg_scores (torch.Tensor): Negative scores, shaped
            (batch_size, num_negatives).
        margin (float): Required gap between a positive and each negative.

    Returns:
        torch.Tensor: Scalar mean of ``relu(margin + neg - pos)`` over all
        (query, negative) pairs.
    """
    # reshape(-1, 1) accepts both (B,) and (B, 1) positives. unsqueeze(1) on a
    # (B, 1) tensor would broadcast to (B, B, K) and compare every positive
    # with the negatives of every other query.
    pos_column = pos_score.reshape(-1, 1)

    # Loss
    loss = F.relu(margin + neg_scores - pos_column).mean()
    return loss


# Fine tune query encoder

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW

subset_size = 1200  # Adjust the size as needed
subset_training_data = training_data[:subset_size]

# Training loop
epochs = 3
batch_size = 1

query_encoder_path = "trained_query_encoder"
query_encoder = AutoModel.from_pretrained(query_encoder_path).to('cuda')
# from_pretrained returns the model in evaluation mode; switch to training mode
# before fine-tuning. The tokenizer (`document_tokenizer`) is reused from the
# earlier cell.
query_encoder.train()

dataloader = DataLoader(subset_training_data, batch_size=batch_size, shuffle=True)

# Optimizer
optimizer = AdamW(query_encoder.parameters(), lr=5e-6)

for epoch in range(epochs):
    epoch_loss = 0.0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}", leave=True)

    for batch in progress_bar:
        # The default collate function transposes the per-sample negative
        # lists: batch["negatives"] has one entry per negative slot, each
        # holding that slot's text for every sample in the batch.
        negative_slots = batch["negatives"]
        num_negatives = len(negative_slots)
        flat_negatives = [neg for slot in negative_slots for neg in slot]

        queries_input = document_tokenizer(list(batch["query"]), return_tensors="pt", padding=True, truncation=True).to("cuda")
        positives = document_tokenizer(list(batch["positive"]), return_tensors="pt", padding=True, truncation=True).to("cuda")
        negatives_input = document_tokenizer(flat_negatives, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")

        # Encode queries, positives, and negatives with the encoder being tuned
        query_emb = query_encoder(**queries_input).last_hidden_state[:, 0, :]        # (B, D)
        positive_emb = query_encoder(**positives).last_hidden_state[:, 0, :]         # (B, D)
        negative_emb = query_encoder(**negatives_input).last_hidden_state[:, 0, :]   # (K * B, D), slot-major
        negative_emb = negative_emb.reshape(num_negatives, -1, negative_emb.size(-1)).transpose(0, 1)  # (B, K, D)

        # Score with the freshly encoded `query_emb`, not the precomputed
        # `query_embeddings` global. That tensor holds every query, was built
        # under no_grad, and would give the wrong shape and no gradient
        # through the query side.
        scores_positive = (query_emb * positive_emb).sum(dim=-1)                      # (B,)
        scores_negative = torch.einsum("bd,bkd->bk", query_emb, negative_emb)        # (B, K)

        loss = contrastive_loss(scores_positive, scores_negative)
        # Backward pass and optimization. A fresh graph is built every step, so
        # retain_graph is not needed (it only keeps activations alive).
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar and epoch loss
        progress_bar.set_postfix({"Loss": loss.item()})
        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1} completed with average loss: {epoch_loss / len(dataloader)}")

query_encoder.save_pretrained("fine_tuned_query_encoder")
document_tokenizer.save_pretrained("fine_tuned_query_encoder")

In [None]:
from transformers import AutoTokenizer, AutoModel
# Path to the saved query encoder
# Points at a Kaggle input dataset, i.e. a previously exported copy of the
# fine-tuned encoder rather than the directory written in the current session.
query_encoder_path = "/kaggle/input/dataset/fine_tuned_query_encoder"  # Replace with your directory path

# Load the model and tokenizer
# NOTE: this rebinds the globals `query_encoder` and `tokenizer` for the cells below.
query_encoder = AutoModel.from_pretrained(query_encoder_path).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(query_encoder_path)

In [None]:
# Re-embed every query with the fine-tuned encoder loaded above.
query_texts = [q.__dict__['_text'] for q in queries]
# All queries are tokenized and moved to the GPU as a single padded batch.
query_inputs = tokenizer(query_texts, return_tensors="pt", padding=True, truncation=True).to("cuda")
with torch.no_grad():
    # Position-0 hidden state per query, moved to the CPU as a NumPy array.
    query_embeddings = query_encoder(**query_inputs).last_hidden_state[:, 0, :].cpu().numpy()

# Save the embeddings for reuse
# NOTE: `query_embeddings` is a NumPy array at this point; torch.save stores it as-is.
torch.save(query_embeddings, "query_embeddings.pt")

In [None]:
from tqdm import tqdm
from dexter.data.datastructures.evidence import Evidence
from transformers import AutoTokenizer, AutoModel
from torch.cuda.amp import autocast
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

def compute_document_embeddings_batched(corpus, document_encoder, tokenizer, batch_size=1, device="cuda"):
    """Encode every text in ``corpus`` and stack the position-0 hidden states.

    Args:
        corpus: Sequence of document strings; must support ``len`` and slicing.
        document_encoder: Model whose output exposes ``last_hidden_state``.
            It must already live on ``device``.
        tokenizer: Callable accepting a list of strings plus the
            ``return_tensors``/``padding``/``truncation`` keyword arguments and
            returning a mapping of tensors.
        batch_size: Documents per forward pass. The default of 1 keeps the
            original one-document-at-a-time behaviour; raise it to actually
            batch the work.
        device: Device the tokenized inputs are moved to (default ``"cuda"``).

    Returns:
        A CPU tensor with one row per document, in corpus order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``corpus`` is empty
            (``torch.cat`` cannot concatenate an empty list).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(corpus) == 0:
        raise ValueError("corpus is empty; there is nothing to embed")

    embeddings = []

    for start in tqdm(range(0, len(corpus), batch_size)):
        batch = list(corpus[start:start + batch_size])
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Move to the target device

        with torch.no_grad():
            with autocast():  # Mixed precision
                outputs = document_encoder(**inputs)
        # Keep only the hidden state at token position 0 and park it on the CPU
        # so GPU memory does not grow with the corpus size.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())

        torch.cuda.empty_cache()
    return torch.cat(embeddings, dim=0)

# Extract texts from Evidence objects
corpus_texts = [evidence.__dict__['_text'] for evidence in corpus]

# Initialize tokenizer and document encoder
# The fine-tuned query encoder and its tokenizer are reused for the document
# side, so queries and documents are embedded by the same model.
document_tokenizer = tokenizer
document_encoder = query_encoder
# NOTE(review): the embedding function in this cell runs under torch.no_grad(),
# so gradient checkpointing presumably has no effect here — confirm it is needed.
document_encoder.gradient_checkpointing_enable()


In [None]:
# Compute embeddings
# Re-encodes the whole corpus with the fine-tuned encoder.
document_embeddings = compute_document_embeddings_batched(corpus_texts, document_encoder, document_tokenizer)

# Save embeddings
torch.save(document_embeddings, "document_embeddings.pt")

# FAISS works on C-contiguous float32 NumPy arrays, so convert the torch tensor
# explicitly instead of handing it to `add` directly.
document_embeddings_np = np.ascontiguousarray(document_embeddings.cpu().numpy(), dtype="float32")
faiss_index = faiss.IndexFlatIP(document_embeddings_np.shape[1])
faiss_index.add(document_embeddings_np)

# Persist the index so it can be reloaded with faiss.read_index without
# re-encoding the corpus.
faiss.write_index(faiss_index, "document_index.faiss")

In [None]:
# Path to the saved FAISS index
# NOTE(review): this loads an index from the Kaggle input dataset, replacing any
# `faiss_index` built earlier in the session — confirm it was built from the
# same (fine-tuned) document embeddings as the queries being searched.
faiss_index_path = "/kaggle/input/dataset/document_index.faiss"

# Load the index
faiss_index = faiss.read_index(faiss_index_path)
# Retrieve the top-3 document positions for every query embedding.
_, indices = faiss_index.search(query_embeddings, k=3)

# Run The LLM

In [None]:
import json
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch
from torch.cuda.amp import autocast

# Load JSON files
def load_json(file_path):
    """Read ``file_path`` and return the parsed JSON document.

    The file is opened explicitly as UTF-8 so non-ASCII text (names, titles)
    decodes the same way regardless of the platform's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Prepare input for LLM
def prepare_input(query, contexts):
    """Build the few-shot prompt sent to the LLM for one question.

    The prompt consists of six fixed worked examples in the
    ``[Question] / [Answer] / [Final Answer]`` format, followed by the
    retrieved evidence and the actual question.

    Args:
        query: The question text appended after ``Question:``.
        contexts: Iterable of evidence strings; they are joined with a single
            space and inserted after ``Evidence:``.

    Returns:
        The complete prompt string.
    """
    # NOTE: the literal below is runtime text; its wording and line breaks are
    # part of the prompt and are intentionally left untouched.
    user_prompt = f"""[Question]: When does monsoon season end in the state the area code 575 is located?
[Answer]: The area code 575 is located in New Mexico. Monsoon season in New Mexico typically ends in mid-September. So the
[Final Answer]: mid-September.
[Question]: What is the current official currency in the country where Ineabelle Diaz is a citizen?
[Answer]: Ineabelle Diaz is from Peurto Rico, which is in the United States of America. The current official currency in the United
States is the United States dollar. 
[Final Answer]: United States dollar.
[Question]: Where was the person who founded the American Institute of Public Opinion in 1935 born?
[Answer]: The person who founded the American Institute of Public Opinion in 1935 is George Gallup. George Gallup was born
in Jefferson, Iowa. 
[Final Answer]: Jefferson.
[Question]: What language is used by the director of Tiffany Memorandum?
[Answer]: The director of Tiffany Memorandum is Sergio Grieco. Sergio Grieco speaks Italian.
[Final Answer]: Italian.
[Question]: What is the sports team the person played for who scored the first touchdown in Superbowl 1?
[Answer]: The player that scored the first touchdown in Superbowl 1 is Max McGee. Max McGee played for the Green Bay
Packers.
[Final Answer]: Green Bay Packers.
[Question]: The birth country of Jayantha Ketagoda left the British Empire when?
[Answer]: The birth country of Jayantha Ketagoda is Sri Lanka. Sri Lanka left the British Empire on February 4, 1948. So the
[Final Answer]: February 4, 1948.\n\n Follow the above example and Given the evidence, Evidence: {' '.join(contexts)} \n use the information, think step by step and output the final answer extremely concisely in the form [Final Answer]: for the question, Question:{query}"""
    return user_prompt

# Generate response using LLM
def generate_answer(input_text, tokenizer, model):
    """Generate the model's answer for ``input_text`` and return it as text.

    Args:
        input_text: Prompt string.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Generative model exposing ``generate``; ``model.config`` is
            consulted (when present) to tell encoder-decoder from decoder-only.

    Returns:
        The decoded generated text, or the sentinel ``"not possible"`` when
        generation raises (the error is printed, evaluation continues).
    """
    # NOTE(review): the prompt is truncated to 512 tokens; with the tokenizer's
    # default truncation side, long evidence may cut off the trailing question
    # — confirm the limit is adequate.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to('cuda')
    inputs.pop("token_type_ids", None)
    try:
        outputs = model.generate(**inputs)
        generated = outputs[0]
        # Decoder-only models return prompt + continuation. Strip the prompt so
        # the few-shot examples and retrieved evidence are not scored as part
        # of the prediction. Encoder-decoder outputs hold only the answer.
        config = getattr(model, "config", None)
        if not getattr(config, "is_encoder_decoder", False):
            generated = generated[inputs["input_ids"].shape[-1]:]
        # Decode the output tokens to text
        return tokenizer.decode(generated, skip_special_tokens=True)
    except Exception as e:
        # Deliberate best-effort: report the failure and return a default
        # response so one bad example does not abort the whole evaluation.
        print(f"Generation error: {e}")
        return "not possible"

# F1 Score Calculation
def f1_score_metric(prediction, ground_truth):
    """Return ``(f1, precision, recall)`` over unique lower-cased tokens.

    Both strings are lower-cased and split on whitespace; duplicates are
    collapsed because the comparison works on token sets. Any ratio whose
    denominator is zero is reported as 0.
    """
    predicted = set(prediction.lower().split())
    expected = set(ground_truth.lower().split())
    shared = len(predicted & expected)

    precision = shared / len(predicted) if predicted else 0
    recall = shared / len(expected) if expected else 0

    combined = precision + recall
    if combined > 0:
        f1 = 2 * (precision * recall) / combined
    else:
        f1 = 0
    return f1, precision, recall

# BLEU Score Calculation
def bleu_score_metric(prediction, ground_truth):
    """Return the sentence-level BLEU of ``prediction`` against ``ground_truth``.

    Both strings are lower-cased and whitespace-tokenized; the ground truth is
    the single reference passed to ``sentence_bleu``.
    """
    hypothesis = prediction.lower().split()
    references = [ground_truth.lower().split()]
    return sentence_bleu(references, hypothesis)

# ROUGE Score Calculation
def rouge_score_metric(prediction, ground_truth):
    """Score ``prediction`` against ``ground_truth`` with ROUGE-1/2/L (stemmed).

    Returns whatever ``RougeScorer.score`` produces, keyed by 'rouge1',
    'rouge2' and 'rougeL'.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(ground_truth, prediction)

# Process dataset and evaluate responses
# Splits a model response on the "[Final Answer]:" marker (brackets optional).
_FINAL_ANSWER_MARKER = re.compile(r'\[?Final Answer\]?:')


def process_and_evaluate(dataset, tokenizer, model, contexts_per_query=None, max_examples=400,
                         output_path="llama2_wqa_rag_5_few_shot_metrics.tsv"):
    """Run RAG question answering over ``dataset`` and report answer metrics.

    For each entry the retrieved contexts are turned into a prompt, the model's
    response is generated, and the text after the last "[Final Answer]:" marker
    (or the whole response when there is no marker) is scored against the
    ground truth.

    Args:
        dataset: Iterable of dicts with the keys "_id", "question" and "answer".
        tokenizer: Tokenizer forwarded to ``generate_answer``.
        model: Model forwarded to ``generate_answer``.
        contexts_per_query: Sequence where item ``i`` holds the evidence strings
            for the ``i``-th dataset entry. Defaults to the module-level
            ``retrieved_contexts``. NOTE(review): this pairing assumes both are
            in the same order — confirm against the loader.
        max_examples: Maximum number of entries to evaluate (``None`` = all).
        output_path: TSV file receiving the questions and extracted answers.

    Returns:
        Tuple ``(results, accuracy, average_f1, average_precision,
        average_recall, average_bleu, average_rouge1, average_rouge2,
        average_rougeL)``. Averages are taken over the entries actually
        evaluated and are 0 when none were.
    """
    if contexts_per_query is None:
        contexts_per_query = retrieved_contexts

    results = {}
    question_df = {"questions": [], "answers": []}
    total_em = 0
    total_f1 = 0
    total_precision = 0
    total_recall = 0
    total_bleu = 0
    total_rouge = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    processed = 0

    for position, entry in enumerate(dataset):
        if max_examples is not None and position >= max_examples:
            break

        query_id = entry["_id"]
        query = entry["question"]
        ground_truth = entry["answer"]

        # Entry i uses contexts i (indexing by an already-incremented counter
        # would pair every question with its successor's evidence).
        contexts = contexts_per_query[position]
        input_text = prepare_input(query, contexts)

        # Generate response
        prediction = generate_answer(input_text, tokenizer, model)

        # Split once; without a marker the single segment is the full response.
        segments = _FINAL_ANSWER_MARKER.split(prediction)
        has_marker = len(segments) > 1
        answer = segments[-1]

        # Exact Match: counted only when a final answer was produced, the
        # response is not a refusal, and the ground truth occurs in the
        # extracted answer (not merely somewhere in the reasoning text).
        lowered_prediction = prediction.lower()
        refused = "not possible" in lowered_prediction or "unknown" in lowered_prediction
        em = 1 if (has_marker and not refused and ground_truth.lower() in answer.lower()) else 0
        total_em += em

        # Token-overlap F1 / precision / recall
        f1, precision, recall = f1_score_metric(answer, ground_truth)
        total_f1 += f1
        total_precision += precision
        total_recall += recall

        # BLEU
        total_bleu += bleu_score_metric(answer, ground_truth)

        # ROUGE F-measures
        rouge = rouge_score_metric(answer, ground_truth)
        for rouge_name in total_rouge:
            total_rouge[rouge_name] += rouge[rouge_name].fmeasure

        # Store results
        results[query_id] = {"prediction": prediction, "ground_truth": ground_truth, "exact_match": em}
        question_df["answers"].append(answer)
        question_df["questions"].append(query)
        processed += 1

    # Average over the examples actually evaluated, not a fixed constant.
    denominator = processed if processed else 1
    accuracy = total_em / denominator
    average_f1 = total_f1 / denominator
    average_precision = total_precision / denominator
    average_recall = total_recall / denominator
    average_bleu = total_bleu / denominator
    average_rouge1 = total_rouge['rouge1'] / denominator
    average_rouge2 = total_rouge['rouge2'] / denominator
    average_rougeL = total_rouge['rougeL'] / denominator

    final_questions = pd.DataFrame(question_df)

    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average Precision: {average_precision:.2f}")
    print(f"Average Recall: {average_recall:.2f}")
    print(f"Average BLEU Score: {average_bleu:.2f}")
    print(f"Average ROUGE-1 F-Score: {average_rouge1:.2f}")
    print(f"Average ROUGE-2 F-Score: {average_rouge2:.2f}")
    print(f"Average ROUGE-L F-Score: {average_rougeL:.2f}")

    final_questions.to_csv(output_path, sep="\t", index=False)

    return results, accuracy, average_f1, average_precision, average_recall, average_bleu, average_rouge1, average_rouge2, average_rougeL

In [None]:
from dotenv import load_dotenv
from huggingface_hub import login

# Read environment variables from a local .env file, if one exists.
load_dotenv()

# The Hugging Face access token is expected under the `huggingface_token` variable.
auth_token = os.getenv("huggingface_token")

if not auth_token:
    raise ValueError("Authentication token not found. Please set huggingface_token in your .env file.")

login(auth_token)

# File paths
dataset_file_path = "/kaggle/input/contriever/dev.json"
document_file_path = "/kaggle/input/contriever/wiki_musique_corpus.json"

# Load data
# contriever_data = relevant
dataset = load_json(dataset_file_path)
# NOTE(review): `document_data` is not referenced by any later visible cell.
document_data = load_json(document_file_path)

model_name = "meta-llama/Llama-2-7b-chat-hf"

# Initialize LLM and tokenizer
# NOTE: this rebinds the global `tokenizer`, which previously held the
# retriever's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


In [None]:
def map_indices_to_texts(indices, corpus):
    """Translate retrieved corpus positions into document texts.

    Args:
        indices: Iterable of per-query index sequences (e.g. the second value
            returned by a FAISS ``search`` call).
        corpus: Sequence of objects carrying their text in the ``_text``
            attribute.

    Returns:
        One list of texts per query, in retrieval order. Positions outside
        ``[0, len(corpus))`` are dropped; this includes the ``-1`` that FAISS
        uses for missing neighbours, which would otherwise silently select the
        last corpus entry.
    """
    corpus_size = len(corpus)
    return [
        [corpus[int(idx)].__dict__["_text"] for idx in doc_indices if 0 <= int(idx) < corpus_size]
        for doc_indices in indices
    ]
# Evidence texts per query, in the row order of `indices`; read as a global by
# the evaluation function.
retrieved_contexts = map_indices_to_texts(indices, corpus)

In [None]:
# Persist the retrieved evidence (a list of lists of strings) to the working
# directory as JSON.
output_path = "retrieved_contexts.json"
with open(output_path, "w") as f:
    json.dump(retrieved_contexts, f)

In [None]:
# Run the evaluation and unpack its nine return values.
(
    results,
    accuracy,
    average_f1,
    average_precision,
    average_recall,
    average_bleu,
    average_rouge1,
    average_rouge2,
    average_rougeL,
) = process_and_evaluate(dataset, tokenizer, model)

# print(f"Results for ratio {ratio}, N={N}")
# Format every metric once; the same lines go to stdout and to the metrics file.
report_lines = [
    f"Overall Exact Match Accuracy: {accuracy * 100:.2f}%",
    f"Average F1 Score: {average_f1:.2f}",
    f"Average Precision: {average_precision:.2f}",
    f"Average Recall: {average_recall:.2f}",
    f"Average BLEU Score: {average_bleu:.2f}",
    f"Average ROUGE-1 F-Score: {average_rouge1:.2f}",
    f"Average ROUGE-2 F-Score: {average_rouge2:.2f}",
    f"Average ROUGE-L F-Score: {average_rougeL:.2f}",
]

# Print evaluation results
for line in report_lines:
    print(line)

with open("/kaggle/working/accuracy_metrics.txt", "w") as f:
    for line in report_lines:
        f.write(f"{line}\n")