# Task 1: Evaluate LLMs on retrieved context documents

## 1.1 Extract contexts for each query using an off-the-shelf retriever (Contriever)

In [None]:
from dexter.retriever.dense.Contriever import Contriever
from dexter.config.constants import Split
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity as CosScore
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
import json


# Hyperparameters for the dense retriever: the same "facebook/contriever"
# checkpoint is used for both the query and the document encoder.
config_instance = DenseHyperParams(query_encoder_path="facebook/contriever",
                                 document_encoder_path="facebook/contriever"
                                 ,batch_size=32)
# NOTE(review): `config` is not referenced again in the visible code — confirm
# whether it is still needed.
config = config_instance.get_all_params()

# Load the DEV split of the "wikimultihopqa" dataset and its corpus, using the
# settings in config/config.ini.
loader = RetrieverDataset("wikimultihopqa","wikimultihopqa-corpus","config/config.ini",Split.DEV)
queries, qrels, corpus = loader.qrels()
print(type(corpus))
print("queries",len(queries),len(qrels),len(corpus),queries[0])
# NOTE: despite its name, `tasb_search` holds the Contriever retriever instance.
tasb_search = Contriever(config_instance)

In [None]:
import os

# Values passed to the retriever as `k`; each run is saved under the label k + 1.
# NOTE(review): presumably the retriever returns k + 1 documents per query,
# which is why the file name uses k + 1 — confirm against the retriever.
k_values=[0,2,4]

similarity_measure = CosScore()

# open(..., "w") does not create missing directories, so make sure the output
# folder exists before the first result file is written.
os.makedirs("retrieval_results", exist_ok=True)

for k in k_values:
    response = tasb_search.retrieve(corpus,queries,k,similarity_measure)
    print("indices",len(response))
    metrics = RetrievalMetrics()
    print(metrics.evaluate_retrieval(qrels=qrels,results=response))

    cur = k + 1

    # Save the retrieval results to a json file
    with open(f"retrieval_results/retrieval_results_top_{cur}.json","w") as f:
        json.dump(response,f)

## 1.2 Feed the retrieval results to the LLM and evaluate the accuracy using Exact Match (and metrics like ROUGE and BLEU)

### Zero-Shot Prompts Evaluation

In [None]:
import json
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch
import os
from dotenv import load_dotenv

# Load JSON files
def load_json(file_path):
    """Parse the JSON file at ``file_path`` and return the resulting object.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale default.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Map document IDs to their text
def get_document_texts(doc_ids, document_data):
    """Return the "text" field of every listed document present in ``document_data``.

    Ids are converted to strings before the lookup; ids without a matching
    entry are skipped. The order of ``doc_ids`` is preserved.
    """
    texts = []
    for raw_id in doc_ids:
        key = str(raw_id)
        if key in document_data:
            texts.append(document_data[key]["text"])
    return texts

# Retrieve top-k contexts
def retrieve_contexts(query_id, contriever_data, document_data):
    """Return the texts of the documents retrieved for ``query_id``.

    The document ids are the keys of ``contriever_data[query_id]``; a query
    with no retrieval entry yields an empty list.
    """
    retrieved = contriever_data.get(query_id, {})
    return get_document_texts(list(retrieved.keys()), document_data)

# Prepare input for LLM
def prepare_input(query, contexts):
    """Build the zero-shot prompt: space-joined evidence followed by the question."""
    evidence = ' '.join(contexts)
    return (
        f"Given the evidence, Evidence: {evidence} \n"
        " use the information, think step by step and output the final answer"
        " extremely concisely in the form [Final Answer]: for the question,"
        f" Question:{query}"
    )

# Generate response using LLM
def generate_answer(input_text, tokenizer, model):
    """Generate the model's continuation for ``input_text`` and return it as text.

    Args:
        input_text: Prompt string to feed to the model.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Generation model exposing ``generate``, ``device`` and ``config``.

    Returns:
        The decoded output with special tokens removed. For decoder-only
        models the prompt tokens are dropped, so only newly generated text is
        returned.
    """
    # Put the inputs on the model's own device rather than assuming CUDA.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(**inputs)
    generated = outputs[0]
    # Decoder-only models return prompt + continuation. Without this slice the
    # evidence and the instruction's own "[Final Answer]:" marker would end up
    # in the returned "answer" and distort every downstream metric.
    if not getattr(model.config, "is_encoder_decoder", False):
        generated = generated[inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)

# F1 Score Calculation
def f1_score_metric(prediction, ground_truth):
    """Return the F1 overlap between the unique lowercase tokens of both strings.

    Tokens are whitespace-separated and de-duplicated, so repeated words count
    once. Returns 0 when the two token sets share nothing.
    """
    predicted = set(prediction.lower().split())
    expected = set(ground_truth.lower().split())
    shared = predicted & expected
    if not shared:
        # No overlap (this also covers either side being empty).
        return 0
    precision = len(shared) / len(predicted)
    recall = len(shared) / len(expected)
    return 2 * (precision * recall) / (precision + recall)

# BLEU Score Calculation
def bleu_score_metric(prediction, ground_truth):
    """Return the sentence-level BLEU of ``prediction`` against one reference.

    Both strings are lowercased and split on whitespace before scoring.
    """
    hypothesis = prediction.lower().split()
    references = [ground_truth.lower().split()]
    return sentence_bleu(references, hypothesis)

# ROUGE Score Calculation
def rouge_score_metric(prediction, ground_truth):
    """Score ``prediction`` against ``ground_truth`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A stemming scorer is built on each call; the mapping it returns is passed
    back unchanged.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(ground_truth, prediction)

# Process dataset and evaluate responses
def process_and_evaluate(contriever_data, dataset, document_data, tokenizer, model, max_samples=1200):
    """Answer dataset questions with retrieved contexts and score the answers.

    For each entry (up to ``max_samples``) the retrieved documents are turned
    into a prompt, a prediction is generated, and the text after the last
    "[Final Answer]:" marker is scored. The extracted answers are also written
    to "llama2_wqa_rag_3_zero_shot_metrics.tsv".

    Args:
        contriever_data: Mapping of query id to retrieved documents; only the
            keys (document ids) of each entry are used.
        dataset: Iterable of entries with "_id", "question" and "answer".
        document_data: Mapping of document id to a record with a "text" field.
        tokenizer: Tokenizer forwarded to ``generate_answer``.
        model: Language model forwarded to ``generate_answer``.
        max_samples: Maximum number of entries to evaluate; ``None`` evaluates
            the whole dataset.

    Returns:
        ``(results, accuracy, average_f1, average_bleu, average_rouge1,
        average_rouge2, average_rougeL)``, where ``results`` maps each query id
        to its prediction, ground truth and exact-match flag. Averages are
        taken over the number of entries actually evaluated.
    """
    # Compiled once; the original re-ran the same split six times per entry.
    answer_marker = re.compile(r'\[?Final Answer\]?:')

    results = {}
    question_df = {"questions": [], "answers": []}
    total_em = 0
    total_f1 = 0
    total_bleu = 0
    total_rouge = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    count = 0

    for entry in dataset:
        if count == max_samples:
            break
        count += 1
        query_id = entry["_id"]
        query = entry["question"]
        ground_truth = entry["answer"]

        # Retrieve contexts and prepare input
        contexts = retrieve_contexts(query_id, contriever_data, document_data)
        input_text = prepare_input(query, contexts)

        # Generate response
        prediction = generate_answer(input_text, tokenizer, model)

        # Text after the last marker; the whole prediction if no marker exists.
        segments = answer_marker.split(prediction)
        has_marker = len(segments) > 1
        answer = segments[-1]

        # Exact match is a case-insensitive containment test. It is applied to
        # the extracted answer, not the whole prediction, so text before the
        # marker cannot produce a spurious hit.
        lowered_prediction = prediction.lower()
        refused = "not possible" in lowered_prediction or "unknown" in lowered_prediction
        if has_marker and not refused and ground_truth.lower() in answer.lower():
            em = 1
        else:
            em = 0
        total_em += em

        # Token F1, BLEU and ROUGE are all computed on the extracted answer.
        total_f1 += f1_score_metric(answer, ground_truth)
        total_bleu += bleu_score_metric(answer, ground_truth)
        rouge = rouge_score_metric(answer, ground_truth)
        for rouge_name in total_rouge:
            total_rouge[rouge_name] += rouge[rouge_name].fmeasure

        # Store results
        results[query_id] = {"prediction": prediction, "ground_truth": ground_truth, "exact_match": em}
        question_df["answers"].append(answer)
        question_df["questions"].append(query)

    # Average over the entries actually evaluated, not a fixed 1200; the
    # fallback of 1 avoids a division by zero on an empty dataset.
    denominator = count if count else 1
    accuracy = total_em / denominator
    average_f1 = total_f1 / denominator
    average_bleu = total_bleu / denominator
    average_rouge1 = total_rouge['rouge1'] / denominator
    average_rouge2 = total_rouge['rouge2'] / denominator
    average_rougeL = total_rouge['rougeL'] / denominator

    final_questions = pd.DataFrame(question_df)

    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average BLEU Score: {average_bleu:.2f}")
    print(f"Average ROUGE-1 F-Score: {average_rouge1:.2f}")
    print(f"Average ROUGE-2 F-Score: {average_rouge2:.2f}")
    print(f"Average ROUGE-L F-Score: {average_rougeL:.2f}")

    final_questions.to_csv("llama2_wqa_rag_3_zero_shot_metrics.tsv", sep="\t", index=False)

    return results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL

from huggingface_hub import login

load_dotenv()

# Hugging Face access token, read from the `huggingface_token` environment variable.
auth_token = os.getenv("huggingface_token")

if not auth_token:
    # The message names the variable that is actually read above.
    raise ValueError("Authentication token not found. Please set huggingface_token in your .env file.")

login(auth_token)

# File paths
contriever_file_path = "/kaggle/input/contriever-results-correct/retrieval_results_top_3.json"
dataset_file_path = "/kaggle/input/dataset/dev.json"
document_file_path = "/kaggle/input/corpus/wiki_musique_corpus.json"

# Load data
contriever_data = load_json(contriever_file_path)
dataset = load_json(dataset_file_path)
document_data = load_json(document_file_path)

model_name = "meta-llama/Llama-2-7b-chat-hf"

# Initialize LLM and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Process dataset and evaluate
results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL = process_and_evaluate(
    contriever_data, dataset, document_data, tokenizer, model)

# Build the report once so the console output and the saved file cannot drift apart.
metric_lines = [
    f"Overall Exact Match Accuracy: {accuracy * 100:.2f}%",
    f"Average F1 Score: {average_f1:.2f}",
    f"Average BLEU Score: {average_bleu:.2f}",
    f"Average ROUGE-1 F-Score: {average_rouge1:.2f}",
    f"Average ROUGE-2 F-Score: {average_rouge2:.2f}",
    f"Average ROUGE-L F-Score: {average_rougeL:.2f}",
]

# Print evaluation results
for metric_line in metric_lines:
    print(metric_line)

# NOTE(review): every evaluation cell in this file writes to this same path, so
# a later run overwrites this report — confirm whether that is intended.
with open("/kaggle/working/accuracy_metrics.txt", "w") as f:
    f.writelines(f"{metric_line}\n" for metric_line in metric_lines)


### Few-Shot Prompts Evaluation

In [None]:
import json
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch

# Load JSON files
def load_json(file_path):
    """Parse the JSON file at ``file_path`` and return the resulting object.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale default.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Map document IDs to their text
def get_document_texts(doc_ids, document_data):
    """Return the "text" field of every listed document present in ``document_data``.

    Ids are converted to strings before the lookup; ids without a matching
    entry are skipped. The order of ``doc_ids`` is preserved.
    """
    texts = []
    for raw_id in doc_ids:
        key = str(raw_id)
        if key in document_data:
            texts.append(document_data[key]["text"])
    return texts

# Retrieve top-k contexts
def retrieve_contexts(query_id, contriever_data, document_data):
    """Return the texts of the documents retrieved for ``query_id``.

    The document ids are the keys of ``contriever_data[query_id]``; a query
    with no retrieval entry yields an empty list.
    """
    retrieved = contriever_data.get(query_id, {})
    return get_document_texts(list(retrieved.keys()), document_data)

# Prepare input for LLM
def prepare_input(query, contexts):
    """Build the few-shot prompt: six worked examples, then evidence and question.

    The example block is kept line for line (including the trailing spaces on
    two of its lines); the evidence texts are joined with single spaces.
    """
    few_shot_lines = [
        "[Question]: When does monsoon season end in the state the area code 575 is located?",
        "[Answer]: The area code 575 is located in New Mexico. Monsoon season in New Mexico typically ends in mid-September. So the",
        "[Final Answer]: mid-September.",
        "[Question]: What is the current official currency in the country where Ineabelle Diaz is a citizen?",
        "[Answer]: Ineabelle Diaz is from Peurto Rico, which is in the United States of America. The current official currency in the United",
        "States is the United States dollar. ",
        "[Final Answer]: United States dollar.",
        "[Question]: Where was the person who founded the American Institute of Public Opinion in 1935 born?",
        "[Answer]: The person who founded the American Institute of Public Opinion in 1935 is George Gallup. George Gallup was born",
        "in Jefferson, Iowa. ",
        "[Final Answer]: Jefferson.",
        "[Question]: What language is used by the director of Tiffany Memorandum?",
        "[Answer]: The director of Tiffany Memorandum is Sergio Grieco. Sergio Grieco speaks Italian.",
        "[Final Answer]: Italian.",
        "[Question]: What is the sports team the person played for who scored the first touchdown in Superbowl 1?",
        "[Answer]: The player that scored the first touchdown in Superbowl 1 is Max McGee. Max McGee played for the Green Bay",
        "Packers.",
        "[Final Answer]: Green Bay Packers.",
        "[Question]: The birth country of Jayantha Ketagoda left the British Empire when?",
        "[Answer]: The birth country of Jayantha Ketagoda is Sri Lanka. Sri Lanka left the British Empire on February 4, 1948. So the",
        "[Final Answer]: February 4, 1948.",
    ]
    examples = "\n".join(few_shot_lines)
    evidence = ' '.join(contexts)
    instruction = (
        f"\n\n Follow the above example and Given the evidence, Evidence: {evidence} \n"
        " use the information, think step by step and output the final answer"
        " extremely concisely in the form [Final Answer]: for the question,"
        f" Question:{query}"
    )
    return examples + instruction

# Generate response using LLM
def generate_answer(input_text, tokenizer, model):
    """Generate the model's continuation for ``input_text`` and return it as text.

    Args:
        input_text: Prompt string to feed to the model.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Generation model exposing ``generate``, ``device`` and ``config``.

    Returns:
        The decoded output with special tokens removed. For decoder-only
        models the prompt tokens are dropped, so only newly generated text is
        returned.
    """
    # Put the inputs on the model's own device rather than assuming CUDA.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(**inputs)
    generated = outputs[0]
    # Decoder-only models return prompt + continuation. Without this slice the
    # few-shot examples, the evidence and their "[Final Answer]:" markers would
    # end up in the returned "answer" and distort every downstream metric.
    if not getattr(model.config, "is_encoder_decoder", False):
        generated = generated[inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)

# F1 Score Calculation
def f1_score_metric(prediction, ground_truth):
    """Return the F1 overlap between the unique lowercase tokens of both strings.

    Tokens are whitespace-separated and de-duplicated, so repeated words count
    once. Returns 0 when the two token sets share nothing.
    """
    predicted = set(prediction.lower().split())
    expected = set(ground_truth.lower().split())
    shared = predicted & expected
    if not shared:
        # No overlap (this also covers either side being empty).
        return 0
    precision = len(shared) / len(predicted)
    recall = len(shared) / len(expected)
    return 2 * (precision * recall) / (precision + recall)

# BLEU Score Calculation
def bleu_score_metric(prediction, ground_truth):
    """Return the sentence-level BLEU of ``prediction`` against one reference.

    Both strings are lowercased and split on whitespace before scoring.
    """
    hypothesis = prediction.lower().split()
    references = [ground_truth.lower().split()]
    return sentence_bleu(references, hypothesis)

# ROUGE Score Calculation
def rouge_score_metric(prediction, ground_truth):
    """Score ``prediction`` against ``ground_truth`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A stemming scorer is built on each call; the mapping it returns is passed
    back unchanged.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(ground_truth, prediction)

# Process dataset and evaluate responses
def process_and_evaluate(contriever_data, dataset, document_data, tokenizer, model, max_samples=1200):
    """Answer dataset questions with retrieved contexts and score the answers.

    For each entry (up to ``max_samples``) the retrieved documents are turned
    into a few-shot prompt, a prediction is generated, and the text after the
    last "[Final Answer]:" marker is scored. The extracted answers are also
    written to "llama2_wqa_rag_5_few_shot_metrics.tsv".

    Args:
        contriever_data: Mapping of query id to retrieved documents; only the
            keys (document ids) of each entry are used.
        dataset: Iterable of entries with "_id", "question" and "answer".
        document_data: Mapping of document id to a record with a "text" field.
        tokenizer: Tokenizer forwarded to ``generate_answer``.
        model: Language model forwarded to ``generate_answer``.
        max_samples: Maximum number of entries to evaluate; ``None`` evaluates
            the whole dataset.

    Returns:
        ``(results, accuracy, average_f1, average_bleu, average_rouge1,
        average_rouge2, average_rougeL)``, where ``results`` maps each query id
        to its prediction, ground truth and exact-match flag. Averages are
        taken over the number of entries actually evaluated.
    """
    # Compiled once; the original re-ran the same split six times per entry.
    answer_marker = re.compile(r'\[?Final Answer\]?:')

    results = {}
    question_df = {"questions": [], "answers": []}
    total_em = 0
    total_f1 = 0
    total_bleu = 0
    total_rouge = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    count = 0

    for entry in dataset:
        if count == max_samples:
            break
        count += 1
        query_id = entry["_id"]
        query = entry["question"]
        ground_truth = entry["answer"]

        # Retrieve contexts and prepare input
        contexts = retrieve_contexts(query_id, contriever_data, document_data)
        input_text = prepare_input(query, contexts)

        # Generate response
        prediction = generate_answer(input_text, tokenizer, model)

        # Text after the last marker; the whole prediction if no marker exists.
        segments = answer_marker.split(prediction)
        has_marker = len(segments) > 1
        answer = segments[-1]

        # Exact match is a case-insensitive containment test. It is applied to
        # the extracted answer, not the whole prediction, so text before the
        # marker cannot produce a spurious hit.
        lowered_prediction = prediction.lower()
        refused = "not possible" in lowered_prediction or "unknown" in lowered_prediction
        if has_marker and not refused and ground_truth.lower() in answer.lower():
            em = 1
        else:
            em = 0
        total_em += em

        # Token F1, BLEU and ROUGE are all computed on the extracted answer.
        total_f1 += f1_score_metric(answer, ground_truth)
        total_bleu += bleu_score_metric(answer, ground_truth)
        rouge = rouge_score_metric(answer, ground_truth)
        for rouge_name in total_rouge:
            total_rouge[rouge_name] += rouge[rouge_name].fmeasure

        # Store results
        results[query_id] = {"prediction": prediction, "ground_truth": ground_truth, "exact_match": em}
        question_df["answers"].append(answer)
        question_df["questions"].append(query)

    # Average over the entries actually evaluated, not a fixed 1200; the
    # fallback of 1 avoids a division by zero on an empty dataset.
    denominator = count if count else 1
    accuracy = total_em / denominator
    average_f1 = total_f1 / denominator
    average_bleu = total_bleu / denominator
    average_rouge1 = total_rouge['rouge1'] / denominator
    average_rouge2 = total_rouge['rouge2'] / denominator
    average_rougeL = total_rouge['rougeL'] / denominator

    final_questions = pd.DataFrame(question_df)

    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average BLEU Score: {average_bleu:.2f}")
    print(f"Average ROUGE-1 F-Score: {average_rouge1:.2f}")
    print(f"Average ROUGE-2 F-Score: {average_rouge2:.2f}")
    print(f"Average ROUGE-L F-Score: {average_rougeL:.2f}")

    final_questions.to_csv("llama2_wqa_rag_5_few_shot_metrics.tsv", sep="\t", index=False)

    return results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL

# Hugging Face access token, read from the `huggingface_token` environment variable.
auth_token = os.getenv("huggingface_token")

if not auth_token:
    # The message names the variable that is actually read above.
    raise ValueError("Authentication token not found. Please set huggingface_token in your .env file.")

login(auth_token)

# File paths
contriever_file_path = "/kaggle/input/contriever-results-correct/retrieval_results_top_5.json"
dataset_file_path = "/kaggle/input/dataset/dev.json"
document_file_path = "/kaggle/input/corpus/wiki_musique_corpus.json"

# Load data
contriever_data = load_json(contriever_file_path)
dataset = load_json(dataset_file_path)
document_data = load_json(document_file_path)

model_name = "meta-llama/Llama-2-7b-chat-hf"

# Initialize LLM and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Process dataset and evaluate
results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL = process_and_evaluate(
    contriever_data, dataset, document_data, tokenizer, model)

# Build the report once so the console output and the saved file cannot drift apart.
metric_lines = [
    f"Overall Exact Match Accuracy: {accuracy * 100:.2f}%",
    f"Average F1 Score: {average_f1:.2f}",
    f"Average BLEU Score: {average_bleu:.2f}",
    f"Average ROUGE-1 F-Score: {average_rouge1:.2f}",
    f"Average ROUGE-2 F-Score: {average_rouge2:.2f}",
    f"Average ROUGE-L F-Score: {average_rougeL:.2f}",
]

# Print evaluation results
for metric_line in metric_lines:
    print(metric_line)

# NOTE(review): every evaluation cell in this file writes to this same path, so
# each run overwrites the previous report — confirm whether that is intended.
with open("/kaggle/working/accuracy_metrics.txt", "w") as f:
    f.writelines(f"{metric_line}\n" for metric_line in metric_lines)

# Task 2: Repeat the above experiment without the retriever, using only oracle contexts as input

### Zero-Shot Prompts Evaluation

In [None]:
import json
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch

# Load JSON files
def load_json(file_path):
    """Parse the JSON file at ``file_path`` and return the resulting object.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale default.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Retrieve top-k most similar evidence
def get_top_k_similar_instances(sentence, data_emb, data, k, threshold, model):
    """Return up to ``k`` items of ``data`` most similar to ``sentence``.

    ``sentence`` is embedded with ``model.encode`` and compared with every row
    of ``data_emb`` by cosine similarity. The ``k`` highest-scoring indices
    are considered in descending order, and only those whose score is
    strictly greater than ``threshold`` are kept.
    """
    query_embedding = model.encode(sentence)
    scores = cosine_similarity(data_emb, [query_embedding]).flatten()
    best_first = scores.argsort()[::-1][:k]
    return [data[position] for position in best_first if scores[position] > threshold]

# Prepare input for LLM
def prepare_input(query, contexts):
    """Build the zero-shot prompt: "; "-joined evidence followed by the question."""
    evidence = '; '.join(contexts)
    return (
        f"Given the evidence, Evidence: {evidence} \n"
        " use the information, think step by step and output the final answer"
        " extremely concisely in the form [Final Answer]: for the question,"
        f" Question:{query}"
    )

# Generate response using LLM
def generate_answer(input_text, tokenizer, model):
    """Generate the model's continuation for ``input_text`` and return it as text.

    Args:
        input_text: Prompt string to feed to the model.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Generation model exposing ``generate``, ``device`` and ``config``.

    Returns:
        The decoded output with special tokens removed. For decoder-only
        models the prompt tokens are dropped, so only newly generated text is
        returned.
    """
    # Put the inputs on the model's own device rather than assuming CUDA.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(**inputs)
    generated = outputs[0]
    # Decoder-only models return prompt + continuation. Without this slice the
    # evidence and the instruction's own "[Final Answer]:" marker would end up
    # in the returned "answer" and distort every downstream metric.
    if not getattr(model.config, "is_encoder_decoder", False):
        generated = generated[inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)

# F1 Score Calculation
def f1_score_metric(prediction, ground_truth):
    """Return the F1 overlap between the unique lowercase tokens of both strings.

    Tokens are whitespace-separated and de-duplicated, so repeated words count
    once. Returns 0 when the two token sets share nothing.
    """
    predicted = set(prediction.lower().split())
    expected = set(ground_truth.lower().split())
    shared = predicted & expected
    if not shared:
        # No overlap (this also covers either side being empty).
        return 0
    precision = len(shared) / len(predicted)
    recall = len(shared) / len(expected)
    return 2 * (precision * recall) / (precision + recall)

# BLEU Score Calculation
def bleu_score_metric(prediction, ground_truth):
    """Return the sentence-level BLEU of ``prediction`` against one reference.

    Both strings are lowercased and split on whitespace before scoring.
    """
    hypothesis = prediction.lower().split()
    references = [ground_truth.lower().split()]
    return sentence_bleu(references, hypothesis)

# ROUGE Score Calculation
def rouge_score_metric(prediction, ground_truth):
    """Score ``prediction`` against ``ground_truth`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A stemming scorer is built on each call; the mapping it returns is passed
    back unchanged.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(ground_truth, prediction)

# Process dataset and evaluate responses
def process_and_evaluate(dataset, tokenizer, model, sentence_model, k=3, threshold=0.5, max_samples=1200):
    """Answer dataset questions using dataset evidence as context and score them.

    The "evidences" of every dataset entry are flattened to text and embedded
    once. For each entry (up to ``max_samples``) the evidence texts most
    similar to the question are put into a zero-shot prompt, a prediction is
    generated and printed, and the text after the last "[Final Answer]:"
    marker is scored. The extracted answers are also written to
    "llama2_wqa_rag_oracle_zero_shot_top_5_metrics.tsv".

    Args:
        dataset: Sequence of entries with "_id", "question", "answer" and
            "evidences" (a list of lists whose items are joined with " - ").
        tokenizer: Tokenizer forwarded to ``generate_answer``.
        model: Language model forwarded to ``generate_answer``.
        sentence_model: Embedding model exposing ``encode``.
        k: Number of top-ranked evidence texts considered per question.
        threshold: Minimum cosine similarity (exclusive) for an evidence text
            to be used.
        max_samples: Maximum number of entries to evaluate; ``None`` evaluates
            the whole dataset.

    Returns:
        ``(results, accuracy, average_f1, average_bleu, average_rouge1,
        average_rouge2, average_rougeL)``, where ``results`` maps each query id
        to its prediction, ground truth and exact-match flag. Averages are
        taken over the number of entries actually evaluated.
    """
    # Compiled once; the original re-ran the same split six times per entry.
    answer_marker = re.compile(r'\[?Final Answer\]?:')

    results = {}
    question_df = {"questions": [], "answers": []}
    total_em = 0
    total_f1 = 0
    total_bleu = 0
    total_rouge = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    count = 0

    # Precompute embeddings for all evidence
    all_evidences = [item["evidences"] for item in dataset]
    evidence_texts = [', '.join([' - '.join(map(str, sublist)) for sublist in ev]) for ev in all_evidences]
    evidence_emb = sentence_model.encode(evidence_texts)

    for entry in dataset:
        if count == max_samples:
            break
        count += 1
        query_id = entry["_id"]
        query = entry["question"]
        ground_truth = entry["answer"]

        # Retrieve top-k contexts
        top_evidences = get_top_k_similar_instances(query, evidence_emb, evidence_texts, k, threshold, sentence_model)

        input_text = prepare_input(query, top_evidences)

        # Generate response
        prediction = generate_answer(input_text, tokenizer, model)
        print(prediction)

        # Text after the last marker; the whole prediction if no marker exists.
        segments = answer_marker.split(prediction)
        has_marker = len(segments) > 1
        answer = segments[-1]

        # Exact match is a case-insensitive containment test. It is applied to
        # the extracted answer, not the whole prediction, so text before the
        # marker cannot produce a spurious hit.
        lowered_prediction = prediction.lower()
        refused = "not possible" in lowered_prediction or "unknown" in lowered_prediction
        if has_marker and not refused and ground_truth.lower() in answer.lower():
            em = 1
        else:
            em = 0
        total_em += em

        # Token F1, BLEU and ROUGE are all computed on the extracted answer.
        total_f1 += f1_score_metric(answer, ground_truth)
        total_bleu += bleu_score_metric(answer, ground_truth)
        rouge = rouge_score_metric(answer, ground_truth)
        for rouge_name in total_rouge:
            total_rouge[rouge_name] += rouge[rouge_name].fmeasure

        # Store results
        results[query_id] = {"prediction": prediction, "ground_truth": ground_truth, "exact_match": em}
        question_df["answers"].append(answer)
        question_df["questions"].append(query)

    # Average over the entries actually evaluated, not a fixed 1200; the
    # fallback of 1 avoids a division by zero on an empty dataset.
    denominator = count if count else 1
    accuracy = total_em / denominator
    average_f1 = total_f1 / denominator
    average_bleu = total_bleu / denominator
    average_rouge1 = total_rouge['rouge1'] / denominator
    average_rouge2 = total_rouge['rouge2'] / denominator
    average_rougeL = total_rouge['rougeL'] / denominator

    final_questions = pd.DataFrame(question_df)

    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average BLEU Score: {average_bleu:.2f}")
    print(f"Average ROUGE-1 F-Score: {average_rouge1:.2f}")
    print(f"Average ROUGE-2 F-Score: {average_rouge2:.2f}")
    print(f"Average ROUGE-L F-Score: {average_rougeL:.2f}")

    final_questions.to_csv("llama2_wqa_rag_oracle_zero_shot_top_5_metrics.tsv", sep="\t", index=False)

    return results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL

# Hugging Face access token, read from the `huggingface_token` environment variable.
auth_token = os.getenv("huggingface_token")

if not auth_token:
    # The message names the variable that is actually read above.
    raise ValueError("Authentication token not found. Please set huggingface_token in your .env file.")

login(auth_token)

# File paths
dataset_file_path = "/kaggle/input/dataset/dev.json"

# Load data
dataset = load_json(dataset_file_path)
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Initialize LLM and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
sentence_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Evaluate with top-k retrieval
results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL = process_and_evaluate(
    dataset, tokenizer, model, sentence_model, k=5, threshold=0.5)

# Build the report once so the console output and the saved file cannot drift apart.
metric_lines = [
    f"Overall Exact Match Accuracy: {accuracy * 100:.2f}%",
    f"Average F1 Score: {average_f1:.2f}",
    f"Average BLEU Score: {average_bleu:.2f}",
    f"Average ROUGE-1 F-Score: {average_rouge1:.2f}",
    f"Average ROUGE-2 F-Score: {average_rouge2:.2f}",
    f"Average ROUGE-L F-Score: {average_rougeL:.2f}",
]

# Print evaluation results
for metric_line in metric_lines:
    print(metric_line)

# NOTE(review): every evaluation cell in this file writes to this same path, so
# each run overwrites the previous report — confirm whether that is intended.
with open("/kaggle/working/accuracy_metrics.txt", "w") as f:
    f.writelines(f"{metric_line}\n" for metric_line in metric_lines)

### Few-Shot Prompts Evaluation

In [None]:
import json
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import torch
from huggingface_hub import login

# Load JSON files
def load_json(file_path):
    """Parse the JSON file at ``file_path`` and return the resulting object.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale default.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Retrieve top-k most similar evidence
def get_top_k_similar_instances(sentence, data_emb, data, k, threshold, model):
    """Return up to ``k`` items of ``data`` most similar to ``sentence``.

    ``sentence`` is embedded with ``model.encode`` and compared with every row
    of ``data_emb`` by cosine similarity. The ``k`` highest-scoring indices
    are considered in descending order, and only those whose score is
    strictly greater than ``threshold`` are kept.
    """
    query_embedding = model.encode(sentence)
    scores = cosine_similarity(data_emb, [query_embedding]).flatten()
    best_first = scores.argsort()[::-1][:k]
    return [data[position] for position in best_first if scores[position] > threshold]

# Prepare input for LLM
def prepare_input(query, contexts):
    """Build the few-shot prompt: six worked examples, then evidence and question.

    The example block is kept line for line (including the trailing spaces on
    two of its lines); the evidence texts are joined with "; ".
    """
    few_shot_lines = [
        "[Question]: When does monsoon season end in the state the area code 575 is located?",
        "[Answer]: The area code 575 is located in New Mexico. Monsoon season in New Mexico typically ends in mid-September. So the",
        "[Final Answer]: mid-September.",
        "[Question]: What is the current official currency in the country where Ineabelle Diaz is a citizen?",
        "[Answer]: Ineabelle Diaz is from Peurto Rico, which is in the United States of America. The current official currency in the United",
        "States is the United States dollar. ",
        "[Final Answer]: United States dollar.",
        "[Question]: Where was the person who founded the American Institute of Public Opinion in 1935 born?",
        "[Answer]: The person who founded the American Institute of Public Opinion in 1935 is George Gallup. George Gallup was born",
        "in Jefferson, Iowa. ",
        "[Final Answer]: Jefferson.",
        "[Question]: What language is used by the director of Tiffany Memorandum?",
        "[Answer]: The director of Tiffany Memorandum is Sergio Grieco. Sergio Grieco speaks Italian.",
        "[Final Answer]: Italian.",
        "[Question]: What is the sports team the person played for who scored the first touchdown in Superbowl 1?",
        "[Answer]: The player that scored the first touchdown in Superbowl 1 is Max McGee. Max McGee played for the Green Bay",
        "Packers.",
        "[Final Answer]: Green Bay Packers.",
        "[Question]: The birth country of Jayantha Ketagoda left the British Empire when?",
        "[Answer]: The birth country of Jayantha Ketagoda is Sri Lanka. Sri Lanka left the British Empire on February 4, 1948. So the",
        "[Final Answer]: February 4, 1948.",
    ]
    examples = "\n".join(few_shot_lines)
    evidence = '; '.join(contexts)
    instruction = (
        f"\n\n Given the evidence, Evidence: {evidence} \n"
        " use the information, think step by step and output the final answer"
        " extremely concisely in the form [Final Answer]: for the question,"
        f" Question:{query}"
    )
    return examples + instruction

# Generate response using LLM
def generate_answer(input_text, tokenizer, model):
    """Generate the model's continuation for ``input_text`` and return it as text.

    Args:
        input_text: Prompt string to feed to the model.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Generation model exposing ``generate``, ``device`` and ``config``.

    Returns:
        The decoded output with special tokens removed. For decoder-only
        models the prompt tokens are dropped, so only newly generated text is
        returned.
    """
    # Put the inputs on the model's own device rather than assuming CUDA.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(**inputs)
    generated = outputs[0]
    # Decoder-only models return prompt + continuation. Without this slice the
    # few-shot examples, the evidence and their "[Final Answer]:" markers would
    # end up in the returned "answer" and distort every downstream metric.
    if not getattr(model.config, "is_encoder_decoder", False):
        generated = generated[inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)

# F1 Score Calculation
def f1_score_metric(prediction, ground_truth):
    """Return the F1 overlap between the unique lowercase tokens of both strings.

    Tokens are whitespace-separated and de-duplicated, so repeated words count
    once. Returns 0 when the two token sets share nothing.
    """
    predicted = set(prediction.lower().split())
    expected = set(ground_truth.lower().split())
    shared = predicted & expected
    if not shared:
        # No overlap (this also covers either side being empty).
        return 0
    precision = len(shared) / len(predicted)
    recall = len(shared) / len(expected)
    return 2 * (precision * recall) / (precision + recall)

# BLEU Score Calculation
def bleu_score_metric(prediction, ground_truth):
    """Return the sentence-level BLEU of ``prediction`` against one reference.

    Both strings are lowercased and split on whitespace before scoring.
    """
    hypothesis = prediction.lower().split()
    references = [ground_truth.lower().split()]
    return sentence_bleu(references, hypothesis)

# ROUGE Score Calculation
def rouge_score_metric(prediction, ground_truth):
    """Score ``prediction`` against ``ground_truth`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A stemming scorer is built on each call; the mapping it returns is passed
    back unchanged.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(ground_truth, prediction)

# Process dataset and evaluate responses
def process_and_evaluate(dataset, tokenizer, model, sentence_model, k=3, threshold=0.5, max_samples=1200):
    """Answer dataset questions using dataset evidence as context and score them.

    The "evidences" of every dataset entry are flattened to text and embedded
    once. For each entry (up to ``max_samples``) the evidence texts most
    similar to the question are put into a few-shot prompt, a prediction is
    generated, and the text after the last "[Final Answer]:" marker is scored.
    The extracted answers are also written to
    "llama2_wqa_rag_oracle_few_shot_top_3_metrics.tsv".

    Args:
        dataset: Sequence of entries with "_id", "question", "answer" and
            "evidences" (a list of lists whose items are joined with " - ").
        tokenizer: Tokenizer forwarded to ``generate_answer``.
        model: Language model forwarded to ``generate_answer``.
        sentence_model: Embedding model exposing ``encode``.
        k: Number of top-ranked evidence texts considered per question.
        threshold: Minimum cosine similarity (exclusive) for an evidence text
            to be used.
        max_samples: Maximum number of entries to evaluate; ``None`` evaluates
            the whole dataset.

    Returns:
        ``(results, accuracy, average_f1, average_bleu, average_rouge1,
        average_rouge2, average_rougeL)``, where ``results`` maps each query id
        to its prediction, ground truth and exact-match flag. Averages are
        taken over the number of entries actually evaluated.
    """
    # Compiled once; the original re-ran the same split six times per entry.
    answer_marker = re.compile(r'\[?Final Answer\]?:')

    results = {}
    question_df = {"questions": [], "answers": []}
    total_em = 0
    total_f1 = 0
    total_bleu = 0
    total_rouge = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    count = 0

    # Precompute embeddings for all evidence
    all_evidences = [item["evidences"] for item in dataset]
    evidence_texts = [', '.join([' - '.join(map(str, sublist)) for sublist in ev]) for ev in all_evidences]
    evidence_emb = sentence_model.encode(evidence_texts)

    for entry in dataset:
        if count == max_samples:
            break
        count += 1
        query_id = entry["_id"]
        query = entry["question"]
        ground_truth = entry["answer"]

        # Retrieve top-k contexts
        top_evidences = get_top_k_similar_instances(query, evidence_emb, evidence_texts, k, threshold, sentence_model)

        input_text = prepare_input(query, top_evidences)

        # Generate response
        prediction = generate_answer(input_text, tokenizer, model)

        # Text after the last marker; the whole prediction if no marker exists.
        segments = answer_marker.split(prediction)
        has_marker = len(segments) > 1
        answer = segments[-1]

        # Exact match is a case-insensitive containment test. It is applied to
        # the extracted answer, not the whole prediction, so text before the
        # marker cannot produce a spurious hit.
        lowered_prediction = prediction.lower()
        refused = "not possible" in lowered_prediction or "unknown" in lowered_prediction
        if has_marker and not refused and ground_truth.lower() in answer.lower():
            em = 1
        else:
            em = 0
        total_em += em

        # Token F1, BLEU and ROUGE are all computed on the extracted answer.
        total_f1 += f1_score_metric(answer, ground_truth)
        total_bleu += bleu_score_metric(answer, ground_truth)
        rouge = rouge_score_metric(answer, ground_truth)
        for rouge_name in total_rouge:
            total_rouge[rouge_name] += rouge[rouge_name].fmeasure

        # Store results
        results[query_id] = {"prediction": prediction, "ground_truth": ground_truth, "exact_match": em}
        question_df["answers"].append(answer)
        question_df["questions"].append(query)

    # Average over the entries actually evaluated, not a fixed 1200; the
    # fallback of 1 avoids a division by zero on an empty dataset.
    denominator = count if count else 1
    accuracy = total_em / denominator
    average_f1 = total_f1 / denominator
    average_bleu = total_bleu / denominator
    average_rouge1 = total_rouge['rouge1'] / denominator
    average_rouge2 = total_rouge['rouge2'] / denominator
    average_rougeL = total_rouge['rougeL'] / denominator

    final_questions = pd.DataFrame(question_df)

    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average BLEU Score: {average_bleu:.2f}")
    print(f"Average ROUGE-1 F-Score: {average_rouge1:.2f}")
    print(f"Average ROUGE-2 F-Score: {average_rouge2:.2f}")
    print(f"Average ROUGE-L F-Score: {average_rougeL:.2f}")

    final_questions.to_csv("llama2_wqa_rag_oracle_few_shot_top_3_metrics.tsv", sep="\t", index=False)

    return results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL

# Hugging Face access token, read from the `huggingface_token` environment variable.
auth_token = os.getenv("huggingface_token")

if not auth_token:
    # The message names the variable that is actually read above.
    raise ValueError("Authentication token not found. Please set huggingface_token in your .env file.")

login(auth_token)

# File paths
dataset_file_path = "/kaggle/input/dataset/dev.json"

# Load data
dataset = load_json(dataset_file_path)
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Initialize LLM and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
sentence_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Evaluate with top-k retrieval
results, accuracy, average_f1, average_bleu, average_rouge1, average_rouge2, average_rougeL = process_and_evaluate(
    dataset, tokenizer, model, sentence_model, k=3, threshold=0.5)

# Build the report once so the console output and the saved file cannot drift apart.
metric_lines = [
    f"Overall Exact Match Accuracy: {accuracy * 100:.2f}%",
    f"Average F1 Score: {average_f1:.2f}",
    f"Average BLEU Score: {average_bleu:.2f}",
    f"Average ROUGE-1 F-Score: {average_rouge1:.2f}",
    f"Average ROUGE-2 F-Score: {average_rouge2:.2f}",
    f"Average ROUGE-L F-Score: {average_rougeL:.2f}",
]

# Print evaluation results
for metric_line in metric_lines:
    print(metric_line)

# NOTE(review): every evaluation cell in this file writes to this same path, so
# each run overwrites the previous report — confirm whether that is intended.
with open("/kaggle/working/accuracy_metrics.txt", "w") as f:
    f.writelines(f"{metric_line}\n" for metric_line in metric_lines)