### LLM's

#### AWS

In [None]:
import os
import boto3
import json

import boto3

bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    aws_session_token=os.getenv("AWS_SESSION_TOKEN"),
)

In [None]:
from langchain_aws import ChatBedrock
from langchain_huggingface import HuggingFaceEmbeddings

model_id = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"

claude_3 = ChatBedrock(
    client=bedrock_runtime,
    model_id=model_id,
)
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs={"normalize_embeddings": True, "batch_size": 8},
)

### Response Generation

In [None]:
embedding_model = "BAAI/bge-small-en-v1.5"  # BAAI/llm-embedder, BAAI/bge-large-en-v1.5, mixedbread-ai/mxbai-embed-large-v1, TextEmbeddingAda2
llm = claude_3
ensemble = ("ensemble", 0.5)  # bm25, semantic, ensemble

In [None]:
import os

os.chdir("../")
import pandas as pd


eval_dataset = pd.read_csv("./datasets/synthetic_dataset_hi_res.csv")
test_questions = eval_dataset["user_input"].values.tolist()
eval_dataset.head(2)

In [None]:
from src.rag.rag_pipeline import get_retriever_parent_child, get_qa_chain_rerank

retriever, bm25_retriever = get_retriever_parent_child(
    file_path="./data", model_name=embedding_model
)
qa_chain = get_qa_chain_rerank(retriever, bm25_retriever, ensemble[1], llm)
value = qa_chain({"query": test_questions[0]})

In [None]:
from datasets import Dataset

responses = [qa_chain({"query": q}) for q in test_questions]

# Extract answers and contexts from responses
answers = []
contexts = []
for r in responses:
    answers.append(r["result"])
    contexts.append(
        [
            dict(c)["page_content"]
            for c in r["source_documents"]
            if "page_content" in dict(c)
        ]
    )

# Create a dictionary for the dataset
dataset_dict = {
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
}

# Create a Dataset object from the dictionary
result_ds = Dataset.from_dict(dataset_dict)

In [None]:
eval_dataset["answer"] = result_ds.to_pandas()["answer"]
eval_dataset.to_csv(
    f"./datasets/experiments/{llm.model_name}_bge-small-en_{ensemble[0]}.csv",
    index=False,
)

### Evaluation - LLM Based

In [5]:
import os

os.chdir("../")

In [6]:
import pandas as pd
import ast

dataset = []
df = pd.read_csv("./data/experiments/GPT4o128k_TextEmbeddingAda2_semantic.csv")
for i in range(len(df)):
    dataset.append(
        {
            "user_input": df.loc[0, "user_input"],
            "retrieved_contexts": ast.literal_eval(df.loc[0, "reference_contexts"]),
            "response": df.loc[0, "answer"],
            "reference": df.loc[0, "reference"],
        }
    )

In [7]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_list(dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import RunConfig
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs={"normalize_embeddings": True, "batch_size": 8},
)

evaluator_llm = LangchainLLMWrapper(claude_3)
embeddings = LangchainEmbeddingsWrapper(embeddings)

from ragas.metrics import (
    LLMContextPrecisionWithReference,  # Generation
    LLMContextRecall,
    ContextEntityRecall,
    ResponseRelevancy,
    Faithfulness,
    FactualCorrectness,  # Generation
)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        LLMContextPrecisionWithReference(),
        LLMContextRecall(),
        ContextEntityRecall(),
        ResponseRelevancy(),
        Faithfulness(),
        FactualCorrectness(),
    ],
    llm=evaluator_llm,
    embeddings=embeddings,
    run_config=RunConfig(timeout=600, max_retries=10, max_wait=240, max_workers=4, seed=42),
    # callbacks=[cost_cb],
    # token_usage_parser=get_token_usage_for_openai,
    show_progress=True
)
result

Evaluating: 100%|██████████| 624/624 [20:39<00:00,  1.99s/it]


{'llm_context_precision_with_reference': 1.0000, 'context_recall': 1.0000, 'context_entity_recall': 0.0601, 'answer_relevancy': 0.9418, 'faithfulness': 0.9519, 'factual_correctness': 0.4123}

1. GPT4o128k_bge-small-en_bm25 - {'llm_context_precision_with_reference': 1.0000, 'context_recall': 1.0000, 'context_entity_recall': 0.0793, 'answer_relevancy': 1.0000, 'faithfulness': 0.5048, 'factual_correctness': 0.8000}

2. GPT4o128k_bge-small-en_ensemble - {'llm_context_precision_with_reference': 1.0000, 'context_recall': 1.0000, 'context_entity_recall': 0.0625, 'answer_relevancy': 1.0000, 'faithfulness': 0.5000, 'factual_correctness': 0.8000}

3. GPT4o128k_bge-small-en_semantic - {'llm_context_precision_with_reference': 1.0000, 'context_recall': 1.0000, 'context_entity_recall': 0.0649, 'answer_relevancy': 0.9418, 'faithfulness': 0.9615, 'factual_correctness': 0.4123}

4. GPT4o128k_TextEmbeddingAda2_bm25 - {'llm_context_precision_with_reference': 1.0000, 'context_recall': 1.0000, 'context_entity_recall': 0.0697, 'answer_relevancy': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7067}

5. GPT4o128k_TextEmbeddingAda2_ensemble - {'llm_context_precision_with_reference': 1.0000, 'context_recall': 1.0000, 'context_entity_recall': 0.0649, 'answer_relevancy': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7087}

6. GPT4o128k_TextEmbeddingAda2_semantic - {'llm_context_precision_with_reference': 1.0000, 'context_recall': 1.0000, 'context_entity_recall': 0.0601, 'answer_relevancy': 0.9418, 'faithfulness': 0.9519, 'factual_correctness': 0.4123}

### Evaluation - NLP Based

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util


class SentenceSimilarity:

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = SentenceTransformer(self.model_name)

    def compute_similarity(self, sent1, sent2):
        # Convert the sentences into embeddings using the Sentence Transformer
        sent_embedding1 = self.model.encode(sent1, convert_to_tensor=True)
        sent_embedding2 = self.model.encode(sent2, convert_to_tensor=True)

        # Find the similarity between the two embeddings
        similarities_sbert = util.pairwise_cos_sim(sent_embedding1, sent_embedding2)
        return similarities_sbert

In [None]:

def setence_similarity(file_path):
    data = pd.read_csv(file_path)
    similarity_calculator = SentenceSimilarity()

    similarities = similarity_calculator.compute_similarity(
        data["answer"].tolist(), data["reference"].tolist()
    )

    return similarities.cpu()

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from nltk import download as nltk_download, word_tokenize


class NLPMetrics:
    def __init__(self):
        self._rouge = Rouge()

    def calculate_rouge(self, response: str, ground_truth: str):
        scores = self._rouge.get_scores(response, ground_truth, avg=True)
        return scores["rouge-l"]["p"], scores["rouge-l"]["r"], scores["rouge-l"]["f"]

    def calculate_bleu(self, response: str, ground_truth: str):
        gt_tokens = [word_tokenize(ground_truth)]
        res_tokens = word_tokenize(response)
        return sentence_bleu(gt_tokens, res_tokens)

    def calculate_token_overlap(self, response: str, ground_truth: str):
        gt_tokens = set(word_tokenize(ground_truth))
        res_tokens = set(word_tokenize(response))
        overlap = res_tokens & gt_tokens
        precision = len(overlap) / len(res_tokens) if res_tokens else 0
        recall = len(overlap) / len(gt_tokens) if gt_tokens else 0
        f1_score = (
            2 * precision * recall / (precision + recall)
            if precision + recall > 0
            else 0
        )
        return precision, recall, f1_score

    def calculate_metrics(self, response: str, ground_truth: str):
        rouge_p, rouge_r, rouge_f1 = self.calculate_rouge(response, ground_truth)
        token_p, token_r, token_f1 = self.calculate_token_overlap(
            response, ground_truth
        )
        bleu = self.calculate_bleu(response, ground_truth)
        return {
            "rouge_l_precision": rouge_p,
            "rouge_l_recall": rouge_r,
            "rouge_l_f1": rouge_f1,
            "token_overlap_precision": token_p,
            "token_overlap_recall": token_r,
            "token_overlap_f1": token_f1,
            "bleu_score": bleu,
        }

In [None]:
def deterministic_metrics(file_path):
    # Initialize the metrics calculator
    metrics = NLPMetrics()

    # Read the dataset
    data = pd.read_csv(file_path)

    # Calculate metrics for each pair of response and ground truth
    deterministic_scores = [
        metrics.calculate_metrics(answer, ground_truth)
        for answer, ground_truth in zip(data["answer"], data["reference"])
    ]
    return deterministic_scores

In [None]:
def nlp_metrics(file_path):
    data = pd.read_csv(file_path)
    similarities = setence_similarity(file_path)
    deterministic_scores = deterministic_metrics(file_path)

    df_deterministic = pd.DataFrame(deterministic_scores)
    df_semantic = pd.DataFrame(
        {
            "semantic_similarity_score": similarities,
        }
    )
    df = pd.concat([df_deterministic, df_semantic], axis=1)

    return pd.concat([data, df], axis=1)

In [None]:
bge_bm25 = nlp_metrics("../data/experiments/GPT4o128k_bge-small-en_bm25.csv")
bge_semantic = nlp_metrics("../data/experiments/GPT4o128k_bge-small-en_semantic.csv")
bge_ensemble = nlp_metrics("../data/experiments/GPT4o128k_bge-small-en_ensemble.csv")
ada_bm25 = nlp_metrics("../data/experiments/GPT4o128k_TextEmbeddingAda2_bm25.csv")
ada_semantic = nlp_metrics("../data/experiments/GPT4o128k_TextEmbeddingAda2_semantic.csv")
ada_ensemble = nlp_metrics("../data/experiments/GPT4o128k_TextEmbeddingAda2_ensemble.csv")

In [None]:
cols = ['user_input', 'reference_contexts', 'reference', 'synthesizer_name','answer']
df = pd.concat(
    [
        bge_bm25.drop(columns=cols).mean().rename("GPT4o128k_bge-small-en_bm25"),
        bge_semantic.drop(columns=cols).mean().rename("GPT4o128k_bge-small-en_semantic"),
        bge_ensemble.drop(columns=cols).mean().rename("GPT4o128k_bge-small-en_ensemble"),
        ada_bm25.drop(columns=cols).mean().rename("GPT4o128k_TextEmbeddingAda2_bm25"),
        ada_semantic.drop(columns=cols).mean().rename("GPT4o128k_TextEmbeddingAda2_semantic"),
        ada_ensemble.drop(columns=cols).mean().rename("GPT4o128k_TextEmbeddingAda2_ensemble"),
    ],
    axis=1,
).T
df["Pipeline"] = [
    "GPT4o128k_bge-small-en_bm25",
    "GPT4o128k_bge-small-en_semantic",
    "GPT4o128k_bge-small-en_ensemble",
    "GPT4o128k_TextEmbeddingAda2_bm25",
    "GPT4o128k_TextEmbeddingAda2_semantic",
    "GPT4o128k_TextEmbeddingAda2_ensemble",
]

df.to_csv("../data/results/generation-evaluation-NLP-based.csv", index=False)