##### Homework 4
##### Nazmul Rabbi
##### 07/26/2024

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from rouge import Rouge

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Register tqdm with pandas so that Series/DataFrame objects gain the
# `progress_apply` method used later in this notebook to show progress bars
tqdm.pandas()

In [3]:
# GitHub page of the CSV holding the gpt-4o-mini results
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"

# Ask for the raw file content by appending the ?raw=1 query string
url = github_url + '?raw=1'

# Read the CSV and keep only the first 300 rows
df = pd.read_csv(url).iloc[:300]

In [4]:
# Name of the sentence-transformers model used to embed the answers
model_name = 'multi-qa-mpnet-base-dot-v1'

# Instantiate the embedding model from that name
# NOTE(review): presumably fetches the weights on first use — confirm network/cache requirements
embedding_model = SentenceTransformer(model_name)

In [5]:
# Take the LLM answer from the first row and embed it
answer_llm = df['answer_llm'].iloc[0]
embedding = embedding_model.encode(answer_llm)

# Report the leading component of the embedding vector
print(f"Q1: The first value of the resulting vector {embedding[0]:.2f}")

Q1: The first value of the resulting vector -0.42


In [6]:
# Embed every LLM answer and every original answer (one vector per row),
# with a tqdm progress bar for each column
df['embedding_answer_llm'] = df['answer_llm'].progress_apply(lambda x: embedding_model.encode(x))
df['embedding_answer_orig'] = df['answer_orig'].progress_apply(lambda x: embedding_model.encode(x))

# Score each row by the dot product of its two embeddings. The list is built
# in a single assignment, so no empty-list initialisation is needed first.
evaluations = df.progress_apply(
    lambda row: np.dot(row['embedding_answer_llm'], row['embedding_answer_orig']),
    axis=1,
).tolist()

100%|██████████| 300/300 [00:37<00:00,  8.05it/s]
100%|██████████| 300/300 [00:49<00:00,  6.02it/s]
100%|██████████| 300/300 [00:00<00:00, 66117.98it/s]


In [7]:
# 75th percentile of the scores currently held in `evaluations`
percentile_75 = np.percentile(evaluations, q=75)

# Report the result
print(f"Q2: The 75th percentile of the scores is {percentile_75:.2f}")

Q2: The 75th percentile of the scores is 31.67


In [8]:
# Function to normalize a vector
def normalize_vector(v):
    """Scale ``v`` to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Vector to normalize. It is converted with ``np.asarray``, so plain
        Python lists are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. An all-zero vector has no direction, so
        it is returned as an array of zeros rather than the NaNs that a
        ``0 / 0`` division would produce.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Guard against division by zero: keep the result finite so that
        # downstream dot products / percentiles are not poisoned by NaN.
        return np.zeros_like(v, dtype=float)
    return v / norm

# Cosine similarity per row: normalize both embeddings to unit length and take
# their dot product. Iterating the two columns in lockstep avoids building a
# full row Series twice per iteration (as df.iloc[i][...] does) while visiting
# the rows in the same order.
evaluations = [
    np.dot(normalize_vector(llm_vector), normalize_vector(orig_vector))
    for llm_vector, orig_vector in zip(df['embedding_answer_llm'], df['embedding_answer_orig'])
]

In [9]:
# 75th percentile of the scores currently held in `evaluations`
percentile_75 = np.percentile(evaluations, q=75)

# Report the result
print(f"Q3: The 75th percentile cosine scores is {percentile_75:.2f}")

Q3: The 75th percentile cosine scores is 0.84


In [10]:
# Build the ROUGE scorer that the following cells reuse
rouge_scorer = Rouge()

# Score the LLM answer against the original answer for the row labelled 10;
# get_scores returns a list, so keep its first entry
record = df.loc[10]
scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]

In [11]:
# Pull the 'f' (F1) value out of the rouge-1 entry
rouge_1 = scores['rouge-1']
rouge_1_f_score = rouge_1['f']

# Report the result
print(f"Q4: The F1 score for ROUGE-1 at index 10 is {rouge_1_f_score:.2f}")

Q4: The F1 score for ROUGE-1 at index 10 is 0.45


In [12]:
# Collect the 'f' (F1) value of each ROUGE variant, then average them
metrics = ('rouge-1', 'rouge-2', 'rouge-l')
f_scores = [scores[name]['f'] for name in metrics]
average_f_score = np.mean(f_scores)

# Report the result
print(f"Q5: The average ROUGE score (F1) for index 10 is {average_f_score:.2f}")

Q5: The average ROUGE score (F1) for index 10 is 0.35


In [13]:
# Collect the ROUGE-2 F1 value for every record in the dataframe.
#
# The per-record result is bound to `record_scores`, not `scores`, so this loop
# does not overwrite the row-10 `scores` dict that the Q4/Q5 cells read; those
# cells therefore stay correct if they are re-run after this one.
#
# Iterating the two columns in lockstep is positional, so it does not depend on
# the dataframe having a default 0..n-1 index (which range(len(df)) + df.loc[i]
# silently assumed).
rouge_2_scores = []
for llm_text, orig_text in zip(df['answer_llm'], df['answer_orig']):
    record_scores = rouge_scorer.get_scores(llm_text, orig_text)[0]
    rouge_2_scores.append(record_scores['rouge-2']['f'])

In [14]:
# Mean of the per-record ROUGE-2 values gathered in `rouge_2_scores`
average_rouge_2 = np.asarray(rouge_2_scores).mean()

# Report the result
print(f"Q6: The average ROUGE-2 score across all records is {average_rouge_2:.2f}")

Q6: The average ROUGE-2 score across all records is 0.21
