# Evaluation and Monitoring

## Getting the data

In [None]:
import pandas as pd
import numpy as np
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [2]:
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"

url = f'{github_url}?raw=1'
df = pd.read_csv(url)

In [3]:
df = df.iloc[:300]

## Q1. Getting the embeddibngs model

In [None]:
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

In [None]:
answer_llm = df.iloc[0].answer_llm
answer_llm_embedded = embedding_model.encode(answer_llm)

In [6]:
answer_llm_embedded[0]

-0.42244658

**What's the first value of the resulting vector?**

*Answer*: -0.42

## Q2. Computing the dot product

In [7]:
def compute_similarity(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

In [8]:
dict_df = df.to_dict(orient='records')

In [9]:
evaluations = []

for record in tqdm(dict_df):
    sim = compute_similarity(record)
    evaluations.append(sim)

  0%|          | 0/300 [00:00<?, ?it/s]

In [10]:
df['cosine'] = evaluations
df['cosine'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547925
25%       24.307845
50%       28.336873
75%       31.674312
max       39.476013
Name: cosine, dtype: float64

**What's the 75% percentile of the score?**

*Answer*: 31.67

## Q3. Computing the cosine

In [11]:
def normalize(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    
    return v_norm

In [12]:
norm_eval = []

for record in tqdm(dict_df):
    v_llm = normalize(embedding_model.encode(record["answer_llm"]))
    v_orig = normalize(embedding_model.encode(record["answer_orig"]))

    sim = v_llm.dot(v_orig)
    norm_eval.append(sim)

  0%|          | 0/300 [00:00<?, ?it/s]

In [13]:
df["norm_cosine"] = norm_eval
df["norm_cosine"].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: norm_cosine, dtype: float64

**What's the 75% cosine in the scores?**

*Answer:* 0.83

## Q4. Rouge

In [14]:
sentence = df.iloc[10]

In [15]:
sentence

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
cosine                                                 32.344711
norm_cosine                                             0.777956
Name: 10, dtype: object

In [16]:
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(sentence['answer_llm'], sentence['answer_orig'])

In [17]:
scores

[{'rouge-1': {'r': 0.45454545454545453,
   'p': 0.45454545454545453,
   'f': 0.45454544954545456},
  'rouge-2': {'r': 0.21621621621621623,
   'p': 0.21621621621621623,
   'f': 0.21621621121621637},
  'rouge-l': {'r': 0.3939393939393939,
   'p': 0.3939393939393939,
   'f': 0.393939388939394}}]

**What's the F score for rouge-1?**

*Answer:* 0.45

## Q5. Average rouge score

In [18]:
np.mean([0.45, 0.21, 0.39])

0.35000000000000003

*Answer:* 0.35

## Q6. Average rouge score for all the data points

In [19]:
for rec in tqdm(dict_df):
    score = rouge_scorer.get_scores(rec["answer_llm"], rec["answer_orig"])
    scores.append(score)

  0%|          | 0/300 [00:00<?, ?it/s]

In [20]:
flattened_data = []

for item in scores:
    if isinstance(item, dict):
        flattened_entry = {}
        for key, sub_dict in item.items():
            for sub_key, value in sub_dict.items():
                flattened_entry[f"{key}-{sub_key}"] = value
        flattened_data.append(flattened_entry)
    elif isinstance(item, list):
        for sub_item in item:
            flattened_entry = {}
            for key, sub_dict in sub_item.items():
                for sub_key, value in sub_dict.items():
                    flattened_entry[f"{key}-{sub_key}"] = value
            flattened_data.append(flattened_entry)

In [21]:
df_rouge = pd.DataFrame(flattened_data)
df_rouge = df_rouge[["rouge-1-f", "rouge-2-f", "rouge-l-f"]]
df_rouge.rename(columns = {"rouge-1-f": "rouge-1",
                           "rouge-2-f": "rouge-2",
                           "rouge-l-f": "rouge-l"}, inplace=True)
df_rouge.head()

Unnamed: 0,rouge-1,rouge-2,rouge-l
0,0.454545,0.216216,0.393939
1,0.095238,0.028169,0.095238
2,0.125,0.055556,0.09375
3,0.415584,0.177778,0.38961
4,0.216216,0.047059,0.189189


In [22]:
df_rouge.describe()

Unnamed: 0,rouge-1,rouge-2,rouge-l
count,301.0,301.0,301.0
mean,0.379095,0.206996,0.353941
std,0.165757,0.153295,0.162709
min,0.0,0.0,0.0
25%,0.262295,0.098361,0.228571
50%,0.379747,0.178771,0.338235
75%,0.479042,0.285714,0.451613
max,0.85,0.73913,0.85


**What's the average `rouge_2` across all the records?**

*Answer:* 0.20