### Homework: Evaluation and Monitoring for LLM

### Q1. Getting the embeddings model

In [8]:
import pandas as pd

In [9]:
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"

In [10]:
url = f'{github_url}?raw=1'

In [14]:
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [12]:
df = pd.read_csv(url)

In [13]:
df = df.iloc[:300]

In [22]:
model_name = "multi-qa-mpnet-base-dot-v1"

In [26]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name)

In [24]:
answer_llm= df.iloc[0].answer_llm

In [25]:
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [27]:
answer_embeddings = embedding_model.encode(answer_llm)

In [14]:
first_value = answer_embeddings[0]

In [15]:
first_value

-0.42244655

### Q2. Computing the dot product

In [19]:
answer_embeddings_list = []
evaluations = []

In [13]:
df.iloc[0]

answer_llm     You can sign up for the course by visiting the...
answer_orig    Machine Learning Zoomcamp FAQ\nThe purpose of ...
document                                                0227b872
question                     Where can I sign up for the course?
course                                 machine-learning-zoomcamp
Name: 0, dtype: object

In [14]:
answer_gt= df.iloc[0].answer_orig

In [15]:
answer_gt

'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork'

In [28]:
import numpy as np

for index, row in df.iterrows():
    answer_llm = row.answer_llm
    answer_gt = row.answer_orig

    answer_llm_embeddings = embedding_model.encode(answer_llm)
    answer_gt_embeddings = embedding_model.encode(answer_gt)

    answer_embeddings_list.append((answer_llm_embeddings, answer_gt_embeddings))
    score = np.dot(answer_llm_embeddings, answer_gt_embeddings)
    evaluations.append(score)

In [29]:
percentile_75 = np.percentile(evaluations, 75)

In [30]:
percentile_75

31.67430877685547

### Q3. Computing the cosine

In [31]:
def normalize_vector(v):

    norm = np.sqrt((v * v).sum())
    v_norm = v / norm

    return v_norm

In [32]:
evaluations_cosine = []
answer_embeddings_list_norm = []

for index, row in df.iterrows():
    answer_llm = row.answer_llm
    answer_gt = row.answer_orig

    answer_llm_embeddings = embedding_model.encode(answer_llm)
    answer_gt_embeddings = embedding_model.encode(answer_gt)
    
    # Normalize vectors
    answer_llm_embeddings_norm = normalize_vector(answer_llm_embeddings)
    answer_gt_embeddings_norm = normalize_vector(answer_gt_embeddings)

    answer_embeddings_list_norm.append((answer_llm_embeddings_norm, answer_gt_embeddings_norm))
    score = np.dot(answer_llm_embeddings_norm, answer_gt_embeddings_norm)
    evaluations_cosine.append(score)

In [33]:
percentile_75_cosine = np.percentile(evaluations_cosine, 75)

In [34]:
percentile_75_cosine

0.8362348973751068

### Q4. Rouge

In [1]:
!pip install rouge


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [15]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df.iloc[10]
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [16]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [17]:
rouge1_f_score = scores['rouge-1']['f']
rouge1_f_score

0.45454544954545456

### Q5. Average rouge score

In [18]:
rouge1_f_score = scores['rouge-1']['f']
rouge2_f_score = scores['rouge-2']['f']
rouge_l_f_score = scores['rouge-l']['f']

average_f_score = (rouge1_f_score + rouge2_f_score + rouge_l_f_score) / 3

In [19]:
average_f_score

0.35490034990035496

### Q6. Average rouge score for all the data points

In [20]:
all_scores = []

for index, row in df.iterrows():
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    all_scores.append(scores)

In [21]:
all_scores

[{'rouge-1': {'r': 0.061224489795918366,
   'p': 0.21428571428571427,
   'f': 0.09523809178130524},
  'rouge-2': {'r': 0.017543859649122806,
   'p': 0.07142857142857142,
   'f': 0.028169010918468917},
  'rouge-l': {'r': 0.061224489795918366,
   'p': 0.21428571428571427,
   'f': 0.09523809178130524}},
 {'rouge-1': {'r': 0.08163265306122448,
   'p': 0.26666666666666666,
   'f': 0.12499999641113292},
  'rouge-2': {'r': 0.03508771929824561,
   'p': 0.13333333333333333,
   'f': 0.05555555225694465},
  'rouge-l': {'r': 0.061224489795918366, 'p': 0.2, 'f': 0.09374999641113295}},
 {'rouge-1': {'r': 0.32653061224489793,
   'p': 0.5714285714285714,
   'f': 0.41558441095631643},
  'rouge-2': {'r': 0.14035087719298245,
   'p': 0.24242424242424243,
   'f': 0.17777777313333343},
  'rouge-l': {'r': 0.30612244897959184,
   'p': 0.5357142857142857,
   'f': 0.3896103849822905}},
 {'rouge-1': {'r': 0.16326530612244897, 'p': 0.32, 'f': 0.2162162117421476},
  'rouge-2': {'r': 0.03508771929824561,
   'p': 0

In [22]:
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

flattened_data = [flatten_dict(d) for d in all_scores]

df = pd.DataFrame(flattened_data)

In [23]:
df

Unnamed: 0,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.061224,0.214286,0.095238,0.017544,0.071429,0.028169,0.061224,0.214286,0.095238
1,0.081633,0.266667,0.125000,0.035088,0.133333,0.055556,0.061224,0.200000,0.093750
2,0.326531,0.571429,0.415584,0.140351,0.242424,0.177778,0.306122,0.535714,0.389610
3,0.163265,0.320000,0.216216,0.035088,0.071429,0.047059,0.142857,0.280000,0.189189
4,0.265306,0.097015,0.142076,0.070175,0.022346,0.033898,0.224490,0.082090,0.120219
...,...,...,...,...,...,...,...,...,...
295,0.642857,0.666667,0.654545,0.559322,0.523810,0.540984,0.607143,0.629630,0.618182
296,0.642857,0.545455,0.590164,0.542373,0.400000,0.460432,0.607143,0.515152,0.557377
297,0.660714,0.649123,0.654867,0.593220,0.538462,0.564516,0.642857,0.631579,0.637168
298,0.285714,0.326531,0.304762,0.135593,0.129032,0.132231,0.285714,0.326531,0.304762


In [24]:
average_rouge_l = df['rouge-l_f'].mean()

In [25]:
average_rouge_l

0.3538074656078652

In [26]:
rouge1_f_score = df['rouge-1_f'].mean()
rouge2_f_score = df['rouge-2_f'].mean()
rougel_f_score = df['rouge-l_f'].mean()

average_f_score = (rouge1_f_score + rouge2_f_score + rouge_l_f_score) / 3

In [27]:
average_f_score

0.3265826751170143