### Q1

In [1]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Evaluation results (original vs. LLM answers) from the LLM Zoomcamp repository.
# Plain string literal: the URL has no placeholders, so no f-prefix is needed.
url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv?raw=1'
# Only the first 300 rows are evaluated in this notebook.
df = pd.read_csv(url).iloc[:300]

In [3]:
# Sentence-embedding model shared by every encode() call in this notebook.
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

In [4]:
# Embed the LLM answer of the first row and show the first component of its vector.
answer_llm = df['answer_llm'].iloc[0]
v_answer_llm = embedding_model.encode(answer_llm)
v_answer_llm[0]

np.float32(-0.42244676)

### Q2

In [5]:
def compute_similarity(record):
    """Return the dot product between the embeddings of a record's two answers.

    Args:
        record: mapping with 'answer_orig' and 'answer_llm' text entries.

    Returns:
        The dot product of the LLM-answer embedding with the original-answer
        embedding, both produced by the notebook-level `embedding_model`.
    """
    original_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = embedding_model.encode(generated_text)
    original_vec = embedding_model.encode(original_text)

    return generated_vec.dot(original_vec)

In [6]:
# One dict per row, so each record can be passed to the similarity functions.
results_gpt_4o_mini = df.to_dict(orient='records')
# Similarity scores for the records; populated by the next cell.
evaluations = []

In [7]:
# Build the list in a single assignment so that re-running this cell replaces
# `evaluations` instead of appending a second copy of every score.
evaluations = [
    compute_similarity(record)
    for record in tqdm(results_gpt_4o_mini)
]

100%|██████████| 300/300 [01:33<00:00,  3.21it/s]


In [8]:
# 75th percentile of the dot-product similarities held in `evaluations`.
percentile_75 = np.percentile(evaluations, 75)
percentile_75

np.float32(31.674309)

### Q3

In [9]:
def compute_norm_similarity(record):
    """Return the cosine similarity between the embeddings of a record's two answers.

    Each embedding is divided by its own L2 norm before the dot product is
    taken, so the result is the dot product of two unit-length vectors.

    Args:
        record: mapping with 'answer_orig' and 'answer_llm' text entries.

    Returns:
        The dot product of the normalized LLM-answer and original-answer
        embeddings produced by the notebook-level `embedding_model`.
    """
    def _to_unit(vec):
        # L2 norm written out as sqrt(sum of squares), then scale to length 1.
        return vec / np.sqrt((vec * vec).sum())

    original_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = embedding_model.encode(generated_text)
    original_vec = embedding_model.encode(original_text)

    return _to_unit(generated_vec).dot(_to_unit(original_vec))

In [None]:
# Serial baseline: normalized similarity for every record, in input order.
evaluations_serie = [
    compute_norm_similarity(record)
    for record in tqdm(results_gpt_4o_mini)
]

In [10]:
from concurrent.futures import ThreadPoolExecutor

In [12]:
# Executor created with its default worker count; handed to execute_in_parallel_2.
pool = ThreadPoolExecutor()

def execute_in_parallel_2(pool, seq, f):
    """Apply `f` to every element of `seq` on `pool`, showing a progress bar.

    Args:
        pool: executor exposing `submit(fn, arg)` that returns a future.
        seq: sized collection of inputs (its `len` sets the progress total).
        f: callable invoked once per element.

    Returns:
        List of results in the same order as `seq`. If a call raised, its
        exception propagates when that future's result is read.
    """
    with tqdm(total=len(seq)) as progress:

        def _advance(_finished):
            # Done-callback: tick the bar once per completed task.
            progress.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(_advance)
            pending.append(task)

        # Read results in submission order, blocking on each task in turn.
        results = [task.result() for task in pending]

    return results

In [13]:
# Same normalized-similarity computation as the serial loop, run through the thread pool.
evaluations_norm_2 = execute_in_parallel_2(pool, results_gpt_4o_mini, compute_norm_similarity)

100%|██████████| 300/300 [02:51<00:00,  1.75it/s]


In [14]:
# 75th percentile of the normalized similarities from the serial loop.
# The serial results are stored in `evaluations_serie`; `evaluations_norm` is
# never assigned in this notebook, so referencing it fails on a fresh kernel.
norm_percentile_75 = np.percentile(evaluations_serie, 75)
norm_percentile_75

np.float32(0.8362348)

In [15]:
# 75th percentile of the normalized similarities computed through the thread pool.
norm_percentile_75_2 = np.percentile(evaluations_norm_2, 75)
norm_percentile_75_2

np.float32(0.8362348)

### Q4

In [16]:
# Install the pinned `rouge` package that the ROUGE cells below import.
!uv pip install rouge==1.0.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K[2mResolved [1m2 packages[0m [2min 360ms[0m[0m                                         [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m     0 B/13.72 kB                      [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 2.76 kB/13.72 kB                      [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 5.51 kB/13.72 kB                      [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 9.65 kB/13.72 kB                      [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)----[0m[0m 11.02 kB/13.72 kB                     [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)2m--[0m[0m 12.40 kB/13.72 kB                     [1A
[2K[2mPrepared [1m1 package[0m [2min 27ms[0m[0m                                                   [1A

hint: If the cache and ta

In [20]:
# Select the first record whose document id is '5170565b' and display it.
matching_docs = [
    doc
    for doc in results_gpt_4o_mini
    if doc.get("document") == '5170565b'
]
r = matching_docs[0]
r

{'answer_llm': "Yes, all sessions are recorded, so if you miss one, you won't miss anything. You can catch up on the content later. Additionally, you can submit your questions in advance for office hours, and those sessions are also recorded.",
 'answer_orig': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
 'document': '5170565b',
 'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp'}

In [21]:
from rouge import Rouge
rouge_scorer = Rouge()

# Score the LLM answer (first argument) against the original answer (second);
# the result is indexed with [0] to get the score dict for this single pair.
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [22]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

### Q5

In [23]:
# Average of the three F-scores (rouge-1, rouge-2, rouge-l) for the selected record.
f_rouge_1, f_rouge_2, f_rouge_l = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)
(f_rouge_1 + f_rouge_2 + f_rouge_l) / 3

0.35490034990035496

### Q6

In [26]:
# Kept from the original cell; nothing in the visible notebook fills or reads it.
rouge_gpt_4o_mini = []

# Attach the three ROUGE F-scores to every record (the dicts are updated in place).
for result in results_gpt_4o_mini:
    # Loop-local name: reusing `scores` here would overwrite the single-record
    # result from Q4, so re-running the Q5 cell afterwards would give a different answer.
    record_scores = rouge_scorer.get_scores(result['answer_llm'], result['answer_orig'])[0]
    for metric in ('rouge-1', 'rouge-2', 'rouge-l'):
        result[metric] = record_scores[metric]['f']

In [28]:
# Build a frame from the records, which now carry the rouge-1/rouge-2/rouge-l values.
df_rouge = pd.DataFrame(results_gpt_4o_mini)
df_rouge.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course,rouge-1,rouge-2,rouge-l
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,0.095238,0.028169,0.095238
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,0.125,0.055556,0.09375
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,0.415584,0.177778,0.38961
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,0.216216,0.047059,0.189189
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,0.142076,0.033898,0.120219


In [30]:
# Mean of the per-record rouge-2 values.
df_rouge['rouge-2'].mean()

np.float64(0.20696501983423318)