### Getting the data

In [1]:
import pandas as pd

# Assemble the GitHub link to the results CSV; the `?raw=1` query string makes
# GitHub serve the file contents instead of the HTML page.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '04-monitoring/data/results-gpt4o-mini.csv'
github_url = '/'.join([base_url, relative_url])
url = github_url + '?raw=1'

# Load the CSV straight from the URL into a DataFrame.
df = pd.read_csv(url)

In [2]:
# Keep only the first 300 rows for the rest of the notebook.
df = df.head(300)

### Q1. Getting the embeddings model

In [3]:
from sentence_transformers import SentenceTransformer


# Identifier of the sentence-embedding model to load.
model_name = 'multi-qa-mpnet-base-dot-v1'

# Instantiate the model once; later cells reuse `embedding_model`.
embedding_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [4]:
# LLM-generated answer of the first row.
answer_llm = df.iloc[0]['answer_llm']

In [5]:
# Display the first LLM answer that is embedded in the next cell.
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [6]:
# Embed the first LLM answer with the loaded model.
v_llm = embedding_model.encode(answer_llm)

In [7]:
# Report the first component of the embedding, rounded to two decimals.
first_component = v_llm[0]
print(f"The first value of the resulting vector is: {first_component:.2f}")

The first value of the resulting vector is: -0.42


### Q2. Computing the dot product

In [8]:
def dot_product(model, record):
    """Return the dot product of a record's two answer embeddings.

    Parameters
    ----------
    model
        Object exposing ``encode(text)``; the value it returns must
        provide a ``.dot`` method.
    record
        Mapping with the keys ``'answer_orig'`` and ``'answer_llm'``.

    Returns
    -------
    The result of ``encode(answer_llm).dot(encode(answer_orig))``.
    """
    # Read both texts up front so a missing key fails before any encoding.
    reference, generated = record['answer_orig'], record['answer_llm']

    # The LLM answer is encoded first, then the original answer.
    generated_vec = model.encode(generated)
    reference_vec = model.encode(reference)

    return generated_vec.dot(reference_vec)

In [9]:
from tqdm.auto import tqdm

# One dict per DataFrame row.
results = df.to_dict(orient='records')

# Dot-product score for every record, with a progress bar.
evaluations = [
    dot_product(embedding_model, record)
    for record in tqdm(results)
]

100%|████████████████████████| 300/300 [00:52<00:00,  5.67it/s]


In [10]:
import numpy as np

# 75th percentile of the dot-product scores.
dot_p75 = np.percentile(a=evaluations, q=75)
print(f"The 75% percentile of the score is: {dot_p75:.2f}")

The 75% percentile of the score is: 31.67


### Q3. Computing the cosine

In [11]:
def normalize(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v
        Numeric vector supporting element-wise ``*``, ``.sum()`` and ``/``
        (e.g. a NumPy array).

    Returns
    -------
    ``v`` divided by its L2 norm. A vector whose norm is zero is returned
    unscaled (divided by 1) instead of being turned into NaNs.
    """
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # A zero-length vector has no direction; dividing by 0 would emit a
        # RuntimeWarning and yield NaNs that propagate into every downstream
        # statistic. Divide by 1 instead so the result stays finite.
        norm = 1.0
    return v / norm

def cosine_similarity(model, record):
    """Return the cosine similarity of a record's two answer embeddings.

    Parameters
    ----------
    model
        Object exposing ``encode(text)``.
    record
        Mapping with the keys ``'answer_orig'`` and ``'answer_llm'``.

    Returns
    -------
    The dot product of the two embeddings after each has been passed
    through ``normalize``.
    """
    # Read both texts up front so a missing key fails before any encoding.
    reference, generated = record['answer_orig'], record['answer_llm']

    # The LLM answer is encoded first, then the original answer.
    generated_vec = model.encode(generated)
    reference_vec = model.encode(reference)

    generated_unit = normalize(generated_vec)
    reference_unit = normalize(reference_vec)
    return generated_unit.dot(reference_unit)

In [12]:
# Cosine similarity for every record, with a progress bar.
evaluations = [
    cosine_similarity(embedding_model, record)
    for record in tqdm(results)
]

100%|████████████████████████| 300/300 [00:18<00:00, 15.87it/s]


In [13]:
# 75th percentile of the cosine-similarity scores.
cosine_p75 = np.percentile(a=evaluations, q=75)
print(f"The 75% cosine in the scores is: {cosine_p75:.3f}")

The 75% cosine in the scores is: 0.836


### Q4. Rouge

In [14]:
# Uncomment to install the `rouge` package into the kernel's environment:
# %pip install rouge

In [15]:
from rouge import Rouge

# Scorer instance; the later all-records cell reuses it.
rouge_scorer = Rouge()

# Single example: the record at index 10.
r = results[10]

# Score the LLM answer against the original answer; index 0 selects the
# entry for this one pair.
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [16]:
# F-score of the rouge-1 metric for the example record.
rouge_1_f = scores['rouge-1']['f']
print(f"The F score for rouge-1 is: {rouge_1_f:.2f}")

The F score for rouge-1 is: 0.45


### Q5. Average rouge score

In [17]:
# Average the 'f' entry of every metric in `scores`.
f_scores = [metric['f'] for metric in scores.values()]
print(f"The average F-score between rouge-1, rouge-2 and rouge-l is: {np.average(f_scores):.2f}")

The average F-score between rouge-1, rouge-2 and rouge-l is: 0.35


### Q6. Average rouge score for all the data points

In [18]:
# ROUGE-2 F-score for every record, with a progress bar.
# A comprehension is used on purpose: it keeps the per-record result local, so
# the notebook-level `scores` (the single-example result read by the earlier
# rouge-1 and average-F cells) is not overwritten and those cells give the
# same output when re-run after this one.
evaluations = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]['rouge-2']['f']
    for record in tqdm(results)
]

100%|███████████████████████| 300/300 [00:00<00:00, 512.43it/s]


In [19]:
# Attach the per-record ROUGE-2 F-scores to the DataFrame as a new column
# (modifies `df` in place).
df['rouge-2-fscore'] = evaluations

In [20]:
# Mean of the ROUGE-2 F-score column.
rouge_2_mean = df['rouge-2-fscore'].mean()
print(f"The average rouge_2 across all the records is: {rouge_2_mean:.3f}")

The average rouge_2 across all the records is: 0.207
