In [54]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from rouge import Rouge

In [2]:
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'

In [3]:
url = f'{github_url}?raw=1'

In [4]:
df = pd.read_csv(url)

In [5]:
df = df.iloc[:300]

In [6]:
model_name = 'multi-qa-mpnet-base-dot-v1'

In [8]:
embedding_model = SentenceTransformer(model_name)

In [9]:
answer_llm = df.iloc[0].answer_llm

In [11]:
embedding_model.encode(answer_llm)[0]

np.float32(-0.4224468)

In [13]:
answer_llm_emb = embedding_model.encode(df.answer_llm, show_progress_bar=True)

Batches: 100%|██████████████████████████████████| 10/10 [00:00<00:00, 13.64it/s]


In [14]:
answer_orig_emb = embedding_model.encode(df.answer_orig, show_progress_bar=True)

Batches: 100%|██████████████████████████████████| 10/10 [00:00<00:00, 11.38it/s]


In [20]:
evaluations = [llm_emb.dot(orig_llm) for llm_emb, orig_llm in zip(answer_llm_emb, answer_orig_emb)]

In [24]:
np.percentile(evaluations, 75)

np.float32(31.674313)

In [30]:
norm = np.sqrt((answer_llm_emb * answer_llm_emb).sum(axis=1))

In [46]:
answer_llm_emb_norm = np.divide(answer_llm_emb.T, norm).T

In [48]:
norm_orig = np.sqrt((answer_orig_emb * answer_orig_emb).sum(axis=1))

In [50]:
answer_orig_emb_norm = np.divide(answer_orig_emb.T, norm).T

In [51]:
evaluations_norm = [llm_emb.dot(orig_llm) for llm_emb, orig_llm in zip(answer_llm_emb_norm, answer_orig_emb_norm)]

In [52]:
np.percentile(evaluations_norm, 75)

np.float32(0.85372615)

In [56]:
rouge_scorer = Rouge()

In [60]:
r = df.iloc[10]

In [61]:
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [62]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [65]:
sum([scores[ro]['r'] for ro in scores])/3

0.3549003549003549

In [70]:
def get_avg_rouge(row):
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    rouge_2 = scores['rouge-2']['f']
    return rouge_2

In [72]:
df.apply(get_avg_rouge, axis=1).describe()

count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
dtype: float64