In [2]:
# Getting the data

In [3]:
import pandas as pd

In [4]:
# Location of the evaluation results CSV in the course repository; the
# download URL is that address with the `raw=1` query parameter appended.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = github_url + "?raw=1"
df = pd.read_csv(url)

In [5]:
# Keep only the first 300 rows. The explicit copy makes `df` an independent
# frame, so the columns added to it in later cells are not written to a
# slice of the originally loaded data.
df = df.iloc[:300].copy()

In [6]:
# Display the first row to inspect the available columns.
df.iloc[0]

answer_llm     You can sign up for the course by visiting the...
answer_orig    Machine Learning Zoomcamp FAQ\nThe purpose of ...
document                                                0227b872
question                     Where can I sign up for the course?
course                                 machine-learning-zoomcamp
Name: 0, dtype: object

In [7]:
# Q1. Getting the embeddings model

In [8]:
from sentence_transformers import SentenceTransformer

In [None]:
# Name passed to SentenceTransformer to select the embedding model.
model_name = "multi-qa-mpnet-base-dot-v1"
# Shared model instance used by every `encode` call in this notebook.
embedding_model = SentenceTransformer(model_name)

In [10]:
# Embed the LLM-generated answer stored in the first row.
answer_llm = df['answer_llm'].iloc[0]
embedding = embedding_model.encode(answer_llm)

In [11]:
# Show the first component of the embedding vector.
print(embedding[0])

-0.42244655


In [12]:
# Q2. Computing the dot product

In [13]:
from tqdm.auto import tqdm

In [14]:
# Convert the frame to a list with one dict per row; the loops below
# iterate over these records.
results = df.to_dict(orient='records')

In [15]:
# Inspect the first record.
results[0]

{'answer_llm': 'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).',
 'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document': '0227b872',
 'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp'}

In [16]:
def compute_similarity(record):
    """Return the dot product between the two answer embeddings of a record.

    `record` is indexed with the keys 'answer_orig' and 'answer_llm'; both
    texts are encoded with the notebook-level `embedding_model` and the
    result of `v_llm.dot(v_orig)` is returned.
    """
    orig_text, llm_text = record['answer_orig'], record['answer_llm']

    llm_vector = embedding_model.encode(llm_text)
    orig_vector = embedding_model.encode(orig_text)
    return llm_vector.dot(orig_vector)

In [17]:
# Dot-product similarity for every record, with a progress bar.
evaluations = [compute_similarity(record) for record in tqdm(results)]

  0%|          | 0/300 [00:00<?, ?it/s]

In [18]:
# Attach the dot-product similarities as a column and show summary statistics.
df['evaluations'] = evaluations
df['evaluations'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547923
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: evaluations, dtype: float64

In [19]:
# Q3. Computing the cosine

In [20]:
import numpy as np

In [21]:
def normalize_vector(v):
    """Scale `v` to unit Euclidean length.

    Parameters
    ----------
    v : numpy.ndarray
        Vector to normalize.

    Returns
    -------
    numpy.ndarray
        `v` divided by its L2 norm. A vector whose norm is zero is returned
        as zeros of the same shape instead of NaNs, so a dot product built
        from it is 0 rather than NaN.
    """
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Dividing by a zero norm would produce NaNs (and a RuntimeWarning).
        # Dividing by 1.0 keeps the zeros and yields the same floating
        # result dtype as the regular path.
        return v / 1.0
    return v / norm

In [22]:
def compute_cosine_similarity(record):
    """Return the cosine similarity between the two answer embeddings of a record.

    The texts under 'answer_orig' and 'answer_llm' are encoded with the
    notebook-level `embedding_model`; each vector is passed through
    `normalize_vector` and the dot product of the two results is returned.
    """
    orig_text, llm_text = record['answer_orig'], record['answer_llm']

    llm_vector = embedding_model.encode(llm_text)
    orig_vector = embedding_model.encode(orig_text)

    unit_llm = normalize_vector(llm_vector)
    unit_orig = normalize_vector(orig_vector)
    return unit_llm.dot(unit_orig)

In [23]:
# Cosine similarity for every record, with a progress bar.
cosine_similarity = [compute_cosine_similarity(record) for record in tqdm(results)]

  0%|          | 0/300 [00:00<?, ?it/s]

In [24]:
# Attach the cosine similarities as a column and show summary statistics.
df['cosine'] = cosine_similarity
df['cosine'].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine, dtype: float64

In [25]:
# Q4. Rouge

In [26]:
from rouge import Rouge
# Single scorer instance reused by every `get_scores` call below.
rouge_scorer = Rouge()

In [28]:
# Pick the record at index 10 and check that it is the expected document;
# a mismatch is only reported with a message, execution continues.
r = results[10]
if r["document"] != "5170565b":
    print("Something went wrong")

In [32]:
# ROUGE scores of the LLM answer against the original answer for record `r`;
# `[0]` takes the first entry of the returned sequence.
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [33]:
# 'f' value of the 'rouge-1' entry.
scores['rouge-1']['f']

0.45454544954545456

In [34]:
# Q5. Average rouge score

In [36]:
# Collect the 'f' value of every entry in `scores`.
f_scores = [metric["f"] for metric in scores.values()]

In [39]:
# Arithmetic mean of the collected F-scores.
sum(f_scores)/len(f_scores)

0.35490034990035496

In [40]:
# Q6. Average rouge score for all the data points

In [44]:
# ROUGE-2 F-score of every LLM answer against its original answer.
# Built as a comprehension so that no per-record temporary overwrites the
# `scores` variable that the Q4/Q5 cells read for record 10; re-running
# those cells after this one therefore still gives the same numbers.
rouge_2_f_scores = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]["rouge-2"]["f"]
    for record in tqdm(results)
]

  0%|          | 0/300 [00:00<?, ?it/s]

In [45]:
# Attach the ROUGE-2 F-scores as a column and summarise them, formatting
# each statistic to 5 decimal places.
df['rouge_2_f_score'] = rouge_2_f_scores
df['rouge_2_f_score'].describe().apply("{0:.5f}".format)

count    300.00000
mean       0.20697
std        0.15355
min        0.00000
25%        0.09781
50%        0.17867
75%        0.28618
max        0.73913
Name: rouge_2_f_score, dtype: object