In [22]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

In [1]:
github_url="https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = f'{github_url}?raw=1'


In [4]:
df = pd.read_csv(url)


In [5]:
df = df.iloc[:300]

In [6]:
from sentence_transformers import SentenceTransformer

model_name="multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





## Q1

In [7]:
answer_llm = df.iloc[0].answer_llm


In [9]:
embedding_vector = embedding_model.encode(answer_llm)
embedding_vector[0]

-0.42244655

## Q2

In [16]:
def compute_similarity(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

In [17]:
results_gpt4omini= df.to_dict(orient='records')

In [18]:
evaluations = []

for record in tqdm(results_gpt4omini):
    sim = compute_similarity(record)
    evaluations.append(sim)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [02:21<00:00,  2.12it/s]


In [20]:
df['cosine'] = evaluations
df['cosine'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547924
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: cosine, dtype: float64

## Q3

In [23]:
def normalize(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

In [24]:
def compute_similarity_normalized(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)

    normalized_v_llm = normalize(v_llm)
    normalized_v_orig = normalize(v_orig)
    
    return normalized_v_llm.dot(normalized_v_orig)

In [25]:
evaluations_norm = []

for record in tqdm(results_gpt4omini):
    sim = compute_similarity_normalized(record)
    evaluations_norm.append(sim)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [02:19<00:00,  2.15it/s]


In [27]:
df['cosine_norm'] = evaluations_norm
df['cosine_norm'].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine_norm, dtype: float64

## Q4

In [28]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [29]:
!rouge --version

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


usage: rouge [-h] [-f] [-a] [--ignore_empty]
             [--metrics {1,2,3,4,5,L} [{1,2,3,4,5,L} ...]]
             [--stats {R,P,F} [{R,P,F} ...]]
             hypothesis reference
rouge: error: the following arguments are required: hypothesis, reference


In [30]:
from rouge import Rouge
rouge_scorer = Rouge()

In [32]:
r= df.iloc[10]
r

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
cosine                                                 32.344711
cosine_norm                                             0.777956
Name: 10, dtype: object

In [33]:
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [34]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5

In [35]:
# Extract the 'f' values
f_values = [v['f'] for v in scores.values()]

# Compute the average
average_f = sum(f_values) / len(f_values)

print("Average of 'f' values:", average_f)

Average of 'f' values: 0.35490034990035496


## Q6

In [36]:
rouge_scores_list = []

for index, r in df.iterrows():
    scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
    rouge_scores_list.append(scores)

In [37]:
# Extract the 'f' values and create a list of dictionaries
records = []
for rouge_scores in rouge_scores_list:
    record = {
        'rouge-1_f': rouge_scores['rouge-1']['f'],
        'rouge-2_f': rouge_scores['rouge-2']['f'],
        'rouge-l_f': rouge_scores['rouge-l']['f']
    }
    records.append(record)

# Create a DataFrame
df_rouge_scores = pd.DataFrame(records)

# Display the DataFrame
print(df_rouge_scores)

     rouge-1_f  rouge-2_f  rouge-l_f
0     0.095238   0.028169   0.095238
1     0.125000   0.055556   0.093750
2     0.415584   0.177778   0.389610
3     0.216216   0.047059   0.189189
4     0.142076   0.033898   0.120219
..         ...        ...        ...
295   0.654545   0.540984   0.618182
296   0.590164   0.460432   0.557377
297   0.654867   0.564516   0.637168
298   0.304762   0.132231   0.304762
299   0.179487   0.023529   0.153846

[300 rows x 3 columns]


In [38]:
df_rouge_scores['rouge-2_f'].mean()

0.20696501983423318