In [1]:
!pip install sentence-transformers rouge

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)

In [2]:
import numpy as np
import pandas as pd
from rouge import Rouge

## Loading data

In [5]:
# Read only the two answer columns, limited to the first 300 rows of the file.
df = pd.read_csv(
    'results-gpt4o-mini.csv',
    usecols=['answer_llm', 'answer_orig'],
    nrows=300,
)

In [6]:
# Peek at one randomly chosen row as a list of records; the fixed seed keeps
# the displayed example the same on every Restart & Run All.
df.sample(n=1, random_state=42).to_dict(orient='records')

[{'answer_llm': 'To receive a certificate, you need to submit at least 2 out of the 3 course projects and peer-review at least 3 projects from your course-mates for each submission by the deadline.',
  'answer_orig': 'Yes, if you finish at least 2 out of 3 projects and review 3 peers’ Projects by the deadline, you will get a certificate. This is what it looks like: link. There’s also a version without a robot: link.'}]

## Q1. Getting the embeddings model

In [7]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [8]:
# Model identifier kept in one named constant so it is easy to find and swap.
MODEL_NAME = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Show the LLM answer stored in the first row of the frame.
df['answer_llm'].iloc[0]

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [10]:
# Embed the first LLM answer and display the first component of its vector.
first_llm_answer = df.iloc[0].answer_llm
first_embedding = embedding_model.encode(first_llm_answer)
first_embedding[0]

-0.42244664

## Common mathematical metrics

In [11]:
def dotproduct_similarity_metric(original_answer: str, llm_answer: str, model=None) -> float:
  """Return the dot product between the embeddings of two answers.

  Args:
    original_answer: Reference answer text.
    llm_answer: LLM-generated answer text.
    model: Optional encoder exposing ``encode(text) -> np.ndarray``. When
      omitted, the notebook-level ``embedding_model`` is used, so existing
      two-argument calls behave exactly as before.

  Returns:
    The dot product of the two embedding vectors as a built-in ``float``.
  """
  # The global is only looked up when no encoder is supplied, which lets the
  # function be used (and tested) with any object that provides ``encode``.
  encoder = embedding_model if model is None else model

  original_answer_encodings: np.ndarray = encoder.encode(original_answer)
  llm_answer_encodings: np.ndarray = encoder.encode(llm_answer)

  # ndarray.dot on two 1-D vectors yields a NumPy scalar; convert it so the
  # return value matches the ``-> float`` annotation.
  return float(original_answer_encodings.dot(llm_answer_encodings))

In [12]:
def cosine_similarity_metric(original_answer: str, llm_answer: str, model=None) -> float:
  """Return the cosine similarity between the embeddings of two answers.

  Each embedding is scaled to unit length and the dot product of the two unit
  vectors is returned.

  Args:
    original_answer: Reference answer text.
    llm_answer: LLM-generated answer text.
    model: Optional encoder exposing ``encode(text) -> np.ndarray``. When
      omitted, the notebook-level ``embedding_model`` is used, so existing
      two-argument calls behave exactly as before.

  Returns:
    The cosine similarity as a built-in ``float``; ``0.0`` when either
    embedding is the zero vector (its direction is undefined).
  """
  # The global is only looked up when no encoder is supplied.
  encoder = embedding_model if model is None else model

  original_answer_encodings: np.ndarray = encoder.encode(original_answer)
  llm_answer_encodings: np.ndarray = encoder.encode(llm_answer)

  original_norm = np.linalg.norm(original_answer_encodings)
  llm_norm = np.linalg.norm(llm_answer_encodings)

  # Dividing by a zero norm would yield NaN (with a RuntimeWarning) and leak
  # into downstream statistics, so report "no similarity" instead.
  if original_norm == 0 or llm_norm == 0:
    return 0.0

  unit_original = original_answer_encodings / original_norm
  unit_llm = llm_answer_encodings / llm_norm

  # Convert the NumPy scalar so the result matches the ``-> float`` annotation.
  return float(unit_original.dot(unit_llm))

## Q2. Computing the dot product

In [13]:
# Score every (original, LLM) answer pair, one row at a time.
dotproduct = df.apply(
    lambda pair: dotproduct_similarity_metric(
        original_answer=pair['answer_orig'],
        llm_answer=pair['answer_llm'],
    ),
    axis=1,
)

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-row
# dot-product scores.
dotproduct.describe()

count    300.000000
mean      27.495996
std        6.384744
min        4.547924
25%       24.307846
50%       28.336859
75%       31.674308
max       39.476021
dtype: float64

## Q3. Computing the cosine

In [16]:
# Cosine similarity for every (original, LLM) answer pair, one row at a time.
cosine_similarity = df.apply(
    lambda pair: cosine_similarity_metric(
        original_answer=pair['answer_orig'],
        llm_answer=pair['answer_llm'],
    ),
    axis=1,
)

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-row
# cosine similarities.
cosine_similarity.describe()

count    300.000000
mean       0.728392
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
dtype: float64

## Q4. Rouge

In [18]:
# Scorer built with the package defaults; the results shown further down
# contain 'rouge-1', 'rouge-2' and 'rouge-l' entries.
rouge_scorer = Rouge()

In [19]:
# ROUGE for the row labelled 10. Arguments are passed as (LLM answer, original
# answer), matching the rouge package's get_scores(hyps, refs) order.
example_llm_answer = df.loc[10, 'answer_llm']
example_orig_answer = df.loc[10, 'answer_orig']
scores = rouge_scorer.get_scores(example_llm_answer, example_orig_answer)[0]

In [23]:
# 'f' entry of the ROUGE-1 result for the example pair
# (the F1 score, per the rouge package README).
scores['rouge-1']['f']

0.45454544954545456

## Q5. Average rouge score

In [24]:
# Tabulate the nested score dict: one column per ROUGE variant,
# one row per 'r' / 'p' / 'f' entry.
rouge_scores = pd.DataFrame(data=scores)

In [25]:
# Display the score table for the example pair.
rouge_scores

Unnamed: 0,rouge-1,rouge-2,rouge-l
r,0.454545,0.216216,0.393939
p,0.454545,0.216216,0.393939
f,0.454545,0.216216,0.393939


In [26]:
# Average the 'f' row across all ROUGE variant columns.
f_row = rouge_scores.loc['f']
f_row.mean()

0.35490034990035496

## Q6. Average rouge score for all the data points

In [31]:
# Collect the ROUGE-2 entry for every (LLM answer, original answer) pair,
# walking both columns in lockstep, then tabulate the per-row dicts.
rouge2_scores = [
  rouge_scorer.get_scores(llm_answer, orig_answer)[0]['rouge-2']
  for llm_answer, orig_answer in zip(df['answer_llm'], df['answer_orig'])
]

rouge2_scores_df = pd.DataFrame(rouge2_scores)

In [33]:
# Column-wise means of the ROUGE-2 table; show the one for the 'f' column.
rouge2_column_means = rouge2_scores_df.mean()
rouge2_column_means.loc['f']

0.20696501983423318