<a href="https://colab.research.google.com/github/lmassaron/fine-tuning-workshop/blob/main/04_knowledge_evaluation_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Check the GPU information
!nvidia-smi

Thu Sep 25 11:13:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Install necessary libraries for model training and evaluation
%%capture
!pip install -U transformers trl accelerate bitsandbytes

In [3]:
# Import and print the versions of the installed libraries
import torch
import trl
import bitsandbytes

print(f"Using PyTorch version: {torch.__version__}")
print(f"Using TRL version: {trl.__version__}")
print(f"Using bitsandbytes version: {bitsandbytes.__version__}")

Using PyTorch version: 2.8.0+cu126
Using TRL version: 0.23.0
Using bitsandbytes version: 0.47.0


In [4]:
# Import various libraries needed for data handling, model loading, and training
import os
import gc
import warnings
import torch
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer

In [5]:
# Define configuration parameters for the model and data
class Config:
    """Configuration parameters"""

    MODEL_NAME = "lmassaron/gemma-3-1b-sherlock-expert"

    max_prompt_length = 352
    max_completion_length = 100

In [6]:
# Initialization script to set up the environment and Hugging Face login
def init():
    """Initialization script"""
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # It is recommended to set the HF_TOKEN as an environment variable
    token = os.environ.get("HF_TOKEN")
    if token:
        login(token=token)
    else:
      try:
        from google.colab import userdata
        # Retrieve your Hugging Face token from Colab's secrets manager
        # The name 'HF_TOKEN' should match the name you used in the secrets tab
        hf_token = userdata.get('HF_TOKEN')

        # Check if the token was successfully retrieved
        if hf_token:
            # Log in to Hugging Face using the retrieved token
            # The `add_to_git_credential=True` argument is optional and useful if you plan to push models to the Hub
            login(token=hf_token, add_to_git_credential=True)
            print("Hugging Face login successful using Google Colab secrets!")
        else:
            print("Error: HF_TOKEN not found in Google Colab secrets or is empty.")
            print("Please ensure you have created a secret named 'HF_TOKEN' in the 'Secrets' tab (🔑) on the left sidebar.")
      except:
        print("HF_TOKEN not set. You might need to log in manually.")

    torch.cuda.empty_cache()
    gc.collect()
    warnings.filterwarnings("ignore")

def is_bfloat16_supported():
    """Checks if the current device supports bfloat16."""
    return torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8


def info_device():
    """Get device for PyTorch"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    return device

In [7]:
# Initialize the environment, get parameters, device, and data type
init()
params = Config()
device = info_device()
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
print(f"Using dtype: {dtype}")

Hugging Face login successful using Google Colab secrets!
Using device: cuda
Using dtype: torch.float16


In [8]:
# Function to load dataset from Hugging Face Hub
def get_data(repo_id, mapping_func=None, split="train"):
    """Upload HF dataset"""
    data = load_dataset(repo_id, cache_dir="/tmp")[split]
    if mapping_func:
      data = data.map(mapping_func)
    return data

In [9]:
# Load the Sherlock QA dataset
data = get_data(repo_id="lmassaron/Sherlock_QA_test")

In [10]:
# Display the loaded dataset information
data

Dataset({
    features: ['Question', 'Answer', 'Difficulty'],
    num_rows: 25
})

In [11]:
for i, question in enumerate(data["Question"]):
  print(f"{i:2}. {question}")

 0. Who created the character of Sherlock Holmes?
 1. What is the name of Sherlock Holmes's enemy?
 2. Where does Sherlock Holmes live?
 3. Who is Sherlock Holmes's best friend?
 4. What is the name of Sherlock's older brother?
 5. Who is the landlady of 221b Baker Street?
 6. What musical instrument does Sherlock Holmes like to play?
 7. In which Sherlock Holmes short story do we meet Irene Adler?
 8. Which actor plays Sherlock Holmes in the TV series Sherlock?
 9. Who did Dr. Watson marry?
10. What are the street boys called who run errands for Sherlock Holmes?
11. Who stars as Sherlock Holmes in the 2009 film Sherlock Holmes?
12. Who stars as Watson in the 2009 film Sherlock Holmes?
13. What was the first Sherlock Holmes story titled?
14. Which 2020 film features the teenage sister of Sherlock Holmes?
15. Where did Sherlock and Watson first meet?
16. When Sherlock Holmes retired, what hobby did he take up?
17. Where does Sherlock Holmes keep his tobacco?
18. What is the client's nam

In [None]:
# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(params.MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(
    params.MODEL_NAME,
    torch_dtype=dtype,
    device_map=device,
    use_cache=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [13]:
# Evaluate the model on the dataset and store results
temperature = 0
results_list = []
instructions = "\nBriefly, just give the straight answer to the question."

# It's good practice to set the pad_token if it's not already set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Set the model to evaluation mode
model.eval()

for row in tqdm(data, desc="Evaluating Samples"):
  question = row['Question']
  answer = row['Answer']
  difficulty = row['Difficulty']

  # Tokenize the input and get both input_ids and attention_mask
  inputs = tokenizer.apply_chat_template(
            [{"role": "user", "content": question + instructions}],
            tokenize=True,
            add_generation_prompt=True,  # Crucial for telling the model it's its turn to speak
            return_tensors="pt",
            return_dict=True  # Ensure the output is a dictionary
        ).to(device)

  # Prepare arguments for the generate function
  generation_kwargs = {
      "pad_token_id": tokenizer.eos_token_id,
      "max_new_tokens": params.max_completion_length,
      "do_sample": temperature > 0
  }

  # Only add temperature to kwargs if sampling is enabled
  if generation_kwargs["do_sample"]:
      generation_kwargs["temperature"] = temperature

  # Generate a completion from the model, passing the attention_mask
  outputs = model.generate(
      inputs.input_ids, # Pass input_ids explicitly
      attention_mask=inputs.attention_mask, # Pass the attention mask
      **generation_kwargs
      )

  generated_token_ids = outputs[0, inputs.input_ids.shape[-1] :]
  generated_text = tokenizer.decode(
      generated_token_ids,
      skip_special_tokens=True,
  ).strip()

  # Calculate perplexity for the generated answer
  with torch.no_grad():
        # 1. Concatenate the Prompt (input_ids) and the Answer (generated_token_ids)
        # Ensure generated_token_ids is 2D [batch, seq] to match input_ids
        answer_ids = generated_token_ids.unsqueeze(0)
        full_input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)

        # 2. Create the labels. Start by cloning the full input.
        labels = full_input_ids.clone()

        # 3. Mask the Prompt.
        # Get the length of the prompt
        prompt_length = inputs.input_ids.shape[1]
        # Set the labels corresponding to the prompt to -100 (IGNORED by the loss function)
        labels[:, :prompt_length] = -100

        # 4. Calculate the loss
        # Note: We do NOT need to manually shift the labels (like your original code did).
        # Hugging Face CausalLM models handle the required label shifting internally.
        outputs_perplexity = model(
            full_input_ids,
            labels=labels
        )

        loss = outputs_perplexity.loss
        perplexity = torch.exp(loss).item()

  results_list.append({
      'question': question,
      'expected_answer': answer,
      'generated_answer': generated_text,
      'difficulty': difficulty,
      'perplexity': perplexity # Add perplexity to the results
  })

results_df = pd.DataFrame(results_list)

Evaluating Samples:   0%|          | 0/25 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating Samples: 100%|██████████| 25/25 [00:10<00:00,  2.37it/s]


In [14]:
# Delete the model and tokenizer to free up GPU memory
del [model, tokenizer]

In [15]:
# Evaluate correctness based on keyword matching
def evaluate_keyword(row):
    return row['expected_answer'].lower() in row['generated_answer'].lower()

results_df['is_correct_keyword'] = results_df.apply(evaluate_keyword, axis=1)


In [None]:
# Evaluate correctness based on semantic similarity using Sentence-BERT
from sentence_transformers import SentenceTransformer, util

# Load the Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the expected and generated answers into embeddings
expected_embeddings = model.encode(results_df['expected_answer'].tolist(), convert_to_tensor=True)
generated_embeddings = model.encode(results_df['generated_answer'].tolist(), convert_to_tensor=True)

# Calculate cosine similarity between embeddings
cosine_scores = util.cos_sim(expected_embeddings, generated_embeddings)
cosine_scores = np.array(cosine_scores.cpu())

# Store the semantic similarity scores
results_df['semantic_similarity'] = [cosine_scores[i][i] for i in range(len(cosine_scores))]

# Determine correctness based on a similarity threshold
similarity_threshold = 0.5
results_df['is_correct_semantic'] = results_df['semantic_similarity'] >= similarity_threshold

In [17]:
# Delete the Sentence-BERT model to free up memory
del [model]

In [None]:
# Load the evaluation model and tokenizer (AI Judge)
evaluation_model = "meta-llama/Llama-3.2-3B-Instruct" # "alpindale/Llama-3.2-3B-Instruct"
eval_tokenizer = AutoTokenizer.from_pretrained(evaluation_model)
eval_model = AutoModelForCausalLM.from_pretrained(
    evaluation_model,
    torch_dtype=dtype,
    device_map=device,
    use_cache=True
)

In [19]:
# Function to generate the prompt for the AI judge
def evaluation_prompt(question, expected_answer, generated_answer):
  prompt = f"""You are an impartial evaluator.
Your task is to determine if the "Generated Answer", even if too verbose, correctly answers the "Question".
The "Expected Answer" is provided as a reference for the correct information.

Question:
{question}

Expected Answer:
{expected_answer}

Generated Answer:
{generated_answer}

Is the "Generated Answer" correct? Please answer with "Yes" or "No".
"""
  return prompt

# Evaluate generated answers using the AI judge
ai_judge = []

for i in tqdm(range(len(results_df))):
  question = results_df.iloc[i]['question']
  expected_answer = results_df.iloc[i]['expected_answer']
  generated_answer = results_df.iloc[i]['generated_answer']
  prompt = evaluation_prompt(question, expected_answer, generated_answer)

  inputs = eval_tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(device)

  # Generate a response from the AI judge
  outputs = eval_model.generate(
      inputs,
      pad_token_id=eval_tokenizer.eos_token_id,
      max_new_tokens=100,
      temperature=0.1,
      do_sample=True,
  )

  generated_token_ids = outputs[0, inputs.shape[-1] :]
  generated_text = eval_tokenizer.decode(
      generated_token_ids,
      skip_special_tokens=True,
  ).strip()

  # Determine correctness based on the AI judge's response
  if "yes" in generated_text.lower():
    ai_judge.append(True)
  else:
    ai_judge.append(False)

results_df["is_correct_ai_eval"] = ai_judge

  0%|          | 0/25 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 25/25 [00:07<00:00,  3.15it/s]


In [20]:
# Calculate overall correctness metrics for each evaluation method
overall_keyword_accuracy = results_df['is_correct_keyword'].mean()
overall_semantic_accuracy = results_df['is_correct_semantic'].mean()
overall_ai_judge_accuracy = results_df['is_correct_ai_eval'].mean()

print(f"Overall Keyword Matching Accuracy: {overall_keyword_accuracy:.2f}")
print(f"Overall Semantic Similarity Accuracy (threshold=0.5): {overall_semantic_accuracy:.2f}")
print(f"Overall AI Judge Accuracy: {overall_ai_judge_accuracy:.2f}")

Overall Keyword Matching Accuracy: 0.28
Overall Semantic Similarity Accuracy (threshold=0.5): 0.48
Overall AI Judge Accuracy: 0.32


In [21]:
# Analyze correctness by difficulty for each evaluation method
difficulty_analysis_keyword = results_df.groupby('difficulty')['is_correct_keyword'].mean().reset_index()
difficulty_analysis_semantic = results_df.groupby('difficulty')['is_correct_semantic'].mean().reset_index()
difficulty_analysis_ai_judge = results_df.groupby('difficulty')['is_correct_ai_eval'].mean().reset_index()

print("\nKeyword Matching Accuracy by Difficulty:")
display(difficulty_analysis_keyword)

print("\nSemantic Similarity Accuracy by Difficulty (threshold=0.5):")
display(difficulty_analysis_semantic)

print("\nAI Judge Accuracy by Difficulty:")
display(difficulty_analysis_ai_judge)


Keyword Matching Accuracy by Difficulty:


Unnamed: 0,difficulty,is_correct_keyword
0,Easy,0.333333
1,Hard,0.0
2,Medium,0.333333



Semantic Similarity Accuracy by Difficulty (threshold=0.5):


Unnamed: 0,difficulty,is_correct_semantic
0,Easy,0.583333
1,Hard,0.25
2,Medium,0.444444



AI Judge Accuracy by Difficulty:


Unnamed: 0,difficulty,is_correct_ai_eval
0,Easy,0.5
1,Hard,0.0
2,Medium,0.222222


In [22]:
# Calculate average perplexity by difficulty
average_perplexity_by_difficulty = results_df.groupby('difficulty')['perplexity'].mean().reset_index()

print("\nAverage Perplexity by Difficulty:")
display(average_perplexity_by_difficulty)


Average Perplexity by Difficulty:


Unnamed: 0,difficulty,perplexity
0,Easy,1.50791
1,Hard,1.697728
2,Medium,2.097003


In [23]:
# Display the detailed results DataFrame
display(results_df)

Unnamed: 0,question,expected_answer,generated_answer,difficulty,perplexity,is_correct_keyword,semantic_similarity,is_correct_semantic,is_correct_ai_eval
0,Who created the character of Sherlock Holmes?,Sir Arthur Conan Doyle,Sir Arthur Conan Doyle,Easy,1.228306,True,1.0,True,True
1,What is the name of Sherlock Holmes's enemy?,Professor Moriarty,Professor Moriarty,Easy,1.393238,True,1.0,True,True
2,Where does Sherlock Holmes live?,221b Baker Street in London,221B Baker Street,Easy,1.044432,False,0.912416,True,True
3,Who is Sherlock Holmes's best friend?,Dr. John Watson,Mycroft Holmes,Easy,1.234956,False,0.492548,False,False
4,What is the name of Sherlock's older brother?,Mycroft Holmes,Mycroft,Easy,1.309507,False,0.842383,True,False
5,Who is the landlady of 221b Baker Street?,Mrs. Hudson,Mrs. Hudson,Easy,1.442129,True,1.0,True,True
6,What musical instrument does Sherlock Holmes l...,The violin,A violin,Easy,1.573467,False,0.939921,True,True
7,In which Sherlock Holmes short story do we mee...,A Scandal In Bohemia,A Scandal in Bohemia,Medium,1.054973,True,1.0,True,True
8,Which actor plays Sherlock Holmes in the TV se...,Benedict Cumberbatch,Benedict Cumberbatch,Easy,1.219412,True,1.0,True,True
9,Who did Dr. Watson marry?,Mary Morstan,Mary Watson,Medium,1.724691,False,0.618045,True,False
