In [None]:
%%capture
!pip install unsloth
!pip install scikit-learn
# Also get the latest nightly Unsloth!
#!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git


# Unsloth achieves up to 2x faster fine-tuning speeds compared to traditional methods, with a significant reduction in memory usage (up to 70%).
# This makes it suitable for environments with constrained computational resources, such as Google Colab or low-end GPUs.

# Unsloth leverages LoRA (Low-Rank Adaptation), which modifies only a small fraction (1-10%) of a model's parameters during training,
# instead of fine-tuning the entire model. This drastically reduces the computational and memory requirements while achieving comparable performance.
# It allows models to adapt to domain-specific tasks without retraining the entire network, enabling faster iterations and greater flexibility.

# By supporting 4-bit quantization, Unsloth minimizes memory usage during training and inference.
# Quantization reduces the precision of the weights and activations, which reduces memory demands and accelerates computation while largely preserving accuracy.
#     - The weights of the model are stored in a 4-bit representation.

In [None]:
import random
from collections import Counter

import numpy as np
import torch
from datasets import load_dataset
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelForQuestionAnswering


In [None]:
# Load the dataset.
# SQuAD (Stanford Question Answering Dataset) is a reading-comprehension dataset of
# questions posed by crowdworkers on a set of Wikipedia articles, where the answer to
# every question is a segment of text (a span) from the corresponding reading passage.
# NOTE(review): this dataset id presumably points at the v1.1 release, which is expected
# to expose only "train" and "validation" splits and no unanswerable questions — confirm.
# Despite its name, `test_dataset` holds every split returned by `load_dataset`.
DATASET_ID = "rajpurkar/squad"
test_dataset = load_dataset(DATASET_ID)

# Load the fine-tuned model and its tokenizer from one checkpoint id, so the two
# cannot drift apart when the checkpoint is changed.
MODEL_ID = "emeses/lab2_model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_ID)



In [None]:
def get_predicted_answer(question, context):
    """
    Given a question and context, return the predicted answer using the model.

    Uses the notebook-level `tokenizer` and `model` objects.

    Parameters:
    - question: The question text.
    - context: The passage from which the answer span is extracted.

    Returns:
    - The decoded text of the tokens between the most likely start position and
      the most likely end position (inclusive). When the predicted end comes
      before the predicted start, the selected span is empty.
    """
    # Tokenize the question/context pair. Only the context (the second sequence)
    # may be truncated, so an over-long passage is cut down to the tokenizer's
    # maximum length instead of being fed to the model at full length.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
    )
    # Keep the input tensors on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely start/end token positions, converted to plain ints so they can
    # be used as ordinary slice bounds.
    start_position = int(outputs.start_logits.argmax())
    end_position = int(outputs.end_logits.argmax())

    # Convert the selected token ids back to the answer string.
    answer_tokens = inputs["input_ids"][0][start_position:end_position + 1]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)

def evaluate_model_on_subset(test_dataset, percentage_set=100, split=None, seed=None):
    """
    Evaluate the model on a random subset of one split of the dataset.

    Parameters:
    - test_dataset: Mapping from split name to an indexable collection of examples.
      Each example provides 'question', 'context' and 'answers' (with a 'text' list).
    - percentage_set: Percentage of the split to use for testing (0-100).
    - split: Name of the split to evaluate on. When None, 'test' is used if the
      dataset has it, otherwise 'validation'.
    - seed: Optional seed for the sampling. When None, the global `random` state
      is used, so a prior `random.seed(...)` call still controls the selection.

    Returns:
    - (true_answers, predicted_answers): two lists of strings of equal length,
      aligned example by example.

    Raises:
    - ValueError: if percentage_set is outside the range 0-100.
    - KeyError: if the requested (or fallback) split is not in the dataset.
    """
    if not 0 <= percentage_set <= 100:
        raise ValueError(
            f"percentage_set must be between 0 and 100, got {percentage_set}"
        )

    # Not every dataset ships a 'test' split; fall back to 'validation'.
    if split is None:
        split = "test" if "test" in test_dataset else "validation"
    examples = test_dataset[split]

    # Calculate the number of samples to select based on percentage
    total_samples = len(examples)
    num_samples_to_select = int((percentage_set / 100) * total_samples)

    # Sample row indices rather than the container itself: `random.sample` needs a
    # real sequence, and indices work for any container that supports len() and
    # integer indexing.
    rng = random if seed is None else random.Random(seed)
    selected_indices = rng.sample(range(total_samples), num_samples_to_select)

    true_answers = []
    predicted_answers = []

    # Loop over the selected examples
    for index in selected_indices:
        entry = examples[index]
        question = entry['question']
        context = entry['context']  # The passage associated with the question

        # Use the first reference answer; an example without any reference
        # answer is represented by an empty string.
        answer_texts = entry['answers']['text']
        true_answer = answer_texts[0] if answer_texts else ""

        # Get the predicted answer for this question-context pair
        predicted_answer = get_predicted_answer(question, context)

        true_answers.append(true_answer)
        predicted_answers.append(predicted_answer)

    return true_answers, predicted_answers



In [None]:
# Example usage: evaluate on a random 10% of the dataset.
# Seed the global RNG first so the sampled subset, and therefore every metric
# computed from it, is the same on each run.
SEED = 42
random.seed(SEED)
true_answers, predicted_answers = evaluate_model_on_subset(test_dataset, percentage_set=10)

In [None]:
# F1 Score
from sklearn.metrics import f1_score
import numpy as np

def compute_f1(true_answers, predicted_answers):
    """
    Compute the average token-level F1 between true and predicted answers.

    Answers are split on whitespace. For each pair, precision is the number of
    shared tokens divided by the number of predicted tokens, and recall is the
    number of shared tokens divided by the number of true tokens.

    Parameters:
    - true_answers: List of reference answer strings.
    - predicted_answers: List of predicted answer strings, aligned with true_answers.

    Returns:
    - The mean F1 over all pairs as a float. A pair with no shared token
      (including pairs where either answer is empty) scores 0.0, and an empty
      list of pairs yields 0.0.
    """
    f1_scores = []
    for true, pred in zip(true_answers, predicted_answers):
        true_tokens = true.split()
        pred_tokens = pred.split()

        # Multiset intersection: a token repeated k times in both answers counts
        # k times, consistent with the list lengths used as denominators below.
        # (A set intersection would under-count repeated tokens, so an exact
        # match containing a repeated word would score below 1.0.)
        overlap = sum((Counter(true_tokens) & Counter(pred_tokens)).values())
        if overlap == 0:
            f1_scores.append(0.0)
            continue

        precision = overlap / len(pred_tokens)
        recall = overlap / len(true_tokens)
        f1_scores.append(2 * (precision * recall) / (precision + recall))

    # Avoid the NaN (and warning) that averaging an empty list would produce.
    if not f1_scores:
        return 0.0
    return float(np.mean(f1_scores))

# Example usage: score the answers collected above and print the averaged F1.
f1 = compute_f1(true_answers, predicted_answers)
print(f"F1 Score: {f1}")


In [None]:
# BLEU Score
from nltk.translate.bleu_score import sentence_bleu

def compute_bleu(true_answers, predicted_answers, smoothing_function=None):
    """
    Compute the average sentence-level BLEU between true and predicted answers.

    Each answer is split on whitespace; the true answer is the single reference
    and the predicted answer is the hypothesis.

    Parameters:
    - true_answers: List of reference answer strings.
    - predicted_answers: List of predicted answer strings, aligned with true_answers.
    - smoothing_function: Optional NLTK smoothing function passed to
      `sentence_bleu`. When None, `SmoothingFunction().method1` is used.

    Returns:
    - The mean BLEU score over all pairs as a float; 0.0 when there are no pairs.
    """
    answer_pairs = list(zip(true_answers, predicted_answers))

    # Avoid the NaN (and warning) that averaging an empty list would produce.
    if not answer_pairs:
        return 0.0

    # Short answers rarely share 3-grams or 4-grams with the reference. Without
    # smoothing, NLTK warns about the zero n-gram counts and the score collapses
    # towards zero, so smoothing is applied by default.
    # NOTE(review): BLEU remains a loose fit for spans of one to three tokens;
    # treat it as a rough signal next to F1 — confirm it is the metric you want.
    if smoothing_function is None:
        smoothing_function = SmoothingFunction().method1

    bleu_scores = []
    for true, pred in answer_pairs:
        true_tokens = true.split()  # Reference tokens
        pred_tokens = pred.split()  # Hypothesis tokens
        bleu_scores.append(
            sentence_bleu(
                [true_tokens],
                pred_tokens,
                smoothing_function=smoothing_function,
            )
        )

    return float(np.mean(bleu_scores))

# Example usage: score the answers collected above and print the averaged BLEU.
bleu = compute_bleu(true_answers, predicted_answers)
print(f"BLEU Score: {bleu}")


In [None]:
# ROUGE Score
from rouge_score import rouge_scorer

def compute_rouge(true_answers, predicted_answers):
    """
    Compute the average ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Each true answer is scored against its predicted answer with a stemming
    `RougeScorer`; the F-measure of each metric is then averaged over all pairs.

    Parameters:
    - true_answers: List of reference answer strings.
    - predicted_answers: List of predicted answer strings, aligned with true_answers.

    Returns:
    - (rouge1, rouge2, rougeL): the three mean F-measures as floats;
      (0.0, 0.0, 0.0) when there are no pairs.
    """
    answer_pairs = list(zip(true_answers, predicted_answers))

    # Avoid the NaN (and warning) that averaging an empty list would produce.
    if not answer_pairs:
        return 0.0, 0.0, 0.0

    metric_names = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # One dict of scores per (true, predicted) pair, keyed by metric name.
    rouge_scores = [scorer.score(true, pred) for true, pred in answer_pairs]

    # Average the F-measure of each metric across all pairs.
    rouge1, rouge2, rougeL = (
        float(np.mean([score[name].fmeasure for score in rouge_scores]))
        for name in metric_names
    )
    return rouge1, rouge2, rougeL

# Example usage: score the answers collected above and print the three averaged
# ROUGE values, one per line.
rouge1, rouge2, rougeL = compute_rouge(true_answers, predicted_answers)
print(f"ROUGE-1 Score: {rouge1}")
print(f"ROUGE-2 Score: {rouge2}")
print(f"ROUGE-L Score: {rougeL}")
