## Installation

In [11]:
# Install required packages
!pip install transformers datasets peft wandb evaluate rouge_score nltk



## Imports

In [12]:
import os
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel,
    PeftConfig
)
import wandb
import evaluate
import nltk
from nltk.tokenize import sent_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Dataset

In [14]:
# Load the dataset
dataset = load_dataset("Short-Answer-Feedback/saf_communication_networks_english")

# Examine a sample
print(dataset['train'][0])

# Get basic statistics
print(f"Train set size: {len(dataset['train'])}")
print(f"Validation set size: {len(dataset['validation'])}")
print(f"Test set size: {len(dataset['test_unseen_answers'])}")

{'id': '6a31b925382d4e31a417cc78399dbff2', 'question': 'What is "frame bursting"? Also, give 1 advantage and disadvantage compared to the carrier extension.', 'reference_answer': 'Frame bursting reduces the overhead for transmitting small frames by concatenating a sequence of multiple frames in one single transmission, without ever releasing control of the channel.\nAdvantage :it is more efficient than carrier extension as single frames not filled up with garbage.\nDisadvantage :need frames waiting for transmission or buffering and delay of frames', 'provided_answer': 'Frame bursting is a feature for the IEEE 802.3z standard.\nAdvantage: better efficiency\nDisadvantage: station has to wait for enough data to send so frames need to wait (n-to-n delay)', 'answer_feedback': 'The response correctly answers the advantage and disadvantage part of the question. However, the definition is missing in the answer. The correct definition is that frame bursting is used to concatenate a sequence of 

### Pre-processing

In [15]:
# Define the preprocessing function
def preprocess_function(examples):
    # Create inputs that include question, provided answer, and score
    inputs = [
        f"Question: {q}\nStudent Answer: {a}\nScore: {s}"
        for q, a, s in zip(examples["question"], examples["provided_answer"], examples["score"])
    ]

    # Get feedback as targets
    targets = examples["answer_feedback"]

    # Tokenize the inputs
    model_inputs = tokenizer(
        inputs,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Tokenize the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply preprocessing to the datasets
tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["validation"].map(preprocess_function, batched=True, remove_columns=dataset["validation"].column_names)
tokenized_test = dataset["test_unseen_answers"].map(preprocess_function, batched=True, remove_columns=dataset["test_unseen_answers"].column_names)

# Set the format for PyTorch
tokenized_train.set_format("torch")
tokenized_val.set_format("torch")
tokenized_test.set_format("torch")

Map:   0%|          | 0/1700 [00:00<?, ? examples/s]



Map:   0%|          | 0/427 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

## Evaluation metric

In [16]:
# Load the ROUGE and BLEU metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predicted and reference texts
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # Calculate ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculate BLEU score
    result["bleu"] = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])["bleu"]

    # Add mean generated length
    prediction_lens = [len(pred.split()) for pred in decoded_preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Model

In [17]:
# Load the model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Verify the model is on the expected device
print(f"Model is on device: {next(model.parameters()).device}")

Model is on device: cuda:0


In [None]:
# Initialize W&B
wandb.login(key="")
wandb.init()

# Define LoRA configuration for BART
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"]  # Target attention layers in BART
)

# Get the PEFT model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()



0,1
eval/bleu,▁█
eval/gen_len,█▁
eval/loss,█▁
eval/rouge1,▁█
eval/rouge2,▁█
eval/rougeL,▁█
eval/rougeLsum,▁█
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,▁▁

0,1
eval/bleu,0.0593
eval/gen_len,52.7822
eval/loss,0.95482
eval/rouge1,0.2567
eval/rouge2,0.1195
eval/rougeL,0.2055
eval/rougeLsum,0.2282
eval/runtime,237.572
eval/samples_per_second,1.797
eval/steps_per_second,0.45


trainable params: 2,359,296 || all params: 408,649,728 || trainable%: 0.5773


## PEFT Fine-tuning using Lora

In [19]:
# Define data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results/bart-student-feedback",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    push_to_hub=False,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    logging_first_step=True,
    report_to="wandb",
)

# Create Seq2SeqTrainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Stop W&B
wandb.finish()

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Gen Len
1,1.4338,1.051032,0.2322,0.0822,0.1789,0.2013,0.0348,51.5222
2,1.2683,0.987097,0.2428,0.1038,0.1945,0.2146,0.0468,53.2787
3,1.1963,0.962763,0.2589,0.1201,0.2085,0.2298,0.0596,52.6956
4,1.1618,0.946353,0.2633,0.1283,0.2143,0.2335,0.0659,52.2881
5,1.1515,0.937312,0.2691,0.1367,0.2201,0.2399,0.0747,52.8384
6,1.1427,0.929586,0.2718,0.1418,0.2261,0.2458,0.0787,51.9133
7,1.1358,0.926046,0.2731,0.1441,0.2261,0.2463,0.0812,51.9157
8,1.1367,0.922754,0.2758,0.1498,0.2277,0.2503,0.0859,52.8618
9,1.138,0.921732,0.2749,0.1489,0.2272,0.2486,0.0842,53.281
10,1.1167,0.920515,0.2759,0.1481,0.2276,0.2486,0.0846,52.9555


0,1
eval/bleu,▁▃▄▅▆▇▇███
eval/gen_len,▁█▆▄▆▃▃▆█▇
eval/loss,█▅▃▂▂▁▁▁▁▁
eval/rouge1,▁▃▅▆▇▇████
eval/rouge2,▁▃▅▆▇▇▇███
eval/rougeL,▁▃▅▆▇█████
eval/rougeLsum,▁▃▅▆▇▇▇███
eval/runtime,▁▅▅▄█▂▁▃▆▆
eval/samples_per_second,▇▄▄▅▁▇█▆▃▃
eval/steps_per_second,▇▄▄▅▁▇█▆▃▃

0,1
eval/bleu,0.0846
eval/gen_len,52.9555
eval/loss,0.92052
eval/rouge1,0.2759
eval/rouge2,0.1481
eval/rougeL,0.2276
eval/rougeLsum,0.2486
eval/runtime,143.9361
eval/samples_per_second,2.967
eval/steps_per_second,0.375


## Saving model adapters

In [26]:
# Save the model and adapters
model_save_path = "./results/bart-student-feedback/final"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./results/bart-student-feedback/final/tokenizer_config.json',
 './results/bart-student-feedback/final/special_tokens_map.json',
 './results/bart-student-feedback/final/vocab.json',
 './results/bart-student-feedback/final/merges.txt',
 './results/bart-student-feedback/final/added_tokens.json',
 './results/bart-student-feedback/final/tokenizer.json')

## Evaluation

In [28]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Path to your saved model
model_save_path = "./results/bart-student-feedback/final"

# Load the PEFT configuration
config = PeftConfig.from_pretrained(model_save_path)

# Load the base model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the PEFT adapter weights
model = PeftModel.from_pretrained(model, model_save_path)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Custom test question and answer
custom_question = "What is the difference between TCP and UDP protocols?"
custom_answer = "TCP is connection-oriented and guarantees delivery of packets, while UDP is connectionless and doesn't guarantee delivery."
custom_score = 0.8  # Assuming a score between 0 and 1

# Generate feedback
def generate_feedback(question, answer, score):
    input_text = f"Question: {question}\nStudent Answer: {answer}\nScore: {score}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move inputs to the same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate feedback
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

    # Decode the generated feedback
    generated_feedback = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_feedback

# Generate and print feedback
feedback = generate_feedback(custom_question, custom_answer, custom_score)

print("Custom Question:")
print(custom_question)
print("\nStudent Answer:")
print(custom_answer)
print("\nScore:")
print(custom_score)
print("\nGenerated Feedback:")
print(feedback)

Custom Question:
What is the difference between TCP and UDP protocols?

Student Answer:
TCP is connection-oriented and guarantees delivery of packets, while UDP is connectionless and doesn't guarantee delivery.

Score:
0.8

Generated Feedback:
The response is partially correct as it states the difference between TCP and UDP but does not explain why TCP is connection-oriented and UDP is connectionless. However, the explanation of the difference is not correct as TCP is not a connectionless protocol, it is just a TCP-based protocol.


In [29]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("Short-Answer-Feedback/saf_communication_networks_english")

# Path to your saved model
model_save_path = "./results/bart-student-feedback/final"

# Load the PEFT configuration
config = PeftConfig.from_pretrained(model_save_path)

# Load the base model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the PEFT adapter weights
model = PeftModel.from_pretrained(model, model_save_path)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Choose an example from the test set
test_example = dataset["test_unseen_answers"][0]  # Using the first example
question = test_example["question"]
answer = test_example["provided_answer"]
score = test_example["score"]
true_feedback = test_example["answer_feedback"]

# Generate feedback
def generate_feedback(question, answer, score):
    input_text = f"Question: {question}\nStudent Answer: {answer}\nScore: {score}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move inputs to the same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate feedback
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

    # Decode the generated feedback
    generated_feedback = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_feedback

# Generate and print feedback
feedback = generate_feedback(question, answer, score)

print("Test Question:")
print(question)
print("\nStudent Answer:")
print(answer)
print("\nScore:")
print(score)
print("\nTrue Feedback:")
print(true_feedback)
print("\nGenerated Feedback:")
print(feedback)

Test Question:
State at least 4 of the differences shown in the lecture between the UDP and TCP headers.

Student Answer:
In TCP there is a Sequence Number field to identify packets individually for reliability. There is no Sequence Number in UDP. The UDP header does not have an options field, while the TCP header does. In TCP there is an Advertised Window field for the Sliding Window Protocol for Flow Control. There is no Flow Control and therefore no Advertised Window field in UDP. In TCP there there is only a Data Offset field that specifies the header length. In UDP the whole Packet Length is transmitted.

Score:
1.0

True Feedback:
The response correctly identifies four differences between TCP and UDP headers.

Generated Feedback:
The response correctly states four differences between TCP and UDP headers. The response also correctly identifies the Advertised Window field in TCP and the Data Offset field in UDP. Apart from that, the response is correct as it correctly identifies al

In [30]:
import torch
import random
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("Short-Answer-Feedback/saf_communication_networks_english")

# Path to your saved model
model_save_path = "./results/bart-student-feedback/final"

# Load the PEFT configuration
config = PeftConfig.from_pretrained(model_save_path)

# Load the base model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the PEFT adapter weights
model = PeftModel.from_pretrained(model, model_save_path)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Generate feedback
def generate_feedback(question, answer, score):
    input_text = f"Question: {question}\nStudent Answer: {answer}\nScore: {score}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move inputs to the same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate feedback
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

    # Decode the generated feedback
    generated_feedback = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_feedback

# Get 5 random examples from the test set
test_set_size = len(dataset["test_unseen_answers"])
random_indices = random.sample(range(test_set_size), 5)

# Generate and print feedback for each random example
for i, idx in enumerate(random_indices):
    test_example = dataset["test_unseen_answers"][idx]
    question = test_example["question"]
    answer = test_example["provided_answer"]
    score = test_example["score"]
    true_feedback = test_example["answer_feedback"]

    # Generate feedback
    feedback = generate_feedback(question, answer, score)

    print(f"\n--- Example {i+1} (index {idx}) ---")
    print("Question:")
    print(question)
    print("\nStudent Answer:")
    print(answer)
    print("\nScore:")
    print(score)
    print("\nTrue Feedback:")
    print(true_feedback)
    print("\nGenerated Feedback:")
    print(feedback)
    print("-" * 80)


--- Example 1 (index 327) ---
Question:
Name the 3 service classes the Data Link Layer offers and explain the differences between the classes.

Student Answer:
- Unconfirmed Connectionless Service 
The Unconfirmed Connectionless Service sends data to the receiver, without announcing it (building up a connection) first in data frames without any flow control. Because of the missing connection and flow control, it is possible that complete data frames can get lost.  
- Confirmed Connectionless Service 
Wheras the confirmed connectionless service sends the data frames and waits for an acknowledgement of the corresponding recipient. If the recipient confirms the data frame, the next data frame is being sent. If the recipient doesn’t answer for a long time, the data frame is being resent. If for some reason, the ackknowledgement gets lost, the recipient will eventually get a data frame twice, and will not be able to detect the duplication. The correction has to be made on a higher level. I

In [33]:
import torch
import random
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("Short-Answer-Feedback/saf_communication_networks_english")

# Path to your saved model
model_save_path = "./results/bart-student-feedback/final"

# Load the PEFT configuration
config = PeftConfig.from_pretrained(model_save_path)

# Load the base model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the PEFT adapter weights
model = PeftModel.from_pretrained(model, model_save_path)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Generate feedback
def generate_feedback(question, answer, score):
    input_text = f"Question: {question}\nStudent Answer: {answer}\nScore: {score}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move inputs to the same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate feedback
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

    # Decode the generated feedback
    generated_feedback = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_feedback

# Get 5 random examples from the test set
test_set_size = len(dataset["test_unseen_answers"])
random_indices = random.sample(range(test_set_size), 5)

# Generate and print feedback for each random example
for i, idx in enumerate(random_indices):
    test_example = dataset["test_unseen_answers"][idx]
    question = test_example["question"]
    answer = test_example["provided_answer"]
    score = test_example["score"]
    true_feedback = test_example["answer_feedback"]

    # Generate feedback
    feedback = generate_feedback(question, answer, score)

    print(f"\n--- Example {i+1} (index {idx}) ---")
    print("Question:")
    print(question)
    print("\nStudent Answer:")
    print(answer)
    print("\nScore:")
    print(score)
    print("\nTrue Feedback:")
    print(true_feedback)
    print("\nGenerated Feedback:")
    print(feedback)
    print("-" * 80)


--- Example 1 (index 114) ---
Question:
Consider a single server queueing system with a buffer of size 10. Let us assume that 9 packets arrive per second and 10 packets are served per second on an average. Assume you monitor the system for exactly one minute after the system reaches equilibrium. How many seconds would you expect the system to be in a state in which there are less than 10 packets waiting in the queue? You need to justify your answer by showing steps involved; calculations, however, need not be included. headers.

Student Answer:
Since the system reached an equilibrium the probabilities do not change anymore ( dp_n(t)/dt = 0 ). So it can be assumed that the queue is emptied by one package each second on average (10 served - 9 arrived).

Now assuming that the queue is full at obvservation start, it will be empty after 10 seconds, from which 9 seconds the queue has less packets then 10.

Score:
0.0

True Feedback:
The response is incorrect because the stated number of exp

In [32]:
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from datasets import load_dataset
import evaluate
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import nltk

# Download necessary NLTK data
nltk.download('punkt', quiet=True)

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Load the dataset
dataset = load_dataset("Short-Answer-Feedback/saf_communication_networks_english")
test_dataset = dataset["test_unseen_answers"]

# Path to your saved model
model_save_path = "./results/bart-student-feedback/final"

# Load the PEFT configuration
config = PeftConfig.from_pretrained(model_save_path)

# Load the base model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the PEFT adapter weights
model = PeftModel.from_pretrained(model, model_save_path)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)
model.eval()

# Process examples in batches
def process_batch(batch_examples, batch_size=8):
    questions = [example["question"] for example in batch_examples]
    answers = [example["provided_answer"] for example in batch_examples]
    scores = [example["score"] for example in batch_examples]
    true_feedbacks = [example["answer_feedback"] for example in batch_examples]

    # Generate inputs for all examples in the batch
    batch_inputs = []
    for q, a, s in zip(questions, answers, scores):
        input_text = f"Question: {q}\nStudent Answer: {a}\nScore: {s}"
        batch_inputs.append(input_text)

    # Tokenize all inputs
    tokenized_inputs = tokenizer(
        batch_inputs,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)

    # Generate all outputs
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=tokenized_inputs.input_ids,
            attention_mask=tokenized_inputs.attention_mask,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

    # Decode all outputs
    generated_feedbacks = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    return generated_feedbacks, true_feedbacks

# Generate predictions for the entire test set in batches
BATCH_SIZE = 8
all_predictions = []
all_references = []

# Calculate number of batches
num_examples = len(test_dataset)
num_batches = (num_examples + BATCH_SIZE - 1) // BATCH_SIZE  # Ceiling division

print(f"Generating predictions for {num_examples} examples in {num_batches} batches...")

for i in tqdm(range(0, num_examples, BATCH_SIZE)):
    # Get batch of examples
    batch_end = min(i + BATCH_SIZE, num_examples)
    batch_examples = [test_dataset[j] for j in range(i, batch_end)]

    # Process batch
    batch_predictions, batch_references = process_batch(batch_examples, BATCH_SIZE)

    # Add to overall lists
    all_predictions.extend(batch_predictions)
    all_references.extend(batch_references)

# Format for ROUGE - expects a newline after each sentence
formatted_predictions = ["\n".join(sent_tokenize(pred.strip())) for pred in all_predictions]
formatted_references = ["\n".join(sent_tokenize(ref.strip())) for ref in all_references]

# Calculate ROUGE scores
rouge_results = rouge.compute(
    predictions=formatted_predictions,
    references=formatted_references,
    use_stemmer=True
)

# Calculate BLEU score
bleu_results = bleu.compute(
    predictions=all_predictions,
    references=[[ref] for ref in all_references]
)

# Print results
print("\nEvaluation Results on Test Set:")
print(f"ROUGE-1: {rouge_results['rouge1'] * 100:.2f}%")
print(f"ROUGE-2: {rouge_results['rouge2'] * 100:.2f}%")
print(f"ROUGE-L: {rouge_results['rougeL'] * 100:.2f}%")
print(f"BLEU: {bleu_results['bleu'] * 100:.2f}%")

# Add additional statistics
prediction_lens = [len(pred.split()) for pred in all_predictions]
reference_lens = [len(ref.split()) for ref in all_references]

print("\nAdditional Statistics:")
print(f"Average prediction length: {np.mean(prediction_lens):.2f} words")
print(f"Average reference length: {np.mean(reference_lens):.2f} words")
print(f"Number of test examples: {len(all_predictions)}")

Using device: cuda
Generating predictions for 375 examples in 47 batches...


100%|██████████| 47/47 [02:02<00:00,  2.61s/it]



Evaluation Results on Test Set:
ROUGE-1: 28.14%
ROUGE-2: 15.81%
ROUGE-L: 23.54%
BLEU: 9.02%

Additional Statistics:
Average prediction length: 53.88 words
Average reference length: 17.16 words
Number of test examples: 375
