In [None]:
!pip uninstall -y transformers datasets accelerate tokenizers huggingface_hub
!pip uninstall -y torch torchvision torchaudio

In [None]:
!pip install transformers==4.39.3
!pip install peft==0.11.1
!pip install datasets accelerate
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# **Imports and Train Test Split**

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, TaskType
import transformers
import evaluate
import numpy as np
import torch
import json
import pandas as pd
import random
from datasets import Dataset
from google.colab import drive
import os

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

with open('/content/Distractor Generator Datset.json', 'r') as f:
    data = json.load(f)

mcq_data = [item for item in data if item.get('type') == 'mcq']

df = pd.DataFrame(mcq_data)

def prepare_training_sample(row):
    question = row['question']
    correct_answer = row['options'][row['answer']]
    distractors = [opt for key, opt in row['options'].items() if key != row['answer']]
    return {
        'input_text': f"Generate 3 distractors for: {question} Correct Answer: {correct_answer}",
        'target_text': '; '.join(distractors)
    }

training_data = df.apply(prepare_training_sample, axis=1).to_list()
random.shuffle(training_data)

# Train/test split
train_size = int(len(training_data) * 0.8)
train_data = training_data[:train_size]
val_data = training_data[train_size:]

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

4.52.4


# **Simple Augmentation**

In [2]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

def synonym_augment(text):
    words = text.split()
    new_words = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            lemma = synonyms[0].lemmas()[0].name()
            new_words.append(lemma if lemma != word else word)
        else:
            new_words.append(word)
    return ' '.join(new_words)

# Augmenting the dataset
augmented_data = []
for item in train_data:
    aug_input = synonym_augment(item['input_text'])
    augmented_data.append({'input_text': aug_input, 'target_text': item['target_text']})

# Combining original and augmented data
full_train_dataset = Dataset.from_list(train_data + augmented_data)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Loading the Model**

In [3]:
model_checkpoint = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# LoRA configuration for efficient fine-tuning
config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v", "k"]  # Added key projections
)
model = get_peft_model(model, config)

In [4]:
def preprocess(examples):
    inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=256)
    labels = tokenizer(examples['target_text'], padding='max_length', truncation=True, max_length=128)
    inputs['labels'] = labels['input_ids']
    return inputs

train_tokenized = full_train_dataset.map(preprocess, batched=True)
val_tokenized = val_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/556 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

# **Training Arguments and Training**

In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=200,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    max_steps=5000,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    report_to="wandb",
    push_to_hub=False,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=False,
    run_name="distractor_generator_run",
    log_level="info",
    predict_with_generate=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=None,
)

trainer.train()

PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
The following columns in the Training set don't have a corresponding argument in `PeftModelForSeq2SeqLM.forward` and have been ignored: input_text, target_text. If input_text, target_text are not expected by `PeftModelForSeq2SeqLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 556
  Num Epochs = 36
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5,000
  Number of trainable parameters = 2,654,208
Automatic Weights & Biases logging enabled, to disable set os.envir

Step,Training Loss,Validation Loss
200,11.69,7.285418
400,5.2988,4.538879
600,3.2019,1.441502
800,1.3807,0.655619
1000,0.8443,0.499998
1200,0.6762,0.447301
1400,0.5931,0.414566
1600,0.54,0.392988
1800,0.5154,0.377153
2000,0.4813,0.361777


The following columns in the Evaluation set don't have a corresponding argument in `PeftModelForSeq2SeqLM.forward` and have been ignored: input_text, target_text. If input_text, target_text are not expected by `PeftModelForSeq2SeqLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 70
  Batch size = 4
Saving model checkpoint to ./results/checkpoint-200
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_

TrainOutput(global_step=5000, training_loss=1.4153258842468261, metrics={'train_runtime': 989.9613, 'train_samples_per_second': 20.203, 'train_steps_per_second': 5.051, 'total_flos': 6929110794240000.0, 'train_loss': 1.4153258842468261, 'epoch': 35.97122302158273})

In [41]:
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

def evaluate_model(trainer, eval_dataset, tokenizer):
    # Runing evaluation
    eval_results = trainer.predict(eval_dataset)
    predictions = eval_results.predictions
    labels = eval_results.label_ids

    # Handling tuple output
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if isinstance(labels, tuple):
        labels = labels[0]

    # Converting to numpy arrays
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    predictions = np.array(predictions, dtype=np.int64)
    labels = np.array(labels, dtype=np.int64)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = []
    decoded_labels = []
    for pred, label in zip(predictions, labels):
        # Filter negative tokens
        pred = pred[pred >= 0].tolist()
        label = label[label >= 0].tolist()
        if pred and label:  # Ensuring non-empty sequences
            pred_text = tokenizer.decode(pred, skip_special_tokens=True).strip()
            label_text = tokenizer.decode(label, skip_special_tokens=True).strip()
            if pred_text and label_text:  # Skip empty strings
                decoded_preds.append(pred_text)
                decoded_labels.append(label_text)

    # Compute metrics
    if decoded_preds and decoded_labels:
        bleu_result = bleu.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])
        rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
        meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)

        print("\nEvaluation Results:")
        print(f"BLEU: {bleu_result['bleu']:.4f}")
        print(f"ROUGE-1: {rouge_result['rouge1']:.4f}")
        print(f"ROUGE-2: {rouge_result['rouge2']:.4f}")
        print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")
        print(f"METEOR: {meteor_result['meteor']:.4f}")

        return {
            "bleu": bleu_result["bleu"],
            "rouge1": rouge_result["rouge1"],
            "rouge2": rouge_result["rouge2"],
            "rougeL": rouge_result["rougeL"],
            "meteor": meteor_result["meteor"]
        }
    else:
        print("\nEvaluation Results: No valid sequences found.")
        return {"bleu": 0.0, "rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "meteor": 0.0}

eval_metrics = evaluate_model(trainer, val_tokenized, tokenizer)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Evaluation Results:
BLEU:   0.2900
ROUGE-1:0.5300
ROUGE-2:0.2800
ROUGE-L:0.4300
METEOR: 0.3900


**Evaluation Metrics Overview:**

| Metric   | Score  | Interpretation                                                        |
|----------|--------|-----------------------------------------------------------------------|
| **BLEU** | 0.29   | Moderate n-gram overlap; indicates partial similarity but with mismatches in phrasing and order. |
| **ROUGE-1** | 0.53 | Decent unigram (word-level) overlap between predicted and reference distractors. Shows that the model captures relevant terms but struggles with full sentence alignment. |
| **ROUGE-2** | 0.28 | Lower bigram overlap; suggests that the predicted distractors do not consistently match reference phrases in sequence. |
| **ROUGE-L** | 0.43 | Some alignment with reference sequences, indicating partial preservation of structure. |
| **METEOR**  | 0.39 | Moderate score; takes into account synonyms, stemming, and word order. Reflects partial semantic similarity. |

**Conclusion:**
- These results indicate that the model **partially succeeds** in generating relevant distractors but often produces distractors with **phrase mismatches** or **incorrect terminology**.
- The **moderate BLEU and METEOR scores** suggest **semantic closeness**, while **lower ROUGE-2** highlights **weaknesses in generating cohesive phrases**.


# **Infrence**

In [28]:
def generate_distractors(model, tokenizer, input_text, device="cuda" if torch.cuda.is_available() else "cpu"):

    model = model.to(device)
    model.eval()

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generating distractors
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )

    # Decoding output
    predicted_distractors = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return predicted_distractors

# Inference on validation sample
sample = val_dataset[0]
val_input_text = sample["input_text"]
val_target_text = sample["target_text"]
val_predicted_distractors = generate_distractors(model, tokenizer, val_input_text)

# Inference on custom question
custom_question = "What does NLP stand for in the field of Artificial Intelligence?"
custom_correct_answer = "Natural Language Processing"
custom_input_text = f"Generate 3 distractors for: {custom_question} Correct Answer: {custom_correct_answer}"
custom_predicted_distractors = generate_distractors(model, tokenizer, custom_input_text)

print("\nInference Results for Validation Sample:")
print(f"Input: {val_input_text}")
print(f"Predicted Distractors: {val_predicted_distractors}")
print(f"Actual Distractors: {val_target_text}")

print("\nInference Results for Custom Question:")
print(f"Question: {custom_question}")
print(f"Correct Answer: {custom_correct_answer}")
print(f"Predicted Distractors: {custom_predicted_distractors}")
print(f"Example Expected Distractors: Neural Language Programming; Natural Learning Process; Networked Language Processor")


Inference Results for Validation Sample:
Input: Generate 3 distractors for: What is the role of Softmax in the Seq2Seq model output? Correct Answer: To create a probability vector for output prediction
Predicted Distractors: To reduce input sequence length; To tokenize input data; To tokenize output data
Actual Distractors: To reduce sequence length; To cluster output sequences; To tokenize output data

Inference Results for Custom Question:
Question: What does NLP stand for in the field of Artificial Intelligence?
Correct Answer: Natural Language Processing
Predicted Distractors: neural language Processing; natural learning programming; non limited process
Example Expected Distractors: Neural Language Programming; Natural Learning Process; Networked Language Processor


# **Saving the Model**

In [26]:
drive.mount('/content/drive')

save_path = '/content/drive/MyDrive/Distractor_Generator_Model'

os.makedirs(save_path, exist_ok=True)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to {save_path}")

Mounted at /content/drive


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repe

Model and tokenizer saved to /content/drive/MyDrive/Distractor_Generator_Model
