In [1]:
import os

import evaluate
import nltk
import numpy as np
import torch
from datasets import load_dataset, load_from_disk
from rouge_score import rouge_scorer
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer, model, and data collator.
# MODEL_NAME is the checkpoint identifier handed to from_pretrained for both
# the tokenizer and the model, so the two always come from the same checkpoint.
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# The collator is given both the tokenizer and the model; it is later passed to
# the trainer to assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Load the dataset previously saved under ./tokenizedDS.
# Later cells read `input_ids` and `labels` from its rows, so it is presumably
# already tokenized — TODO confirm against the preprocessing notebook.
tokenized = load_from_disk("./tokenizedDS")

In [4]:
# Split with a fixed seed so the partition is reproducible.
# NOTE(review): test_size=0.99 keeps only 1% of the rows for training and puts
# 99% in the "test" split — confirm this ratio is intentional.
tokenized_dataset = tokenized.train_test_split(test_size=0.99, seed=42)

In [5]:
# Download the NLTK "punkt" resource (quietly) — presumably what
# nltk.sent_tokenize in compute_metrics relies on — and load the ROUGE metric
# that compute_metrics calls via `metric.compute`.
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [6]:
def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays handed over by
            the trainer. ``preds`` may arrive wrapped in a tuple, in which case
            its first element is used.

    Returns:
        The dictionary returned by ``metric.compute`` for the decoded texts.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid vocabulary id.
    # Replace it with the pad token in BOTH arrays before decoding; predictions
    # can carry -100 too once batches of different lengths are concatenated.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [7]:
# Hyperparameters — everything a reader might want to tune lives here.
NUM_EPOCHS = 3
BATCH_SIZE = 8                # per-device training batch size
PER_DEVICE_EVAL_BATCH = 4     # per-device evaluation batch size
L_RATE = 3e-4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3            # forwarded as save_total_limit

# Trainer configuration; checkpoints and logs go to ./results-small.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results-small",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    learning_rate=L_RATE,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    predict_with_generate=True,
    push_to_hub=False,
)

In [8]:
# Wire the model, training arguments, both dataset splits, the collator and the
# ROUGE callback into a single trainer.
# NOTE(review): the warning emitted by this cell is probably the deprecation of
# `tokenizer=` in favour of `processing_class=` — confirm against the installed
# transformers version before changing it.
# NOTE(review): no evaluation schedule is set in training_args, so
# compute_metrics presumably only runs on an explicit trainer.evaluate() — verify.
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [9]:
# Run fine-tuning; the returned TrainOutput (step count, loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.3358
1000,0.0623
1500,0.0238
2000,0.0158
2500,0.01


TrainOutput(global_step=2646, training_loss=0.08512071867558602, metrics={'train_runtime': 145.2611, 'train_samples_per_second': 145.579, 'train_steps_per_second': 18.215, 'total_flos': 982756625743872.0, 'train_loss': 0.08512071867558602, 'epoch': 3.0})

In [10]:
# Show what the training run wrote to its output directory (checkpoints, logs).
# `os` is imported in the notebook's import cell.
os.listdir(training_args.output_dir)

['checkpoint-2000', 'checkpoint-2500', 'checkpoint-2646', 'runs']

In [11]:
# Load the final checkpoint of THIS run. Checkpoints are written under
# training_args.output_dir ("./results-small"); pointing at another directory
# would silently evaluate a model from a different run.
last_checkpoint = f"{training_args.output_dir}/checkpoint-2646"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

In [18]:
# Smoke test: encode one instruction, generate, and print the raw decoding
# (special tokens are left in the printed string).
inputs = tokenizer("Go look for the blue car", return_tensors="pt")
outputs = finetuned_model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> search(Blue car)</s>


In [14]:
# Smoke test: encode one instruction, generate, and print the raw decoding
# (special tokens are left in the printed string).
inputs = tokenizer("Go to the blue car", return_tensors="pt")
outputs = finetuned_model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> goto(Blue car)</s>


In [15]:
# Smoke test: encode one instruction, generate, and print the raw decoding
# (special tokens are left in the printed string).
inputs = tokenizer("Inspect the area around the blue car", return_tensors="pt")
outputs = finetuned_model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> patrol(Blue car)</s>


In [16]:
def prepare_test_subset(tokenized_dataset, subset_percentage=0.01):
    """Return a reproducible random fraction of the ``'test'`` split.

    The test split is shuffled with a fixed seed (42) and its leading
    ``int(len(test) * subset_percentage)`` rows are selected.

    Args:
        tokenized_dataset: mapping with a ``'test'`` entry that supports
            ``len()``, ``shuffle(seed=...)`` and ``select(indices)``.
        subset_percentage: fraction of the test split to keep, in ``[0, 1]``.

    Returns:
        The selected subset of the test split.

    Raises:
        ValueError: if ``subset_percentage`` is outside ``[0, 1]``.
    """
    if not 0 <= subset_percentage <= 1:
        raise ValueError(
            f"subset_percentage must be between 0 and 1, got {subset_percentage!r}"
        )

    test_data = tokenized_dataset['test']

    subset_size = int(len(test_data) * subset_percentage)
    test_subset = test_data.shuffle(seed=42).select(range(subset_size))

    print(f"Original test data: {len(test_data):,} samples")
    # Report the fraction actually requested rather than a hard-coded "1%".
    print(f"Test subset ({subset_percentage * 100:g}%): {len(test_subset):,} samples")

    return test_subset

def evaluate_model_on_subset(model, tokenizer, test_subset, num_samples=None):
    """Generate predictions for ``test_subset`` and report exact match and ROUGE-L.

    For each sample the stored ``input_ids`` are decoded back to text,
    re-tokenized (truncated to 128 tokens) and passed to ``model.generate``
    with greedy decoding. The prediction is compared with the decoded
    ``labels`` (entries equal to ``-100`` are dropped before decoding).

    Args:
        model: model exposing ``eval()``, ``device`` and ``generate``.
        tokenizer: tokenizer used for decoding and for re-encoding the input.
        test_subset: indexable collection of samples with ``input_ids`` and
            ``labels`` fields.
        num_samples: number of leading samples to evaluate; ``None`` evaluates
            all of them. Values above ``len(test_subset)`` are capped.

    Returns:
        dict with ``exact_accuracy`` (fraction of exact matches),
        ``avg_rouge`` (mean ROUGE-L F-measure) and ``examples`` (details of
        the first 10 evaluated samples).

    Raises:
        ValueError: if there is nothing to evaluate (empty subset or
            ``num_samples <= 0``); the averages would otherwise divide by zero.
    """
    if num_samples is None:
        num_samples = len(test_subset)
    else:
        num_samples = min(num_samples, len(test_subset))

    if num_samples <= 0:
        raise ValueError("evaluate_model_on_subset needs at least one sample to evaluate")

    print(f"\n=== EVALUATING ON {num_samples} SAMPLES ===")

    # Metrics
    exact_matches = 0
    rouge_scores = []

    # ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    model.eval()

    # Track some examples
    examples = []

    for i in range(num_samples):
        sample = test_subset[i]

        # Decode input
        input_text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)

        # Ground-truth target; -100 entries are ignored label positions
        target_ids = [x for x in sample['labels'] if x != -100]
        target_text = tokenizer.decode(target_ids, skip_special_tokens=True)

        # Make inference
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=128,
            truncation=True
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=48,
                do_sample=False,  # Deterministic
                num_beams=1,      # Fast generation
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode prediction
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Calculate metrics
        exact_match = predicted_text.strip() == target_text.strip()
        if exact_match:
            exact_matches += 1

        # ROUGE-L score (longest common subsequence)
        rouge_score = scorer.score(target_text, predicted_text)
        rouge_scores.append(rouge_score['rougeL'].fmeasure)

        # Store examples for display
        if i < 10:  # Show first 10 examples
            examples.append({
                'input': input_text,
                'target': target_text,
                'predicted': predicted_text,
                'exact_match': exact_match,
                'rouge_l': rouge_score['rougeL'].fmeasure
            })

    # Calculate final metrics
    exact_accuracy = exact_matches / num_samples
    avg_rouge = sum(rouge_scores) / len(rouge_scores)

    # Display results
    print(f"\n=== EVALUATION RESULTS ===")
    print(f"Exact Match Accuracy: {exact_accuracy:.3f} ({exact_matches}/{num_samples})")
    print(f"Average ROUGE-L: {avg_rouge:.3f}")

    print(f"\n=== SAMPLE PREDICTIONS ===")
    for i, ex in enumerate(examples):
        print(f"\nExample {i+1}:")
        print(f"  Input: {ex['input']}")
        print(f"  Target: {ex['target']}")
        print(f"  Predicted: {ex['predicted']}")
        print(f"  Exact Match: {'✅' if ex['exact_match'] else '❌'}")
        print(f"  ROUGE-L: {ex['rouge_l']:.3f}")

    return {
        'exact_accuracy': exact_accuracy,
        'avg_rouge': avg_rouge,
        'examples': examples
    }

In [17]:
# Rebuild the deterministic split (same test_size and seed as the training split).
tokenized_split = tokenized.train_test_split(test_size=0.99, seed=42)
test_data = tokenized_split['test']

# prepare_test_subset already shuffles the test split with seed=42, so its
# leading rows are the same samples a second shuffle of test_data would give.
# Reuse that subset and cap it instead of discarding it and shuffling again.
MAX_EVAL_SAMPLES = 1500
test_subset = prepare_test_subset(tokenized_split, subset_percentage=0.01)
test_subset = test_subset.select(range(min(MAX_EVAL_SAMPLES, len(test_subset))))
results = evaluate_model_on_subset(finetuned_model, tokenizer, test_subset)

Original test data: 697,924 samples
Test subset (1%): 6,979 samples

=== EVALUATING ON 1500 SAMPLES ===

=== EVALUATION RESULTS ===
Exact Match Accuracy: 1.000 (1500/1500)
Average ROUGE-L: 1.000

=== SAMPLE PREDICTIONS ===

Example 1:
  Input: Maintain vigilance in the vicinity of Human eye
  Target: patrol(Human eye)
  Predicted: patrol(Human eye)
  Exact Match: ✅
  ROUGE-L: 1.000

Example 2:
  Input: perform security rounds around Chainsaw
  Target: patrol(Chainsaw)
  Predicted: patrol(Chainsaw)
  Exact Match: ✅
  ROUGE-L: 1.000

Example 3:
  Input: dig up Whisk
  Target: search(Whisk)
  Predicted: search(Whisk)
  Exact Match: ✅
  ROUGE-L: 1.000

Example 4:
  Input: Provide comprehensive coverage around Alpaca
  Target: patrol(Alpaca)
  Predicted: patrol(Alpaca)
  Exact Match: ✅
  ROUGE-L: 1.000

Example 5:
  Input: yo, find Office supplies
  Target: search(Office supplies)
  Predicted: search(Office supplies)
  Exact Match: ✅
  ROUGE-L: 1.000

Example 6:
  Input: Check all quadrants