In [1]:
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from torch.optim.adamw import AdamW
import random
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import os
from datasets import Dataset as HFDataset

from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [2]:
!git clone https://github.com/brendenlake/SCAN.git

fatal: destination path 'SCAN' already exists and is not an empty directory.


In [3]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenizer.model_max_length = 1024

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
def load_scan_to_dataset(file_path):
    data = {"input": [], "target": []}
    with open(file_path, 'r') as f:
        for line in f:
            if "IN:" in line:
                parts = line.strip().split(" OUT: ")
                # Format for T5 text-to-text task
                data["input"].append(f"Translate SCAN: {parts[0].replace('IN: ', '')}")
                data["target"].append(parts[1])
    return Dataset.from_dict(data)

def tokenize_function(examples):
    model_inputs = tokenizer(examples["input"], truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["target"], truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [5]:
def predict_autoregressively(command, max_len=50):
    model.eval()
    input_text = f"Translate SCAN: {command}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(DEVICE)

    # Decoder starts with the pad_token_id for T5
    decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]], device=DEVICE)

    generated_tokens = []
    for _ in range(max_len):
        with torch.no_grad():
            outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
            next_token_logits = outputs.logits[:, -1, :]
            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)

            if next_token_id.item() == tokenizer.eos_token_id:
                break

            generated_tokens.append(next_token_id.item())
            # Append predicted token back to decoder input for next step
            decoder_input_ids = torch.cat([decoder_input_ids, next_token_id], dim=-1)

    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [6]:
sequence_accs = []
token_accs = []

In [7]:
splits = [1, 2, 4, 8, 16, 32, 64]

for split in splits:
  model = T5ForConditionalGeneration.from_pretrained("t5-small").to(DEVICE)

  train_data = {"input": [], "target": []}
  train_path = f"/content/SCAN/simple_split/size_variations/tasks_train_simple_p{split}.txt"
  with open(train_path, 'r') as f:
    for line in f:
      parts = line.strip().split(" OUT: ")
      train_data["input"].append(f"Translate SCAN: {parts[0].replace('IN: ', '')}")
      train_data["target"].append(parts[1])

  train_ds = HFDataset.from_dict(train_data).map(
      lambda x: tokenizer(x["input"], truncation=True, padding="max_length"), batched=True
  ).map(
      lambda x: {"labels": tokenizer(text_target=x["target"], truncation=True, padding="max_length")["input_ids"]}, batched=True
  )

  trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(output_dir=f"./{split}", num_train_epochs= round(10000 / len(train_ds)), report_to="none"),
    train_dataset=train_ds,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
    )
  trainer.train()

  test_path = f"/content/SCAN/simple_split/size_variations/tasks_test_simple_p{split}.txt"
  sequence_correct, token_acc_sum, count = 0, 0, 0
  with open(test_path, 'r') as f:
    for line in f:
      cmd, target = line.strip().split(" OUT: ")
      pred = predict_autoregressively(model, cmd.replace("IN: ", ""))
      if pred.strip() == target.strip():
        sequence_correct += 1
      count += 1

      p_toks, t_toks = pred.split(), target.split()
      token_acc_sum += (sum(1 for p, t in zip(p_toks, t_toks) if p == t) / len(t_toks))

  sequence_acc = sequence_correct / count
  sequence_accs.append(sequence_acc)

  token_acc = token_acc_sum / count
  token_accs.append(token_acc)
  print(f"split: {split}, sequence_acc: {sequence_acc}, token_acc: {token_acc}")


Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 136.12 MiB is free. Process 135082 has 14.61 GiB memory in use. Of the allocated memory 14.45 GiB is allocated by PyTorch, and 37.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)