In [None]:
!jupyter nbextension enable --py widgetsnbextension


usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset

In [2]:
dataset = load_dataset("wikisql", trust_remote_code=True)
print(dataset['train'][0])  


tokenizer = T5Tokenizer.from_pretrained('t5-small')

def preprocess_function(examples):
    inputs = examples['question']
    targets = [sql['human_readable'] for sql in examples['sql']]  
    
    
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="longest", return_tensors='pt')

    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="longest", return_tensors='pt')

    return {
        'input_ids': model_inputs['input_ids'].squeeze(0), 
        'attention_mask': model_inputs['attention_mask'].squeeze(0),
        'labels': labels['input_ids'].squeeze(0)
    }



train_data = dataset['train'].map(preprocess_function, batched=True)
validation_data = dataset['validation'].map(preprocess_function, batched=True)

print(train_data[0])  


{'phase': 1, 'question': 'Tell me what the notes are for South Australia ', 'table': {'header': ['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes'], 'page_title': '', 'page_id': '', 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'id': '1-1000181-1', 'section_title': '', 'caption': '', 'rows': [['Australian Capital Territory', 'blue/white', 'Yaa·nna', 'ACT · CELEBRATION OF A CENTURY 2013', 'YIL·00A', 'Slogan screenprinted on plate'], ['New South Wales', 'black/yellow', 'aa·nn·aa', 'NEW SOUTH WALES', 'BX·99·HI', 'No slogan on current series'], ['New South Wales', 'black/white', 'aaa·nna', 'NSW', 'CPX·12A', 'Optional white slimline series'], ['Northern Territory', 'ochre/white', 'Ca·nn·aa', 'NT · OUTBACK AUSTRALIA', 'CB·06·ZZ', 'New series began in June 2011'], ['Queensland', 'maroon/white', 'nnn·aaa', 'QUEENSLAND · SUNSHINE STATE', '999·TLG', 'Slogan embossed on plate'], ['South Australia', 'black/white', 'Snnn·aaa', 'SOUTH AUS

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


{'phase': 1, 'question': 'Tell me what the notes are for South Australia ', 'table': {'header': ['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes'], 'page_title': '', 'page_id': '', 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'id': '1-1000181-1', 'section_title': '', 'caption': '', 'rows': [['Australian Capital Territory', 'blue/white', 'Yaa·nna', 'ACT · CELEBRATION OF A CENTURY 2013', 'YIL·00A', 'Slogan screenprinted on plate'], ['New South Wales', 'black/yellow', 'aa·nn·aa', 'NEW SOUTH WALES', 'BX·99·HI', 'No slogan on current series'], ['New South Wales', 'black/white', 'aaa·nna', 'NSW', 'CPX·12A', 'Optional white slimline series'], ['Northern Territory', 'ochre/white', 'Ca·nn·aa', 'NT · OUTBACK AUSTRALIA', 'CB·06·ZZ', 'New series began in June 2011'], ['Queensland', 'maroon/white', 'nnn·aaa', 'QUEENSLAND · SUNSHINE STATE', '999·TLG', 'Slogan embossed on plate'], ['South Australia', 'black/white', 'Snnn·aaa', 'SOUTH AUS

In [4]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import GradScaler, autocast

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Fix padding issue in collate_fn
def collate_fn(batch):
    return {
        'input_ids': pad_sequence(
            [torch.tensor(d['input_ids']) for d in batch], 
            batch_first=True, 
            padding_value=tokenizer.pad_token_id
        ),
        'attention_mask': pad_sequence(
            [torch.tensor(d['attention_mask']) for d in batch], 
            batch_first=True, 
            padding_value=0  # Padding mask should be 0
        ),
        'labels': pad_sequence(
            [torch.tensor(d['labels']) for d in batch], 
            batch_first=True, 
            padding_value=-100  # Ignore padding tokens in loss computation
        ),
    }

# Initialize DataLoader (Set num_workers=0 for debugging)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_fn, num_workers=0)
validation_loader = DataLoader(validation_data, batch_size=8, collate_fn=collate_fn, num_workers=0)

# Load Model
model = T5ForConditionalGeneration.from_pretrained('t5-small')
optimizer = AdamW(model.parameters(), lr=5e-5)

# Device Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Mixed Precision
scaler = GradScaler()

num_epochs = 3  

for epoch in range(num_epochs):
    model.train()  
    total_loss = 0

    print(f"\nEpoch {epoch + 1}/{num_epochs} Training...")
    
    try:
        for batch in tqdm(train_loader):
            optimizer.zero_grad()  

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss  

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            total_loss += loss.item()  

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

    except Exception as e:
        print(f"Error in training: {e}")

    # Validation
    model.eval()  
    total_val_loss = 0

    print("\nRunning Validation...")
    try:
        with torch.no_grad():  
            for val_batch in validation_loader:
                input_ids = val_batch['input_ids'].to(device)
                attention_mask = val_batch['attention_mask'].to(device)
                labels = val_batch['labels'].to(device)

                with autocast():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss = outputs.loss
                    total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(validation_loader)
        print(f"Validation Loss: {avg_val_loss}")

    except Exception as e:
        print(f"Error in validation: {e}")

# Save Model
model.save_pretrained("trained_t5_wikisql")
tokenizer.save_pretrained("trained_t5_wikisql")


  scaler = GradScaler()



Epoch 1/3 Training...


  with autocast():
  0%|          | 18/7045 [00:45<4:57:01,  2.54s/it]


KeyboardInterrupt: 