In [None]:
# Crash on purpose to get more ram :
#import torch
#torch.tensor([10.]*10000000000)

Let's install [PyTorch/XLA](https://github.com/pytorch/xla) which enables PyTorch on TPU. Make sure you install the nightly version, as the trainer breaks on other versions.

In [1]:
# Colab form parameter: the PyTorch/XLA build passed to the setup script below.
VERSION = "nightly"  #@param ["1.5" , "20200325", "nightly"]
# Download the PyTorch/XLA environment setup script, then run it for the chosen version.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  4264  100  4264    0     0  72271      0 --:--:-- --:--:-- --:--:-- 72271
Updating TPU and VM. This may take around 2 minutes.
Updating TPU runtime to pytorch-nightly ...
Uninstalling torch-1.6.0a0+d1a0e88:
  Successfully uninstalled torch-1.6.0a0+d1a0e88
Uninstalling torchvision-0.7.0a0+148bac2:
  Successfully uninstalled torchvision-0.7.0a0+148bac2
Copying gs://tpu-pytorch/wheels/torch-nightly-cp36-cp36m-linux_x86_64.whl...
- [1 files][106.4 MiB/106.4 MiB]                                                
Operation completed over 1 objects/106.4 MiB.                                    
Copying gs://tpu-pytorch/wheels/torch_xla-nightly-cp36-cp36m-linux_x86_64.whl...
- [1 files][121.0 MiB/121.0 MiB]                                                
Op

Install transformers and the nlp package (note: the cell below only installs transformers, from a source checkout). Restart the Colab runtime after this step.

In [2]:
!git clone https://github.com/huggingface/transformers.git
!pip install ./transformers


fatal: destination path 'transformers' already exists and is not an empty directory.
Processing ./transformers
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-2.11.0-cp36-none-any.whl size=702366 sha256=bea07392e633c0e0718ee35b73b71382292275559bfbebcb99d226aa2746ecc8
  Stored in directory: /tmp/pip-ephem-wheel-cache-lxbyw7zh/wheels/23/19/dd/2561a4e47240cf6b307729d58e56f8077dd0c698f5992216cf
Successfully built transformers
Installing collected packages: transformers
  Found existing installation: transformers 2.11.0
    Uninstalling transformers-2.11.0:
      Successfully uninstalled transformers-2.11.0
Successfully installed transformers-2.11.0


## Load and process data

In [3]:
import json

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import get_linear_schedule_with_warmup

import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

In [4]:
# Load the pretrained 't5-base' tokenizer (it is loaded again right before the datasets are built).
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [None]:
from google.colab import files
# Opens the Colab upload dialog; the next cell expects a kaggle.json file in the working directory.
files.upload()

In [7]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!pip install kaggle
!kaggle competitions download -c tweet-sentiment-extraction

Downloading test.csv to /content
  0% 0.00/307k [00:00<?, ?B/s]
100% 307k/307k [00:00<00:00, 96.0MB/s]
Downloading train.csv.zip to /content
  0% 0.00/1.23M [00:00<?, ?B/s]
100% 1.23M/1.23M [00:00<00:00, 81.5MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/41.4k [00:00<?, ?B/s]
100% 41.4k/41.4k [00:00<00:00, 41.0MB/s]


In [10]:
# Extract train.csv from the downloaded archive into the current directory.
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [11]:
!mkdir data
!mv *.csv data
!mkdir models

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘models’: File exists


In [6]:
import pandas as pd
# Load the train and test CSV files from the data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [7]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [24]:
# Rows without usable text. read_csv parses empty fields as NaN by default, so an
# equality test against '' alone can never match; check for missing values as well.
train[train['text'].isna() | (train['text'] == '')]

Unnamed: 0,text,target


In [8]:
# Number of rows per sentiment label.
train['sentiment'].value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [9]:
# Build one string per row: the tweet text, then its selected span, then the "</s>" marker.
content = [
  "context: %s content: %s </s>" % (tweet, span)
  for tweet, span in zip(train['text'], train['selected_text'])
]
train['content'] = content

In [10]:
def pre(t):
  """Return ``t`` as text with the `` </s>`` end marker appended.

  The marker is added only when it is not already present, so re-running the
  cell that applies this function does not append it a second time.
  """
  text = str(t)
  if text.endswith(" </s>"):
    return text
  return "%s </s>" % text
# Append the " </s>" marker to every sentiment label; this overwrites the column in place.
train['sentiment'] = train['sentiment'].apply(pre)

In [11]:
from sklearn.model_selection import train_test_split
# Keep only the model input and the label, under the column names the dataset class reads.
train = train[['content', 'sentiment']].rename(columns={'content': 'text', 'sentiment': 'target'})
# Hold out 20% of the rows for validation; the seed is fixed for reproducibility.
train, valid = train_test_split(train, test_size=0.2, random_state=42)

In [46]:
class ToxicDataset(Dataset):
  """Pre-tokenized text-to-text dataset built from a DataFrame.

  Every row is tokenized once, at construction time; ``__getitem__`` only strips
  the batch dimension from the stored tensors.

  Args:
    tokenizer: object exposing ``encode_plus`` (a ``T5Tokenizer`` in this notebook).
    df: DataFrame with a ``"text"`` column (model input) and a ``"target"``
      column (expected output).
    max_len: length every input is padded/truncated to.
    target_max_len: length every target is padded/truncated to. Defaults to 2,
      the value that used to be hard-coded.
  """

  def __init__(self, tokenizer, df, max_len=512, target_max_len=2):
    self.data_column = df["text"].values
    self.class_column = df['target'].values
    self.max_len = max_len
    self.target_max_len = target_max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []
    self._build()

  def __len__(self):
    """Number of encoded examples."""
    return len(self.inputs)

  def __getitem__(self, index):
    """Return the encoded input/target ids and attention masks for one example."""
    source = self.inputs[index]
    target = self.targets[index]
    # squeeze(0) removes only the batch dimension added by return_tensors="pt";
    # a bare squeeze() would also collapse a length-1 sequence to a 0-d tensor.
    return {
        "source_ids": source["input_ids"].squeeze(0),
        "source_mask": source["attention_mask"].squeeze(0),
        "target_ids": target["input_ids"].squeeze(0),
        "target_mask": target["attention_mask"].squeeze(0),
    }

  def _encode(self, text, length):
    """Tokenize ``text``, padding/truncating it to exactly ``length`` tokens."""
    return self.tokenizer.encode_plus(
        str(text), max_length=length, pad_to_max_length=True, truncation=True, return_tensors="pt"
    )

  def _build(self):
    """Tokenize every (input, target) pair and store the encodings."""
    for input_, target in zip(self.data_column, self.class_column):
      self.inputs.append(self._encode(input_, self.max_len))
      self.targets.append(self._encode(target, self.target_max_len))


In [47]:
# Tokenize both splits up front, with inputs padded/truncated to 128 tokens.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
train_dataset = ToxicDataset(tokenizer, train,128)
valid_dataset = ToxicDataset(tokenizer,valid,128)

In [48]:
%%time
# Fetch one encoded example to inspect its ids and masks (and time the lookup).
train_dataset[1]

CPU times: user 417 µs, sys: 0 ns, total: 417 µs
Wall time: 313 µs


{'source_ids': tensor([2625,   10,   25,  225,    5,  738,   10,   25,  225,    5,    1,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'source_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 

In [51]:
# Sizes of the training and validation datasets.
len(train_dataset), len(valid_dataset)

(21984, 5497)

In [50]:
# Cache the datasets so they can be loaded directly for training.
# NOTE(review): the whole dataset objects are serialized; loading them back presumably
# requires the ToxicDataset class to be defined in the loading process - confirm.

torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')

For more details on how to use the nlp library check out this [notebook](https://colab.research.google.com/github/huggingface/nlp/blob/master/notebooks/Overview.ipynb).

## Write training script

In [52]:
# NOTE(review): unpinned install. The cells below pass version-specific arguments to
# Trainer / ModelCheckpoint - consider pinning the release this notebook was written against.
!pip install pytorch_lightning




In [70]:
 import pytorch_lightning as pl
#Model FineTune
class T5Lightning(pl.LightningModule):
  """Fine-tunes ``T5ForConditionalGeneration`` with PyTorch Lightning.

  ``hparams`` is an attribute-style namespace. The fields read by this class are
  ``model_name_or_path``, ``tokenizer_name_or_path``, ``weight_decay``,
  ``learning_rate``, ``adam_epsilon``, ``warmup_steps``, ``train_batch_size``,
  ``eval_batch_size``, ``n_gpu``, ``gradient_accumulation_steps`` and
  ``num_train_epochs``.

  State shared between hooks:
    * ``configure_optimizers`` stores the optimizer on ``self.opt``.
    * ``train_dataloader`` builds the LR scheduler from ``self.opt`` and stores
      it on ``self.lr_scheduler``.
    * ``optimizer_step`` and ``get_tqdm_dict`` read ``self.lr_scheduler``.
  The hooks therefore have to be invoked in that order.

  NOTE(review): the hook signatures and the dict-style return values presumably
  target one specific Lightning release - confirm before upgrading the package.
  """

  def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    """Return True when the trainer's ``proc_rank`` is <= 0."""
    return self.trainer.proc_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
  ):
    """Pass all arguments straight through to the wrapped T5 model."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        lm_labels=lm_labels,
    )

  def _step(self, batch):
    """Run one forward pass on ``batch`` and return the first model output (the loss).

    ``batch`` must provide ``source_ids``, ``source_mask``, ``target_ids`` and
    ``target_mask``.
    """
    # Work on a copy so the caller's batch["target_ids"] is not overwritten.
    lm_labels = batch["target_ids"].clone()
    # Padding positions get -100 in the labels.
    # NOTE(review): presumably the index the loss ignores - confirm against the T5 docs.
    lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    return outputs[0]

  def training_step(self, batch, batch_idx):
    """Compute the training loss and expose it under "train_loss" for logging."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def training_epoch_end(self, outputs):
    """Average the per-step "loss" values collected over the epoch."""
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Compute the loss for one validation batch."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the per-batch "val_loss" values and log the mean as "val_loss"."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Build the AdamW optimizer and keep a reference to it on ``self.opt``.

    Parameters whose name contains "bias" or "LayerNorm.weight" get no weight
    decay; all others use ``hparams.weight_decay``. The LR schedule is not
    created here but in ``train_dataloader``.
    """
    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    """Apply one optimizer update, clear the gradients and advance the LR schedule."""
    if self.trainer.use_tpu:
      # On TPU the update goes through torch_xla's optimizer_step helper.
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()
    optimizer.zero_grad()
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Progress-bar values: formatted average loss and the last learning rate."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Load the cached training set and, as a side effect, create the LR scheduler."""
    train_dataset = torch.load('train_data.pt')
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    # Total number of scheduler steps the linear decay is spread over.
    # NOTE(review): the divisor uses hparams.n_gpu only; when training on several TPU
    # cores the real number of optimizer steps per process may differ - confirm.
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Load the cached validation set and wrap it in a DataLoader."""
    val_dataset = torch.load('valid_data.pt')
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [92]:
# Default hyper-parameters; individual entries are overridden further down before use.
args_dict = {
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    "max_seq_length": 512,
    "learning_rate": 5e-5,
    "weight_decay": 0.001,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 4,
    "eval_batch_size": 4,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,
    "fp_16": False,  # to enable 16-bit training, install apex and set this to True
    "opt_level": 'O1',  # optimisation levels are described at https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,  # with 16-bit training set this to a sensible value, 0.5 is a good default
    "seed": 42,
    "proc_rank": -1,
}

In [93]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from transformers import *
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
# Point the run at the data directory and checkpoint folder, then build the Lightning module.
args_dict.update(data_dir='data', output_dir='./t5_sentiment', num_train_epochs=2)
args = argparse.Namespace(**args_dict)
model = T5Lightning(args)
# Display the wrapped T5 network.
model.model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [86]:
mkdir t5_segtiment


mkdir: cannot create directory ‘t5_segtiment’: File exists


In [94]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Logs the trainer's callback metrics after validation and after testing.

  The "log" and "progress_bar" entries are skipped. Metric lines are only
  emitted when ``pl_module.is_logger()`` is true; after testing they are also
  written to ``test_results.txt`` inside ``pl_module.hparams.output_dir``.
  """

  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return
    results = trainer.callback_metrics
    # Report metrics in sorted key order.
    for name in sorted(results):
      if name in ["log", "progress_bar"]:
        continue
      logger.info("{} = {}\n".format(name, str(results[name])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if not pl_module.is_logger():
      return
    results = trainer.callback_metrics

    # Log every metric and mirror the same line into the results file.
    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for name in sorted(results):
        if name in ["log", "progress_bar"]:
          continue
        line = "{} = {}\n".format(name, str(results[name]))
        logger.info(line)
        writer.write(line)

In [95]:
# Checkpointing: monitor "val_loss" in "min" mode, keeping the top 5 checkpoints.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir,
    prefix="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)
# Trainer settings, mostly taken from the hyper-parameter namespace; training uses 8 TPU cores.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "tpu_cores": 8,
    "max_epochs": args.num_train_epochs,
    "early_stop_callback": False,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "callbacks": [LoggingCallback()],
}
trainer = pl.Trainer(**train_params)

GPU available: False, used: False
TPU available: True, using: 8 TPU cores


In [98]:
# Set the rank attribute that T5Lightning.is_logger() reads (it checks proc_rank <= 0).
# NOTE(review): looks like a workaround for the Trainer not populating this attribute - confirm.
trainer.proc_rank=-1

In [99]:
# Start fine-tuning; the module supplies its own train/validation dataloaders.
trainer.fit(model)


training on 8 TPU cores
INIT TPU local core: 5, global rank: 5
INIT TPU local core: 3, global rank: 3
INIT TPU local core: 0, global rank: 0
INIT TPU local core: 4, global rank: 4
INIT TPU local core: 7, global rank: 7
INIT TPU local core: 1, global rank: 1
INIT TPU local core: 2, global rank: 2
INIT TPU local core: 6, global rank: 6

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 247 M 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Exception: ignored

Let's put the arguments in a dict and store them in a JSON file (`args.json`). The training code is expected to load this file and parse the arguments.

In [None]:
# Run configuration that the next cell serializes to args.json.
args_dict = dict(
  num_cores=8,
  model_name_or_path='t5-base',
  max_len=128,
  target_max_len=2,
  output_dir='./models/tpu',
  overwrite_output_dir=True,
  per_device_train_batch_size=4,
  per_gpu_eval_batch_size=4,
  gradient_accumulation_steps=4,
  learning_rate=5e-5,
  tpu_num_cores=8,
  num_train_epochs=5,
  do_train=True,
)

In [None]:
# Serialize the configuration dict to args.json in the working directory.
with open('args.json', 'w') as args_file:
  args_file.write(json.dumps(args_dict))

Start training!

In [None]:
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

In [None]:
# Launch 8 processes with the 'fork' start method, each running _mp_fn.
# NOTE(review): _mp_fn is not defined in any cell visible in this notebook - confirm where it comes from.
xmp.spawn(_mp_fn, args=(), nprocs=8, start_method='fork')

Exception: ignored

## Eval

There are two gotchas here. First, the metrics functionality in the nlp package is still a work in progress, so we will use the official SQuAD evaluation script. Second, for some reason I couldn't figure out, the `.generate` method does not work on TPU, so prediction has to be done on CPU. Predicting on the validation set takes almost 40 minutes.

In [None]:
import torch
import torch_xla
import torch_xla.core.xla_model as xm

from transformers import T5ForConditionalGeneration, T5Tokenizer

from tqdm.auto import tqdm

In [None]:
# Load the fine-tuned model and tokenizer from 'models/tpu'; the model is moved to CPU for generation.
# NOTE(review): no visible cell writes to 'models/tpu' (the Lightning run above uses
# './t5_sentiment') - confirm which directory holds the weights to evaluate.
model = T5ForConditionalGeneration.from_pretrained('models/tpu').to('cpu') # because it is loaded on xla by default
tokenizer = T5Tokenizer.from_pretrained('models/tpu')

In [None]:
# Reload the cached validation set and iterate over it in batches of 32.
valid_dataset = torch.load('valid_data.pt')
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32)

In [None]:
# Generate one prediction per validation example, batch by batch.
# The dataset items are keyed "source_ids" / "source_mask" (see ToxicDataset.__getitem__),
# so those are the keys to read here; "input_ids" / "attention_mask" are not in the batch.
answers = []
for batch in tqdm(dataloader):
  outs = model.generate(input_ids=batch['source_ids'],
                        attention_mask=batch['source_mask'],
                        max_length=2,
                        early_stopping=True)
  # Turn each generated id sequence back into text.
  outs = [tokenizer.decode(ids) for ids in outs]
  answers.extend(outs)

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [None]:
# Pair each validation example with its generated answer; decode the stored target ids
# to get the reference label as text.
paired = list(zip(valid_dataset, answers))
predictions = [answer for _, answer in paired]
references = [tokenizer.decode(example['target_ids']) for example, _ in paired]

In [None]:
# Spot-check the first prediction against its reference.
predictions[0], references[0]

('negative', 'positive')

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the predictions against the references.
print(classification_report(references, predictions))

              precision    recall  f1-score   support

    negative       0.86      0.81      0.84       874
    positive       0.77      0.82      0.79       649

    accuracy                           0.82      1523
   macro avg       0.81      0.82      0.81      1523
weighted avg       0.82      0.82      0.82      1523

