In [1]:
!pip -q install gputil psutil humanize transformers nlp sentencepiece

In [2]:
import transformers

In [3]:
# Symlink /opt/bin/nvidia-smi to /usr/bin/nvidia-smi (-s: symbolic, -f: replace an
# existing link). Presumably needed so GPU tooling can find the binary on PATH —
# TODO confirm for the target runtime.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi

In [4]:
import os
import warnings

import GPUtil as GPU
import humanize
import nlp
import numpy as np
import psutil
import pyarrow
import torch

from transformers import (
    ReformerModelWithLMHead,
    ReformerTokenizer,
    ReformerConfig,
    Trainer,
    DataCollator,
    TrainingArguments,
)

In [5]:
warnings.filterwarnings('ignore')

In [6]:
# Query GPUtil for the GPUs visible to this machine.
GPUs = GPU.getGPUs()

In [7]:
# Display the detected GPU objects.
GPUs

[<GPUtil.GPUtil.GPU at 0x7fdd40dec550>]

In [8]:
# Use the first detected GPU for the memory report below; fail with an explicit
# message instead of a bare IndexError when no GPU was found.
if not GPUs:
    raise RuntimeError("No GPU detected by GPUtil; a GPU runtime is required.")
gpu = GPUs[0]

In [9]:
# Report free host RAM, this process's resident size, and the selected GPU's memory.
process = psutil.Process(os.getpid())
print(
    f"Gen RAM Free: {humanize.naturalsize(psutil.virtual_memory().available)}",
    f" | Proc size: {humanize.naturalsize(process.memory_info().rss)}",
)
print(
    f"GPU RAM Free: {gpu.memoryFree:.0f}MB | Used: {gpu.memoryUsed:.0f}MB"
    f" | Util {gpu.memoryUtil*100:3.0f}% | Total {gpu.memoryTotal:.0f}MB"
)

Gen RAM Free: 12.6 GB  | Proc size: 538.1 MB
GPU RAM Free: 16270MB | Used: 10MB | Util   0% | Total 16280MB


In [10]:
# Load the "train" split of the "crime_and_punish" dataset through the nlp library.
dataset = nlp.load_dataset("crime_and_punish", split="train")

In [11]:
# Load the pretrained tokenizer published with the "google/reformer-crime-and-punishment" checkpoint.
tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
# Reuse the end-of-sequence token as the padding token, so that padding to
# `max_length` (done later in flatten_and_tokenize) has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

Because we want to pack all data into a single sample, we use the handy map() function to reduce the dataset into one sample and pad the sample to a length of 524288. We then expand the same sample to 8 training samples so that we can accumulate gradients during training. Finally, we make the dataset ready for training by keeping only the columns needed for training.

In [12]:
# length every sample is padded to (2 ** 19 == 524288 tokens)
sequence_length = 2 ** 19

# map function that reduces the whole dataset to one (repeated) sample
def flatten_and_tokenize(batch):
    """Join all lines of ``batch`` into one text, tokenize it, and repeat it 8 times.

    Args:
        batch: mapping with a "line" entry holding the text lines to concatenate.

    Returns:
        The tokenizer's encoding in which every field holds 8 references to the
        single encoded sample, padded to ``sequence_length``.
    """
    joined_text = "".join(batch["line"])
    encoded = tokenizer.batch_encode_plus(
        [joined_text], padding="max_length", max_length=sequence_length
    )

    # duplicate the single encoded sample 8 times to have 8 examples in the dataset
    for column in encoded.keys():
        encoded[column] = 8 * [encoded[column][0]]

    return encoded

# Reduce the dataset with flatten_and_tokenize. batch_size=-1 hands all inputs to the
# map function in one call, and the raw "line" column is dropped from the result.
# NOTE(review): this rebinds `dataset`, so re-running the cell operates on the
# already-mapped dataset (which no longer has a "line" column) — reload the dataset
# first when re-executing.
dataset = dataset.map(
  flatten_and_tokenize, batched=True, batch_size=-1, remove_columns=["line"]
)

# Return the "input_ids" and "attention_mask" columns as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [13]:
# Display the mapped dataset's summary (features and number of rows).
dataset

Dataset(features: {'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 8)

We don't want the model to just memorize the dataset (the single example) by encoding the words in its position embeddings. Thus, at each training iteration we will randomly select how much padding to put before the text vs. after it.

With the Trainer framework of transformers, we can implement this by using a Reformer-specific data collator that randomly shifts the input_ids to the right and sets the labels accordingly.

In [14]:
class ReformerCollator:
    """Collator that randomly rolls a single sample along its sequence axis.

    Only the first element of ``features`` is used; it is rolled (circularly
    shifted) by a random offset in ``[0, max_roll_length)`` and returned as a
    batch of size one.

    Args:
        max_roll_length: exclusive upper bound for the random shift. May be an
            int or a 0-dim tensor. A value <= 0 disables shifting.
    """

    def __init__(self, max_roll_length):
        self.max_roll_length = max_roll_length

    def __call__(self, features):
        """Build one training batch from ``features``.

        Args:
            features: sequence of dicts with 1-D "input_ids" and
                "attention_mask" tensors; only ``features[0]`` is read.

        Returns:
            Dict with "input_ids", "labels" and "attention_mask", each of shape
            (1, seq_len). "labels" is the same tensor object as "input_ids".
        """
        # torch.randint requires high > 0, so a sample without any padding
        # (max_roll_length == 0) must skip the random draw instead of raising.
        max_shift = int(self.max_roll_length)
        if max_shift > 0:
            random_shift_length = torch.randint(max_shift, (1,)).item()
        else:
            random_shift_length = 0

        # shift input and mask by the same offset so they stay aligned
        rolled_input_ids = torch.roll(
            features[0]["input_ids"], random_shift_length
        ).unsqueeze(0)
        rolled_attention_mask = torch.roll(
            features[0]["attention_mask"], random_shift_length
        ).unsqueeze(0)

        return {
            "input_ids": rolled_input_ids,  # BS x SEQ_LEN
            "labels": rolled_input_ids,  # BS x SEQ_LEN
            "attention_mask": rolled_attention_mask,  # BS x SEQ_LEN
        }

To instantiate the data collator, we first need to calculate how many padding tokens the padded input_ids contain, since this is the maximum amount by which the sample can be shifted.

In [15]:
# non_padded_sequence_length defines the max shift for our data collator: the total
# sequence length minus the sum of the attention mask (presumably 1 per real token,
# so this is the number of padding positions — confirm against the tokenizer).
# Use a vectorised tensor sum instead of the builtin sum(), which would iterate
# over every one of the 2 ** 19 elements in Python, and store a plain int.
non_padded_sequence_length = sequence_length - int(
    torch.as_tensor(dataset["attention_mask"][0]).sum()
)

# get the data collator
data_collator = ReformerCollator(non_padded_sequence_length)

Next, we will define our reformer model by defining the ReformerConfig.

In [16]:
# Keyword arguments later unpacked into ReformerConfig.
# Consistency notes: axial_pos_shape multiplies to 512 * 1024 == 524288, matching
# max_position_embeddings; axial_pos_embds_dim sums to 64 + 192 == 256, matching
# hidden_size.
config = dict(
    attention_head_size=64,
    attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
    axial_pos_embds=True,
    sinusoidal_pos_embds=False,
    axial_pos_embds_dim=[64, 192],
    axial_pos_shape=[512, 1024],
    lsh_attn_chunk_length=64,
    local_attn_chunk_length=64,
    feed_forward_size=512,
    hidden_act="relu",
    hidden_size=256,
    is_decoder=True,
    max_position_embeddings=524288,
    num_attention_heads=2,
    num_buckets=[64, 128],
    num_hashes=1,
    vocab_size=320,
    lsh_attention_probs_dropout_prob=0.0,
    lsh_num_chunks_before=1,
    lsh_num_chunks_after=0,
    local_num_chunks_before=1,
    local_num_chunks_after=0,
    local_attention_probs_dropout_prob=0.025,
    hidden_dropout_prob=0.025,
)

In [17]:
# Build the ReformerConfig from the settings dict. The isinstance guard keeps the
# cell re-runnable: on a second execution `config` is already a ReformerConfig and
# unpacking it with ** would raise.
if not isinstance(config, ReformerConfig):
    config = ReformerConfig(**config)

# Instantiate a freshly initialised model from the config and put it in training mode.
model = ReformerModelWithLMHead(config)
model = model.train()

In [18]:
### Training args

# Keyword arguments later unpacked into TrainingArguments.
# NOTE(review): warmup_steps (500) is larger than max_steps (100), so the run ends
# before warmup completes — confirm this is intended.
# NOTE(review): save_steps=1 presumably writes a checkpoint after every step — confirm.
training_args = dict(
    learning_rate=1e-3,
    max_steps=100,
    do_train=True,
    gradient_accumulation_steps=8,
    logging_steps=1,
    warmup_steps=500,
    weight_decay=0.001,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_steps=1,
    output_dir="./",
)

In [19]:
# Convert the settings dict into a TrainingArguments object. The isinstance guard
# keeps the cell re-runnable: on a second execution `training_args` is already a
# TrainingArguments instance and unpacking it with ** would raise.
if not isinstance(training_args, TrainingArguments):
    training_args = TrainingArguments(**training_args)

In [20]:
### Accuracy metrics

def compute_metrics(pred):
    """Compute next-token accuracy over the non-ignored label positions.

    Args:
        pred: object exposing ``label_ids`` (batch x seq_len integer array, with
            -100 marking positions to ignore) and ``predictions``
            (batch x seq_len x vocab scores).

    Returns:
        ``{"accuracy": float}`` — the fraction of valid positions whose argmax
        prediction equals the label. NaN when no position is valid.
    """
    label_ids = np.asarray(pred.label_ids)

    # The prediction at position t is scored against the label at position t + 1,
    # so drop the first label and the last prediction.
    shifted_labels = label_ids[..., 1:]
    predicted_ids = np.argmax(pred.predictions[:, :-1], axis=-1)

    # Validity is decided by the (shifted) label alone; applying the same mask to
    # both arrays keeps them aligned and equally long even when padding is present.
    valid = shifted_labels != -100

    acc = np.mean(predicted_ids[valid] == shifted_labels[valid], dtype=float)
    return {"accuracy": float(acc)}

In [21]:
# Assemble the Trainer from the model, training arguments, metric function and the
# random-shift collator. The same dataset object is passed for both training and
# evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=dataset
)

In [22]:
# Run training. A manual interrupt (KeyboardInterrupt) is caught so the cell ends
# with a message instead of a traceback; any other exception still propagates.
try:
  trainer.train()
except KeyboardInterrupt:
  print('Training was interrupted manually, last model saved will be used for prediction')

Step,Training Loss
1,5.868035
2,5.867171
3,5.866279
4,5.866044
5,5.864196
6,5.861832
7,5.85936
8,5.856157
9,5.851445
10,5.847889
