In [1]:
import os
import nlp
import torch
import psutil
import pyarrow
import humanize

import GPUtil as GPU

from transformers import (
    ReformerModelWithLMHead,
    ReformerTokenizer,
    ReformerConfig,
    Trainer,
    DataCollator,
    TrainingArguments,
)

In [2]:
# Ask GPUtil for the GPUs it can detect on this machine.
GPUs = GPU.getGPUs()

In [3]:
# Use the first detected GPU for the memory report below.
# NOTE(review): indexing raises IndexError if no GPU was detected.
gpu = GPUs[0]

In [4]:
# Handle on the current Python process, used to read its resident memory.
process = psutil.Process(os.getpid())

# Host memory: what is still available system-wide vs. what this process holds.
print(
    "Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
    " | Proc size: " + humanize.naturalsize(process.memory_info().rss),
)

# GPU memory figures as reported by GPUtil for the selected device.
print(
    "GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
        gpu.memoryFree,
        gpu.memoryUsed,
        gpu.memoryUtil * 100,
        gpu.memoryTotal,
    )
)

Gen RAM Free: 29.9 GB  | Proc size: 326.0 MB
GPU RAM Free: 9336MB | Used: 674MB | Util   7% | Total 10010MB


In [5]:
# Load the "train" split of the "crime_and_punish" dataset through the nlp library.
dataset = nlp.load_dataset("crime_and_punish", split="train")

In [6]:
# Load the pretrained tokenizer stored under "google/reformer-crime-and-punishment".
tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.



Because we want to pack all data into a single sample, we use the handy map() function to reduce the dataset into one sample and pad the sample to a length of 524288. We then expand the same sample to 8 training samples so that we can accumulate gradients during training. Finally, we make the dataset ready for training by keeping only the columns needed for training.

In [7]:
# Length the single packed sample is padded to; passed as max_length to the
# tokenizer inside flatten_and_tokenize.
sequence_length = 2 ** 19  # 524288

# define our map function to reduce the dataset to one sample
def flatten_and_tokenize(batch):
  """Collapse a batch of text lines into one tokenized sample, repeated 8 times.

  Every string in ``batch["line"]`` is concatenated (no separator) and encoded
  as a single sequence, with padding to ``sequence_length`` requested from the
  tokenizer. Each field of the encoding is then replaced by a list that holds
  that one sequence 8 times.

  Args:
    batch: mapping whose "line" entry is an iterable of strings.

  Returns:
    The object returned by ``tokenizer.batch_encode_plus``, updated in place so
    that every field is a list of 8 references to the same encoded sequence.
  """
  num_copies = 8
  joined_text = "".join(batch["line"])
  encoded = tokenizer.batch_encode_plus(
      [joined_text], padding="max_length", max_length=sequence_length
  )

  # Exactly one text was encoded, so index 0 is the only sequence per field.
  for field in list(encoded.keys()):
    encoded[field] = num_copies * [encoded[field][0]]

  return encoded

# Run flatten_and_tokenize over the dataset in batched mode and drop the raw
# "line" column from the result.
# NOTE(review): batch_size=-1 is presumably how the nlp library is told to pass
# the whole dataset as one batch -- confirm against the map() documentation.
# NOTE(review): this rebinds `dataset`, so re-running the cell would operate on
# the already-mapped dataset, which no longer has a "line" column.
dataset = dataset.map(
  flatten_and_tokenize, batched=True, batch_size=-1, remove_columns=["line"]
)

# Switch the dataset to "torch" format, exposing only the "input_ids" and
# "attention_mask" columns.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


