In [1]:
from transformers import (
    ReformerForMaskedLM,
    ReformerTokenizer,
    ReformerConfig,
    ReformerModel,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)
import nlp
import torch
from torch.utils.data.dataset import Dataset
from fastai.text.all import *



In [2]:
path = untar_data(URLs.IMDB)

In [3]:
dls = TextDataLoaders.from_folder(path, is_lm=True, valid_pct=0.1,seq_len=16384 )

In [4]:
config = {
  "attention_head_size": 128,
  "attn_layers": [
    "local",
    "local",
    "lsh",
    "local",
    "local",
    "local",
    "lsh",
    "local",
    "local",
    "local",
    "lsh",
    "local"
  ],
  "axial_norm_std": 1.0,
  "axial_pos_embds": True,
  "axial_pos_embds_dim": [
    256,
    768
  ],
  "axial_pos_shape": [
    128,
    128
  ],
  "chunk_size_feed_forward": 0,
  "chunk_size_lm_head": 0,
  "eos_token_id": 2,
  "feed_forward_size": 1024,#4096,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "is_decoder": False,
  "layer_norm_eps": 1e-12,
  "local_attention_probs_dropout_prob": 0.2,
  "local_attn_chunk_length": 128,
  "local_num_chunks_after": 0,
  "local_num_chunks_before": 1,
  "lsh_attention_probs_dropout_prob": 0.1,
  "lsh_attn_chunk_length": 256,
  "lsh_num_chunks_after": 0,
  "lsh_num_chunks_before": 1,
  "max_position_embeddings": 16384,
  "model_type": "reformer",
  "num_attention_heads": 8,
  "num_buckets": 512,
  "num_hashes": 1,
  "pad_token_id": 0,
  "vocab_size": 322  # +1 for [MASK] token
}

config = ReformerConfig(**config)
model = ReformerForMaskedLM(config)
model = model.train()



In [5]:
model

ReformerForMaskedLM(
  (reformer): ReformerModel(
    (embeddings): ReformerEmbeddings(
      (word_embeddings): Embedding(322, 1024)
      (position_embeddings): AxialPositionEmbeddings(
        (weights): ParameterList(
            (0): Parameter containing: [torch.FloatTensor of size 128x1x256]
            (1): Parameter containing: [torch.FloatTensor of size 1x128x768]
        )
      )
    )
    (encoder): ReformerEncoder(
      (layers): ModuleList(
        (0): ReformerLayer(
          (attention): ReformerAttention(
            (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (self_attention): LocalSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (output): ReformerSelfOutput(
              (dense): Linear(in_features=1024, 

In [6]:
learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(), 
                splitter=lambda m:[p for p in m.lm_head.parameters()], 
                metrics=[accuracy, Perplexity()], path=path, wd=0.1).to_fp16()

In [7]:
learn.fit(1)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,0.0,00:03,,,


RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 7.79 GiB total capacity; 5.16 GiB already allocated; 1.90 GiB free; 5.16 GiB reserved in total by PyTorch)

In [None]:
dataset = nlp.load_dataset("crime_and_punish", split="train")

### Get above fastai code working!!! 

In [None]:
ReformerTokenizer??

In [None]:
tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")

In [None]:
tokenizer.add_special_tokens({"mask_token": '[MASK]'})

In [None]:
print(tokenizer.mask_token_id)
len(tokenizer)

In [None]:
sequence_length = 2 ** 14  # 16384

# define our map function to reduce the dataset to one sample
def flatten_and_tokenize(batch):
  all_input_text = ["".join(batch["line"])]
  input_ids_dict = tokenizer(all_input_text, pad_to_max_length=True, max_length=sequence_length)

  # duplicate data 8 times to have have 8 examples in dataset
  for key in input_ids_dict.keys():
    input_ids_dict[key] = [4 * [x] for x in input_ids_dict[key]][0]

  return input_ids_dict

# reduce the dataset and set batch_size to all inputs
dataset = dataset.map(
  flatten_and_tokenize, batched=True, batch_size=-1, remove_columns=["line"]
)

# prepare dataset to be in torch format
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [None]:
print(dataset['input_ids'].shape)

In [None]:
# copy 0.15 from run language modeling script
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
class MLMReformerDataset(Dataset):

  def __init__(self, dataset):
    self.dataset = dataset

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, i):
    return self.dataset['input_ids'][i]

mlm_dataset = MLMReformerDataset(dataset)

In [None]:
config = {
  "attention_head_size": 128,
  "attn_layers": [
    "local",
    "local",
    "lsh",
    "local",
    "local",
    "local",
    "lsh",
    "local",
    "local",
    "local",
    "lsh",
    "local"
  ],
  "axial_norm_std": 1.0,
  "axial_pos_embds": True,
  "axial_pos_embds_dim": [
    256,
    768
  ],
  "axial_pos_shape": [
    128,
    128
  ],
  "chunk_size_feed_forward": 0,
  "chunk_size_lm_head": 0,
  "eos_token_id": 2,
  "feed_forward_size": 4096,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "is_decoder": False,
  "layer_norm_eps": 1e-12,
  "local_attention_probs_dropout_prob": 0.2,
  "local_attn_chunk_length": 128,
  "local_num_chunks_after": 0,
  "local_num_chunks_before": 1,
  "lsh_attention_probs_dropout_prob": 0.1,
  "lsh_attn_chunk_length": 256,
  "lsh_num_chunks_after": 0,
  "lsh_num_chunks_before": 1,
  "max_position_embeddings": 16384,
  "model_type": "reformer",
  "num_attention_heads": 8,
  "num_buckets": 512,
  "num_hashes": 1,
  "pad_token_id": 0,
  "vocab_size": 322  # +1 for [MASK] token
}

config = ReformerConfig(**config)
model = ReformerForMaskedLM(config)
model = model.train()

In [None]:
# define the training args
training_args = {
    "learning_rate": 1e-3,
    "max_steps": 20,
    "do_train": True,
    "gradient_accumulation_steps": 1,
    "logging_steps": 4,
    "warmup_steps": 0,
    "weight_decay": 0.001,
    "per_gpu_train_batch_size": 1,
    "per_gpu_eval_batch_size": 1,
    "save_steps": 20,
    "output_dir": "./"
}

training_args = TrainingArguments(**training_args)

In [None]:
# create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=mlm_dataset
)

# train
trainer.train()

In [None]:
trainer.train_dataloader