## MODEL

In [1]:
from transformers import AutoModelForMaskedLM,  AutoTokenizer

In [2]:
model_name= 'google/bert_uncased_L-2_H-128_A-2'

In [3]:
bert_tiny = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
bert_tiny.config

BertConfig {
  "_name_or_path": "google/bert_uncased_L-2_H-128_A-2",
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [5]:
bert_tiny.num_parameters()

4416698

## DATASET

In [7]:
# !pip install datasets

Collecting datasets
  Downloading datasets-1.2.1-py3-none-any.whl (159 kB)
[K     |████████████████████████████████| 159 kB 5.5 MB/s eta 0:00:01
[?25hCollecting dill
  Downloading dill-0.3.3-py2.py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 7.0 MB/s  eta 0:00:01
[?25hCollecting xxhash
  Downloading xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 11.6 MB/s eta 0:00:01
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.11.1-py37-none-any.whl (108 kB)
[K     |████████████████████████████████| 108 kB 16.3 MB/s eta 0:00:01
Installing collected packages: dill, xxhash, multiprocess, datasets
Successfully installed datasets-1.2.1 dill-0.3.3 multiprocess-0.70.11.1 xxhash-2.0.0


In [8]:
from datasets import load_dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1969.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1124.0, style=ProgressStyle(description…


Downloading and preparing dataset wikitext/wikitext-2-raw-v1 (download: 4.50 MiB, generated: 12.91 MiB, post-processed: Unknown size, total: 17.41 MiB) to /home/jupyter/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4721645.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset wikitext downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91. Subsequent calls will reuse this data.


## TOKENIZER

In [40]:
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True, padding=True, max_len=512)

In [41]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [42]:
tokenizer

PreTrainedTokenizerFast(name_or_path='google/bert_uncased_L-2_H-128_A-2', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [43]:
ex = tokenizer(['hello world'])
tokenizer.convert_ids_to_tokens(ex.input_ids[0])

['[CLS]', 'hello', 'world', '[SEP]']

In [44]:
train_dataset = dataset['train'].filter(lambda example: len(example['text'])>50)

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91/cache-bba8d691633557b1.arrow


In [61]:
def filter_func(example):
    if (type(example['text'])==str) and (len(example['text'])>50):
        ret = True
    else:
        ret = False
    return ret
    
train_dataset = dataset['train'].filter(lambda example: filter_func(example))

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91/cache-288969e7bc9b80d1.arrow


In [62]:
len(train_dataset)

16323

In [54]:
import torch

In [85]:
tokenizer(train_dataset[0]['text'], truncation=True, padding=True, return_special_tokens_mask=True)

{'input_ids': [101, 12411, 5558, 2053, 11748, 4801, 4360, 1017, 1024, 4895, 2890, 27108, 5732, 11906, 1006, 2887, 1024, 1856, 1806, 1671, 30222, 30218, 30259, 30227, 30255, 30258, 30219, 2509, 1010, 5507, 1012, 11748, 4801, 4360, 1997, 1996, 11686, 1017, 1007, 1010, 4141, 3615, 2000, 2004, 11748, 4801, 4360, 11906, 3523, 2648, 2900, 1010, 2003, 1037, 8608, 2535, 1030, 1011, 1030, 2652, 2678, 2208, 2764, 2011, 16562, 1998, 2865, 1012, 4432, 2005, 1996, 9160, 12109, 1012, 2207, 1999, 2254, 2249, 1999, 2900, 1010, 2009, 2003, 1996, 2353, 2208, 1999, 1996, 11748, 4801, 4360, 2186, 1012, 15440, 1996, 2168, 10077, 1997, 8608, 1998, 2613, 1030, 1011, 1030, 2051, 11247, 2004, 2049, 16372, 1010, 1996, 2466, 3216, 5903, 2000, 1996, 2034, 2208, 1998, 4076, 1996, 1000, 2171, 3238, 1000, 1010, 1037, 18476, 2510, 3131, 3529, 1996, 3842, 1997, 26033, 2401, 2076, 1996, 2117, 12124, 2078, 2162, 2040, 4685, 3595, 2304, 3136, 1998, 2024, 25895, 2114, 1996, 4461, 3131, 1000, 10250, 8067, 3723, 10000, 1000

In [86]:
def tokenize_function(examples):
    ret = tokenizer(examples['text'], truncation=True, padding=True, return_special_tokens_mask=True)
    return ret

tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




In [87]:
len(tokenized_dataset)

16323

In [88]:
tokenized_dataset[0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [74]:
# tokenized_dataset_torch = [{key: torch.tensor(val[idx]) for key, val in tokenized_dataset.items()} ]

In [75]:
len(tokenized_dataset['input_ids'][100])

512

## TRAINER

In [76]:
type(tokenized_dataset[0])

dict

In [89]:
tokenized_dataset[0].keys()

dict_keys(['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids'])

In [90]:
[type(tokenized_dataset[0][k]) for k in tokenized_dataset[0].keys()]

[list, list, list, list]

In [91]:
from transformers import Trainer, TrainingArguments

model = bert_tiny
training_args = TrainingArguments(
    output_dir="./wiki_2-v1",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
#     prediction_loss_only=True,
)

In [92]:
%%time
trainer.train()

Step,Training Loss
500,4.181399


CPU times: user 1min 43s, sys: 32.5 s, total: 2min 16s
Wall time: 2min 19s


TrainOutput(global_step=511, training_loss=4.180028657857219)

In [95]:
bert_tiny.save_pretrained('./wikituned')

In [97]:
from transformers import AutoModelForSequenceClassification

bert_tiny_loaded = AutoModelForSequenceClassification.from_pretrained('./wikituned')

Some weights of the model checkpoint at ./wikituned were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./wikituned and are newly initial