In [1]:
import os
from tokenizers import ByteLevelBPETokenizer

In [5]:
%%time
path = os.getenv('HOME') + '/Desktop/dj_study/transformers/kant.txt'
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=path, vocab_size = 52_000, min_frequency=2, special_tokens=[
    '<s>',
    '<pad>',
    '</s>',
    '<unk>',
    '<mask>',
])

CPU times: user 9.43 s, sys: 2.25 s, total: 11.7 s
Wall time: 1.54 s


In [7]:
# Directory that receives the trained tokenizer files; the recorded output
# of this cell lists vocab.json and merges.txt inside it.
# expanduser('~') avoids a TypeError when the HOME variable is not set.
token_dir = os.path.join(
    os.path.expanduser('~'),
    'Desktop', 'dj_study', 'transformers', 'content', 'KantaiBERT',
)
# exist_ok=True replaces the separate exists() check and is safe to re-run.
os.makedirs(token_dir, exist_ok=True)
tokenizer.save_model(token_dir)

['/home/aiffel-dj46/Desktop/dj_study/transformers/content/KantaiBERT/vocab.json',
 '/home/aiffel-dj46/Desktop/dj_study/transformers/content/KantaiBERT/merges.txt']

In [8]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the two files written into token_dir
# (vocabulary first, merge rules second).
tokenizer = ByteLevelBPETokenizer(f'{token_dir}/vocab.json', f'{token_dir}/merges.txt')

In [14]:
# Display the token strings produced for a sample sentence.
# NOTE(review): the recorded output contains '<s>' and '</s>', but the
# BertProcessing post-processor is only attached in a later cell (In [16]).
# Presumably this cell was re-run after that one -- confirm with a fresh
# Restart & Run All.
tokenizer.encode('The Critique of Pure Reason.').tokens

['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']

In [15]:
# Display the Encoding object itself; its repr lists the token count and
# the attributes available on it (ids, tokens, offsets, ...).
tokenizer.encode('The Critique of Pure Reason.')

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [16]:
# Attach a post-processor built from the '</s>' and '<s>' tokens and their
# ids, then limit encodings to 512 tokens.
# NOTE(review): argument order is presumably (sep, cls) -- confirm against
# the BertProcessing signature.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [18]:
# Encode the same sample sentence again, now that the post-processor and
# truncation settings have been applied to the tokenizer.
tokenizer.encode('The Critique of Pure Reason.')

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [19]:
# Shell escape: print the NVIDIA driver / GPU status table for this machine.
!nvidia-smi

Sat Mar 20 21:55:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 2070    Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   45C    P8    10W /  N/A |    809MiB /  7982MiB |     36%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [20]:
import torch
# Report whether PyTorch can use a CUDA device in this kernel.
torch.cuda.is_available()

True

In [22]:
from transformers import RobertaConfig

# Model hyper-parameters. vocab_size repeats the value used when the
# tokenizer was trained (52_000); the remaining fields are passed through
# to RobertaConfig unchanged.
config = RobertaConfig(
    vocab_size=52_000,
    type_vocab_size=1,
    max_position_embeddings=514,
    num_hidden_layers=6,
    num_attention_heads=12,
)

In [24]:
from transformers import RobertaTokenizer

# Reload the trained vocab/merges as a Transformers tokenizer. From here on
# `tokenizer` refers to this RobertaTokenizer, not the ByteLevelBPETokenizer
# used above.
# `model_max_length` is the keyword the tokenizer reads for its length
# limit. The previous `max_length=512` is an encoding-call argument and was
# not applied as the tokenizer's limit when given to from_pretrained().
tokenizer = RobertaTokenizer.from_pretrained(token_dir, model_max_length=512)

In [25]:
from transformers import RobertaForMaskedLM

# Build an untrained masked-LM model from the configuration defined above
# and print its module tree.
model = RobertaForMaskedLM(config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [26]:
# Print the parameter count reported by the model itself.
print(model.num_parameters())

83504416


In [27]:
# Materialise the model's parameter tensors into a list (LP) and keep the
# number of tensors (lp); both names are reused by the following cells.
LP = [*model.parameters()]
lp = len(LP)
print(lp)

106


In [28]:
# Print the first lp parameter tensors one after another.
# NOTE(review): this emits a very large output; consider showing only shapes.
for weight in LP[:lp]:
    print(weight)

Parameter containing:
tensor([[-0.0017,  0.0217,  0.0018,  ...,  0.0123,  0.0066, -0.0169],
        [ 0.0335,  0.0168, -0.0130,  ..., -0.0471, -0.0011, -0.0299],
        [ 0.0061, -0.0276, -0.0130,  ..., -0.0003,  0.0143, -0.0031],
        ...,
        [-0.0287,  0.0274, -0.0210,  ..., -0.0178, -0.0024, -0.0146],
        [ 0.0090, -0.0190, -0.0040,  ..., -0.0345, -0.0337, -0.0455],
        [-0.0232, -0.0260,  0.0261,  ..., -0.0098, -0.0134, -0.0228]],
       requires_grad=True)
Parameter containing:
tensor([[ 0.0158, -0.0216, -0.0188,  ...,  0.0251,  0.0347,  0.0440],
        [ 0.0095,  0.0249,  0.0010,  ..., -0.0295, -0.0172, -0.0175],
        [ 0.0330,  0.0206,  0.0004,  ...,  0.0027, -0.0352, -0.0014],
        ...,
        [-0.0134,  0.0188, -0.0099,  ...,  0.0147, -0.0222,  0.0022],
        [ 0.0049,  0.0312, -0.0135,  ..., -0.0158,  0.0138, -0.0416],
        [ 0.0167, -0.0194,  0.0008,  ..., -0.0186, -0.0197, -0.0118]],
       requires_grad=True)
Parameter containing:
tensor([[ 0.

In [31]:
# Recount the model's parameters tensor by tensor. One row is printed per
# tensor in the form
#     <index> <size of each dimension ...> <element count>
# followed by the grand total on the last line.
#
# NOTE(review): `np` is kept as the accumulator name because it is an
# existing notebook global, but it shadows the conventional numpy alias --
# rename it if numpy is ever imported in this notebook.
np = 0
for p, param in enumerate(LP):
    # numel() is the product of all dimensions, so tensors of any rank are
    # counted correctly. Reading the shape directly removes the need for
    # the former bare `except:`, which hid every kind of error.
    L3 = param.numel()
    np += L3
    print(p, *param.shape, L3)

print(np)

0 52000 768 39936000
1 514 768 394752
2 1 768 768
3 768 768
4 768 768
5 768 768 589824
6 768 768
7 768 768 589824
8 768 768
9 768 768 589824
10 768 768
11 768 768 589824
12 768 768
13 768 768
14 768 768
15 3072 768 2359296
16 3072 3072
17 768 3072 2359296
18 768 768
19 768 768
20 768 768
21 768 768 589824
22 768 768
23 768 768 589824
24 768 768
25 768 768 589824
26 768 768
27 768 768 589824
28 768 768
29 768 768
30 768 768
31 3072 768 2359296
32 3072 3072
33 768 3072 2359296
34 768 768
35 768 768
36 768 768
37 768 768 589824
38 768 768
39 768 768 589824
40 768 768
41 768 768 589824
42 768 768
43 768 768 589824
44 768 768
45 768 768
46 768 768
47 3072 768 2359296
48 3072 3072
49 768 3072 2359296
50 768 768
51 768 768
52 768 768
53 768 768 589824
54 768 768
55 768 768 589824
56 768 768
57 768 768 589824
58 768 768
59 768 768 589824
60 768 768
61 768 768
62 768 768
63 3072 768 2359296
64 3072 3072
65 768 3072 2359296
66 768 768
67 768 768
68 768 768
69 768 768 589824
70 768 768
71 768 768

In [32]:
%%time
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = path,
    block_size = 128,
)



CPU times: user 22 s, sys: 150 ms, total: 22.1 s
Wall time: 22.1 s


In [33]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked language modelling with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [37]:
from transformers import Trainer, TrainingArguments

# Training run settings. Output goes to token_dir, i.e. the same directory
# that already holds the tokenizer files, and may overwrite its contents.
training_args = TrainingArguments(
    output_dir=token_dir,
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

# Tie together the model, the arguments, the MLM collator and the dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [38]:
%%time
# Run the training loop; the recorded run took about 1h 23min wall time
# and finished at global_step=26720.
trainer.train()

Step,Training Loss
500,3.28488
1000,3.081902
1500,2.985844
2000,2.891247
2500,2.835145
3000,2.785469
3500,2.723331
4000,2.663544
4500,2.601546
5000,2.570707


CPU times: user 1h 22min 29s, sys: 10.3 s, total: 1h 22min 40s
Wall time: 1h 22min 56s


TrainOutput(global_step=26720, training_loss=2.1733015026161056)

In [39]:
# Save the trained model into token_dir, next to the tokenizer files, so
# the directory can be loaded as a whole by the pipeline cell below.
trainer.save_model(token_dir)

In [41]:
from transformers import pipeline

# Create a fill-mask pipeline that loads both the model and the tokenizer
# from the directory written above.
fill_mask = pipeline('fill-mask', model=token_dir, tokenizer=token_dir)

Some weights of RobertaModel were not initialized from the model checkpoint at /home/aiffel-dj46/Desktop/dj_study/transformers/content/KantaiBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Ask the trained model to fill in the masked last word of the prompt.
# NOTE(review): "involes" looks like a typo for "involves"; the prompt is
# left unchanged so it still matches the recorded output.
fill_mask("Human thinking involes human <mask>")

[{'sequence': '<s>Human thinking involes human mind</s>',
  'score': 0.2295064628124237,
  'token': 986,
  'token_str': 'Ġmind'},
 {'sequence': '<s>Human thinking involes human reason</s>',
  'score': 0.22299349308013916,
  'token': 393,
  'token_str': 'Ġreason'},
 {'sequence': '<s>Human thinking involes human nature</s>',
  'score': 0.07136901468038559,
  'token': 586,
  'token_str': 'Ġnature'},
 {'sequence': '<s>Human thinking involes human soul</s>',
  'score': 0.034939032047986984,
  'token': 1633,
  'token_str': 'Ġsoul'},
 {'sequence': '<s>Human thinking involes human will</s>',
  'score': 0.022876087576150894,
  'token': 484,
  'token_str': 'Ġwill'}]