In [None]:
# Colab-only setup: mount Google Drive and switch into the project folder.
from google.colab import drive
# Mount the user's Drive under /content/drive/ (Colab may prompt for authorization).
drive.mount('/content/drive/')
# Work from the project directory: the later cells use relative paths
# ('BERT_data/training.txt', 'vocab.txt', 'values.tsv', 'names.tsv') and import
# the `transformer_model` package — presumably resolved from this folder.
%cd /content/drive/MyDrive/ERA_v1/Session17/dev4

Mounted at /content/drive/
/content/drive/MyDrive/ERA_v1/Session17/dev4


In [None]:
import torch
import random
import numpy as np
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn

In [None]:
from transformer_model.common_model import Transformer
from transformer_model.datamodules.bert_datamodule import SentencesDataset, create_sentences_and_vocab
from transformer_model.models.bert.bert_train import bert_train

Number of patches (N) with image height (H=224), width (W=224) and patch size (P=16): 196
Input shape (single 2D image): (224, 224, 3)
Output shape (single 2D image flattened into patches): (196, 768)


In [None]:
print('Initializing data for BERT...')

# --- Reproducibility -------------------------------------------------------
# Seed every RNG this notebook has imported so that "Restart & Run All" gives
# the same DataLoader shuffle order (driven by torch's global RNG) and,
# presumably, the same weight initialisation in the following cell.
# NOTE: some CUDA kernels are non-deterministic, so bit-exact repeatability on
# GPU is still not guaranteed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Hyperparameters -------------------------------------------------------
batch_size = 1024               # samples per optimisation step
seq_len = 20                    # passed to both SentencesDataset and Transformer
embed_size = 128                # embedding width handed to the Transformer
inner_ff_size = embed_size * 4  # feed-forward width, 4x the embedding width
n_heads = 8                     # forwarded as `n_heads` to the Transformer
n_code = 8                      # forwarded as `n_code` (presumably the number of encoder blocks - TODO confirm)
n_vocab = 40000                 # NOTE(review): not read by any visible cell; the model is sized with len(dataset.vocab)
dropout = 0.1
# n_workers = 12

# Keyword arguments unpacked into optim.Adam when the optimizer is built.
optim_kwargs = {'lr':1e-4, 'weight_decay':1e-4, 'betas':(.9,.999)}

#1) Configure text
print('Configuring Text...')
# Both paths are relative to the current working directory.
sentence_path = 'BERT_data/training.txt'
vocab_path = "vocab.txt"

sentences, vocab = create_sentences_and_vocab(sentence_path, vocab_path)
print('Creating Dataset...')
dataset = SentencesDataset(sentences, vocab, seq_len)
# shuffle: new sample order every epoch; drop_last: discard the final partial
# batch so every batch has exactly `batch_size` items; pin_memory: page-locked
# host buffers for faster host-to-GPU copies.
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)

Initializing data for BERT...
Configuring Text...
tokenizing sentences...
creating/loading vocab...
Creating Dataset...


In [None]:
print('Initializing BERT Transformer model...')
# The embedding table is sized from the vocabulary actually built by the
# dataset (len(dataset.vocab)).
model = Transformer(n_code=n_code, n_heads=n_heads, embed_size=embed_size,
                    inner_ff_size=inner_ff_size, n_embeddings=len(dataset.vocab),
                    seq_len=seq_len, dropout=dropout, algorithm="BERT")
# NOTE: requires a CUDA-capable runtime; this call fails on a CPU-only machine.
model = model.cuda()

print('Initializing Optimizer and Loss functions...')
optimizer = optim.Adam(model.parameters(), **optim_kwargs)
# Target positions equal to dataset.IGNORE_IDX contribute nothing to the loss.
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)

model = bert_train(model, optimizer, data_loader, loss_model)

print('Saving Embeddings...')
# Export at most the first 3000 embedding rows and their tokens. Clamp to the
# vocabulary size so a small vocabulary cannot make the token lookup below
# fail or leave values.tsv and names.tsv with different row counts.
N = min(3000, len(dataset.vocab))
embedding_rows = model.embeddings.weight.detach().cpu().numpy()[:N]
np.savetxt('values.tsv', np.round(embedding_rows, 2),
           delimiter='\t', fmt='%1.2f')
# dataset.rvocab is indexed by integer id here (presumably the id -> token
# mapping - TODO confirm); line i of names.tsv labels row i of values.tsv.
s = [dataset.rvocab[i] for i in range(N)]
# Context manager guarantees the handle is flushed and closed; explicit UTF-8
# keeps non-ASCII tokens intact regardless of the platform's default encoding.
with open('names.tsv', 'w', encoding='utf-8') as names_file:
    names_file.write('\n'.join(s))

print('Training end')

Initializing BERT Transformer model...
Initializing Optimizer and Loss functions...
Training BERT...
it: 0  | loss 10.24  | Δw: 1.14
it: 10  | loss 9.57  | Δw: 0.547
it: 20  | loss 9.33  | Δw: 0.358
it: 30  | loss 9.21  | Δw: 0.29
it: 40  | loss 9.0  | Δw: 0.235
it: 50  | loss 8.84  | Δw: 0.222
it: 60  | loss 8.66  | Δw: 0.203
it: 70  | loss 8.53  | Δw: 0.195
it: 80  | loss 8.35  | Δw: 0.181
it: 90  | loss 8.19  | Δw: 0.174
it: 100  | loss 8.08  | Δw: 0.17
it: 110  | loss 7.94  | Δw: 0.159
it: 120  | loss 7.8  | Δw: 0.15
it: 130  | loss 7.67  | Δw: 0.159
it: 140  | loss 7.55  | Δw: 0.142
it: 150  | loss 7.43  | Δw: 0.148
it: 160  | loss 7.33  | Δw: 0.14
it: 170  | loss 7.22  | Δw: 0.142
it: 180  | loss 7.11  | Δw: 0.136
it: 190  | loss 6.99  | Δw: 0.133
it: 200  | loss 6.97  | Δw: 0.133
it: 210  | loss 6.88  | Δw: 0.137
it: 220  | loss 6.82  | Δw: 0.134
it: 230  | loss 6.75  | Δw: 0.132
it: 240  | loss 6.72  | Δw: 0.138
it: 250  | loss 6.69  | Δw: 0.139
it: 260  | loss 6.59  | Δw: 0.14