**Connect to Google Drive**

In [1]:
# Mount the user's Google Drive under /content/gdrive/ so the notebook can read
# and write files stored there. force_remount=True asks Colab to mount again
# even when a mount from an earlier run is still present.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


**Set up runpath**

In [2]:
import os
import sys

In [3]:
# Work from the session folder on Drive, so that relative paths used further
# down (e.g. 'vocab.txt') resolve against it.
# NOTE(review): the local `transformer_model_scripts` package is presumably
# importable only because of this chdir — confirm.
curr_path = "/content/gdrive/MyDrive/ERA_V1/era_v1_session17"
os.chdir(curr_path)
# Keep the resolved working directory; it is reused to build the training-text path.
cwd = os.getcwd()
cwd  # last expression: shown as the cell output

'/content/gdrive/MyDrive/ERA_V1/era_v1_session17'

**Install libraries**

In [4]:
!pip install --quiet "torchinfo" "seaborn" "pytorch-lightning" "torchmetrics" "lightning-bolts" "torchtext" "datasets" "tokenizers" "transformers"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.6/774.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.8/300.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m829.5/829.5 kB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

**Import libraries**

In [5]:
import torch.nn.functional as F
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn
import numpy as np
import torch
import math
import re

# user scripts
from transformer_model_scripts.BERT_utils import SentencesDataset, get_batch
from transformer_model_scripts.transformer_models import BERT


**Set up parameters**

In [6]:
# Hyper-parameters for the run. Everything a reader might want to tune lives here.
print('initializing..')

# --- batching / sequence geometry ---
batch_size, seq_len = 1024, 20

# --- network dimensions ---
embed_size = 128
inner_ff_size = 4 * embed_size   # derived from embed_size (evaluates to 512)
n_heads = 8
n_code = 8                       # first argument handed to BERT(...); presumably the encoder depth — TODO confirm
dropout = 0.1

# --- vocabulary ---
n_vocab = 40000                  # cap on how many of the most frequent words are kept
# n_workers = 12

# --- optimizer settings, unpacked into optim.Adam ---
optim_kwargs = dict(lr=1e-4, weight_decay=1e-4, betas=(.9, .999))

initializing..


**Set up data and dataloader**

In [8]:
# Sanity check: list the current working directory to confirm that the
# 'bert_data' folder and the 'transformer_model_scripts' package are present.
os.listdir()


['pizza_steak_sushi.zip',
 'models',
 'download.jpeg',
 'gpt_data',
 'vit_data',
 'checkpoint',
 'bert_data',
 'S17_Vit.ipynb',
 'S17_Gpt.ipynb',
 'S17_Bert.ipynb',
 'transformer_model_scripts']

In [11]:
# Load the raw corpus: the file is lower-cased and split into one sentence per line.
# The file is opened in a `with` block (so the handle is always closed) and
# decoded explicitly as UTF-8 rather than with the platform default.
print('Loading training text...')
bert_path = f'{cwd}/bert_data/training.txt'
with open(bert_path, encoding='utf-8') as corpus_file:
    sentences = corpus_file.read().lower().split('\n')

# Tokenize: surround every special character with spaces so it becomes its own
# token, split on single spaces, then drop the empty strings the split leaves behind.
print('Tokenizing sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
# Compile once instead of rebuilding the pattern for every sentence.
special_char_re = re.compile(f'[{re.escape(special_chars)}]')
# Raw string: '\g<0>' is the regex back-reference to the whole match, not a string escape.
sentences = [special_char_re.sub(r' \g<0> ', s).split(' ') for s in sentences]
sentences = [[w for w in s if len(w)] for s in sentences]

# Create the vocabulary, or reuse the cached one if 'vocab.txt' already exists.
# NOTE: the cache is keyed on the file name only; delete 'vocab.txt' when the
# corpus or n_vocab changes, otherwise a stale vocabulary is silently reused.
print('Creating/loading vocab...')
pth = 'vocab.txt'
if not exists(pth):
    # keep the n_vocab most frequent words, most frequent first
    word_counts = Counter(w for s in sentences for w in s)
    vocab = [word for word, _ in word_counts.most_common(n_vocab)]
    with open(pth, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write('\n'.join(vocab))
else:
    with open(pth, encoding='utf-8') as vocab_file:
        vocab = vocab_file.read().split('\n')

# Dataset and dataloader. num_workers is not passed, so the DataLoader default is used.
print('Creating dataset/dataloader...')
dataset = SentencesDataset(sentences, vocab, seq_len)
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)


Loading training text...
Tokenizing sentences...
Creating/loading vocab...
Creating dataset/dataloader...


**Model**

In [12]:
# Build the network and move it to the GPU in one expression.
# The positional arguments are passed in the order the BERT constructor is
# called with in this notebook; their exact meaning is defined in
# transformer_model_scripts.transformer_models.
print('Initializing model...')
model = BERT(
    n_code,
    n_heads,
    embed_size,
    inner_ff_size,
    len(dataset.vocab),
    seq_len,
    dropout,
).cuda()

# Adam over all model parameters, plus a cross-entropy loss that skips
# positions whose target equals dataset.IGNORE_IDX.
print('Initializing optimizer and loss...')
optimizer = optim.Adam(params=model.parameters(), **optim_kwargs)
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)


Initializing model...
Initializing optimizer and loss...


**Train Model**

In [14]:
# Train
# Runs n_iteration optimisation steps. Each step pulls a batch, feeds its
# 'input' tensor through the model, and minimises cross-entropy against its
# 'target' tensor. Every print_each steps the loss and the summed absolute
# gradient of the embedding weights are printed.
print('Training...')
print_each = 10
model.train()
batch_iter = iter(data_loader)
n_iteration = 10000
for it in range(n_iteration):

    # Reset gradients at the START of the step. If this cell is interrupted
    # after backward() and then re-run, gradients left over from the aborted
    # step would otherwise be accumulated into the first update of the new run.
    optimizer.zero_grad()

    # get batch
    # NOTE(review): get_batch presumably restarts the iterator when the loader
    # is exhausted — confirm in BERT_utils.
    batch, batch_iter = get_batch(data_loader, batch_iter)

    # forward pass on the GPU
    masked_input = batch['input'].cuda(non_blocking=True)
    masked_target = batch['target'].cuda(non_blocking=True)
    output = model(masked_input)

    # compute the cross entropy loss:
    # flatten predictions to (positions, last_dim) and targets to (positions,).
    # view(-1) always yields a 1-D tensor, whereas view(-1, 1).squeeze() would
    # collapse to a 0-d tensor when there is exactly one target element.
    output_v = output.view(-1, output.shape[-1])
    target_v = masked_target.view(-1)
    loss = loss_model(output_v, target_v)

    # compute gradients
    loss.backward()

    # apply gradients
    optimizer.step()

    # print step (gradients are still populated here; they are cleared at the
    # top of the next iteration)
    if it % print_each == 0:
        print('iter:', it,
              ' | loss', np.round(loss.item(),2),
              ' | dw:', round(model.embeddings.weight.grad.abs().sum().item(),3))



Training...
iter: 0  | loss 8.76  | dw: 0.27
iter: 10  | loss 8.56  | dw: 0.144
iter: 20  | loss 8.43  | dw: 0.134
iter: 30  | loss 8.26  | dw: 0.13
iter: 40  | loss 8.09  | dw: 0.122
iter: 50  | loss 7.99  | dw: 0.116
iter: 60  | loss 7.83  | dw: 0.11
iter: 70  | loss 7.7  | dw: 0.103
iter: 80  | loss 7.55  | dw: 0.097
iter: 90  | loss 7.49  | dw: 0.095
iter: 100  | loss 7.4  | dw: 0.095
iter: 110  | loss 7.27  | dw: 0.091
iter: 120  | loss 7.12  | dw: 0.088
iter: 130  | loss 7.07  | dw: 0.085
iter: 140  | loss 6.93  | dw: 0.081
iter: 150  | loss 6.91  | dw: 0.081
iter: 160  | loss 6.82  | dw: 0.079
iter: 170  | loss 6.75  | dw: 0.075
iter: 180  | loss 6.67  | dw: 0.075
iter: 190  | loss 6.68  | dw: 0.073
iter: 200  | loss 6.6  | dw: 0.072
iter: 210  | loss 6.63  | dw: 0.077
iter: 220  | loss 6.54  | dw: 0.079
iter: 230  | loss 6.6  | dw: 0.079
iter: 240  | loss 6.49  | dw: 0.077
iter: 250  | loss 6.43  | dw: 0.084
iter: 260  | loss 6.49  | dw: 0.079
iter: 270  | loss 6.39  | dw: 0.08

**Results**

In [15]:
# =============================================================================
# Results analysis
# =============================================================================
# Export the first N rows of the embedding matrix (values.tsv, tab-separated,
# two decimals) together with the matching vocabulary entries (names.tsv, one
# per line).
print('saving embeddings...')
# Export at most 3000 entries, but never more than the vocabulary holds, so the
# rvocab lookup below cannot run past the end on a small corpus.
# NOTE(review): assumes dataset.rvocab is indexable by 0..len(dataset.vocab)-1 — confirm.
N = min(3000, len(dataset.vocab))
embedding_rows = model.embeddings.weight.detach().cpu().numpy()[0:N]
np.savetxt('values.tsv', np.round(embedding_rows, 2), delimiter='\t', fmt='%1.2f')
s = [dataset.rvocab[i] for i in range(N)]
# `with` guarantees the file is flushed and closed; UTF-8 keeps non-ASCII tokens intact.
with open('names.tsv', 'w', encoding='utf-8') as names_file:
    names_file.write('\n'.join(s))


print('end')


saving embeddings...
end
