In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# in the same order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
! pip install --quiet "pandas" "torch" "torchvision" "ipython[notebook]" "seaborn" "pytorch-lightning>=1.4" "torchmetrics>=0.6" "lightning-bolts" "torch-lr-finder" "grad-cam" "gradio" "torchinfo"

In [3]:
! git clone https://github.com/niharikavadapalli/ERA-V1.git
!git -C ERA-V1 pull

import sys
sys.path.append("ERA-V1/ERA-V1-Session-17")

Cloning into 'ERA-V1'...
remote: Enumerating objects: 934, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 934 (delta 68), reused 82 (delta 32), pack-reused 808[K
Receiving objects: 100% (934/934), 54.93 MiB | 28.75 MiB/s, done.
Resolving deltas: 100% (467/467), done.
Already up to date.


In [10]:
from torch.utils.data import Dataset
import torch.nn.functional as F
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn
import numpy as np
import random
import torch
import math
import re
from transformer import BERT

In [7]:
# List the current working directory (debug aid; the recorded output shows the ERA-V1 checkout).
%ls

[0m[01;34mERA-V1[0m/


In [14]:
# =============================================================================
# Dataset
# =============================================================================
class SentencesDataset(Dataset):
    """Masked-token dataset built from pre-tokenized sentences.

    Each item is a fixed-length window of ``seq_len`` token ids that starts at
    sentence ``index`` and keeps consuming the following sentences (wrapping
    around the corpus) until the window is full. Every position is then masked
    independently with probability ``p_random_mask``.

    Attributes:
        sentences: the sequence of tokenized sentences (each a list of words).
        vocab: dict mapping word -> id; the three special tags ``<ignore>``,
            ``<oov>`` and ``<mask>`` are appended after the supplied words.
        rvocab: dict mapping id -> word (inverse of ``vocab``).
        seq_len: number of token ids in every returned item.
        IGNORE_IDX / OUT_OF_VOCAB_IDX / MASK_IDX: ids of the special tags.
    """

    #Init dataset
    def __init__(self, sentences, vocab, seq_len):
        """Store the corpus and build the word <-> id lookup tables.

        Args:
            sentences: sequence of sentences, each a list of word strings.
            vocab: list of known words; ids follow list order.
            seq_len: length of the token window produced by ``__getitem__``.
        """
        self.sentences = sentences
        self.vocab = {word: idx for idx, word in enumerate(vocab + ['<ignore>', '<oov>', '<mask>'])}
        self.rvocab = {idx: word for word, idx in self.vocab.items()}
        self.seq_len = seq_len

        #special tags
        self.IGNORE_IDX = self.vocab['<ignore>'] #replacement tag for tokens to ignore
        self.OUT_OF_VOCAB_IDX = self.vocab['<oov>'] #replacement tag for unknown words
        self.MASK_IDX = self.vocab['<mask>'] #replacement tag for the masked word prediction task

    #fetch data
    def __getitem__(self, index, p_random_mask=0.15):
        """Return one masked training example.

        Args:
            index: index of the sentence the window starts at.
            p_random_mask: per-token probability of replacing the token with
                ``MASK_IDX`` in the input.

        Returns:
            dict with two int64 tensors of length ``seq_len``:
            ``'input'`` holds the token ids with masked positions set to
            ``MASK_IDX``; ``'target'`` holds the original id at masked
            positions and ``IGNORE_IDX`` everywhere else.

        Raises:
            ValueError: if a full pass over the corpus yields no tokens, which
                would otherwise make the fill loop run forever.
        """
        #concatenate consecutive sentences until the window is full
        tokens = []
        empty_streak = 0
        while len(tokens) < self.seq_len:
            sentence_ids = self.get_sentence_idx(index % len(self))
            if sentence_ids:
                empty_streak = 0
            else:
                empty_streak += 1
                if empty_streak >= len(self):
                    raise ValueError('cannot build a sequence: every sentence in the dataset is empty')
            tokens.extend(sentence_ids)
            index += 1

        #the loop guarantees at least seq_len tokens, so truncating is enough (no padding needed)
        tokens = tokens[:self.seq_len]

        #apply random mask: exactly one random draw per token, in order
        inputs, targets = [], []
        for token in tokens:
            if random.random() < p_random_mask:
                inputs.append(self.MASK_IDX)
                targets.append(token)
            else:
                inputs.append(token)
                targets.append(self.IGNORE_IDX)

        #build integer tensors directly; going through a float tensor would corrupt ids above 2**24
        return {'input': torch.tensor(inputs, dtype=torch.long),
                'target': torch.tensor(targets, dtype=torch.long)}

    #return length
    def __len__(self):
        """Number of sentences in the corpus."""
        return len(self.sentences)

    #get words id
    def get_sentence_idx(self, index):
        """Return the ids of sentence ``index``, mapping unknown words to ``OUT_OF_VOCAB_IDX``."""
        return [self.vocab.get(word, self.OUT_OF_VOCAB_IDX) for word in self.sentences[index]]

# =============================================================================
# Methods / Class
# =============================================================================
def get_batch(loader, loader_iter):
    """Fetch the next batch, restarting the loader once its iterator runs dry.

    Args:
        loader: iterable that can be re-iterated (e.g. a DataLoader).
        loader_iter: the iterator currently being consumed.

    Returns:
        A ``(batch, iterator)`` pair; the iterator is a fresh one created from
        ``loader`` if the supplied one was exhausted, otherwise the same object.
    """
    exhausted = object()
    batch = next(loader_iter, exhausted)
    if batch is exhausted:
        # current pass is over: begin a new one and take its first batch
        loader_iter = iter(loader)
        batch = next(loader_iter)
    return batch, loader_iter

# =============================================================================
# #Init
# =============================================================================
print('initializing..')

# Seed the RNGs used by this notebook so a Restart & Run All reproduces the same run:
# `random` drives the token masking in the dataset, torch drives loader shuffling,
# weight initialisation and dropout.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

batch_size = 1024                 # samples per DataLoader batch
seq_len = 20                      # tokens per training sequence (dataset window and model input length)
embed_size = 128                  # passed to BERT as the embedding size
inner_ff_size = embed_size * 4    # passed to BERT as the inner feed-forward size
n_heads = 8                       # passed to BERT; presumably the number of attention heads -- confirm in transformer.py
n_code = 8                        # passed to BERT; presumably the number of encoder blocks -- confirm in transformer.py
n_vocab = 40000                   # keep at most this many of the most frequent words when building the vocab
dropout = 0.1                     # passed to BERT as the dropout setting
# n_workers = 12

#optimizer
optim_kwargs = {'lr':1e-4, 'weight_decay':1e-4, 'betas':(.9,.999)}

# =============================================================================
# Input
# =============================================================================
#1) load text: one lower-cased sentence per line
print('loading text...')
pth = 'ERA-V1/ERA-V1-Session-17/BERT/training.txt'
with open(pth, encoding='utf-8') as corpus_file:  # context manager closes the handle
    sentences = corpus_file.read().lower().split('\n')

#2) tokenize sentences (can be done during training, you can also use spacy udpipe)
print('tokenizing sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
# compile once instead of rebuilding the pattern for every sentence
special_char_re = re.compile(f'[{re.escape(special_chars)}]')
# surround every special character with spaces so it becomes its own token;
# raw string avoids the invalid "\g" escape-sequence warning
sentences = [special_char_re.sub(r' \g<0> ', s).split(' ') for s in sentences]
# drop the empty strings produced by consecutive spaces
sentences = [[w for w in s if len(w)] for s in sentences]

#3) create vocab if not already created
print('creating/loading vocab...')
pth = 'ERA-V1/ERA-V1-Session-17/BERT/vocab.txt'
if not exists(pth):
    words = [w for s in sentences for w in s]
    #keep the N most frequent words
    vocab = [word for word, _count in Counter(words).most_common(n_vocab)]
    with open(pth, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write('\n'.join(vocab))
else:
    with open(pth, encoding='utf-8') as vocab_file:
        vocab = vocab_file.read().split('\n')

#4) create dataset
print('creating dataset...')
dataset = SentencesDataset(sentences, vocab, seq_len)
# num_workers is left at the DataLoader default (the original 12-worker setting is disabled).
# pin_memory only helps host->GPU copies, so enable it only when CUDA is actually available.
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':torch.cuda.is_available(), 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)


# =============================================================================
# Model
# =============================================================================
#init model
print('initializing model...')
# Use the GPU when one is present and fall back to the CPU otherwise, instead of
# unconditionally calling .cuda() (which raises on CPU-only machines).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERT(n_code, n_heads, embed_size, inner_ff_size, len(dataset.vocab), seq_len, dropout)
model = model.to(device)

# =============================================================================
# Optimizer
# =============================================================================
print('initializing optimizer and loss...')
optimizer = optim.Adam(model.parameters(), **optim_kwargs)
# positions whose target is IGNORE_IDX (i.e. unmasked tokens) do not contribute to the loss
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)

# =============================================================================
# Train
# =============================================================================
print('training...')
print_each = 10      # log every print_each iterations
model.train()
batch_iter = iter(data_loader)
n_iteration = 10000  # total number of optimizer steps
for it in range(n_iteration):

    #get batch (the loader is restarted transparently when exhausted)
    batch, batch_iter = get_batch(data_loader, batch_iter)

    #infer
    masked_input = batch['input'].to(device, non_blocking=True)
    masked_target = batch['target'].to(device, non_blocking=True)
    output = model(masked_input)

    #compute the cross entropy loss
    # flatten to (num_tokens, last_dim) / (num_tokens,); assumes the last dim of `output`
    # is the vocabulary size -- TODO confirm against transformer.BERT.
    # view(-1) always yields a 1-D target, unlike view(-1,1).squeeze(), which collapses to
    # 0-d when there is a single element.
    output_v = output.view(-1, output.shape[-1])
    target_v = masked_target.view(-1)
    loss = loss_model(output_v, target_v)

    #compute gradients
    loss.backward()

    #apply gradients
    optimizer.step()

    #print step
    # NOTE: the value labelled 'Δw' is the summed absolute gradient of the embedding weights,
    # so it must be read before the gradients are reset below.
    if it % print_each == 0:
        print('it:', it, 
              ' | loss', np.round(loss.item(),2),
              ' | Δw:', round(model.embeddings.weight.grad.abs().sum().item(),3))

    #reset gradients
    optimizer.zero_grad()
    


initializing..
loading text...
tokenizing sentences...
creating/loading vocab...
creating dataset...
initializing model...
initializing optimizer and loss...
training...
it: 0  | loss 10.26  | Δw: 1.147
it: 10  | loss 9.58  | Δw: 0.543
it: 20  | loss 9.33  | Δw: 0.336
it: 30  | loss 9.16  | Δw: 0.28
it: 40  | loss 9.06  | Δw: 0.236
it: 50  | loss 8.89  | Δw: 0.213
it: 60  | loss 8.74  | Δw: 0.2
it: 70  | loss 8.59  | Δw: 0.186
it: 80  | loss 8.46  | Δw: 0.176
it: 90  | loss 8.24  | Δw: 0.168
it: 100  | loss 8.1  | Δw: 0.162
it: 110  | loss 7.95  | Δw: 0.157
it: 120  | loss 7.87  | Δw: 0.145
it: 130  | loss 7.69  | Δw: 0.139
it: 140  | loss 7.52  | Δw: 0.141
it: 150  | loss 7.49  | Δw: 0.134
it: 160  | loss 7.38  | Δw: 0.129
it: 170  | loss 7.28  | Δw: 0.127
it: 180  | loss 7.17  | Δw: 0.133
it: 190  | loss 7.08  | Δw: 0.122
it: 200  | loss 6.93  | Δw: 0.122
it: 210  | loss 6.86  | Δw: 0.121
it: 220  | loss 6.76  | Δw: 0.124
it: 230  | loss 6.74  | Δw: 0.12
it: 240  | loss 6.74  | Δw: 0

KeyboardInterrupt: 

In [15]:
# =============================================================================
# Results analysis
# =============================================================================
print('saving embeddings...')
N = 3000  # upper bound on the number of embedding rows / words to export
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()
# Clamp to what actually exists so a vocabulary smaller than N does not raise KeyError
# after values.tsv has already been written, and both files keep the same row count.
n_rows = min(N, len(embedding_matrix), len(dataset.rvocab))
# values.tsv: one tab-separated embedding vector per row, rounded to two decimals
np.savetxt('values.tsv', np.round(embedding_matrix[0:n_rows], 2), delimiter='\t', fmt='%1.2f')
# names.tsv: the word for each exported row, in the same order
s = [dataset.rvocab[i] for i in range(n_rows)]
with open('ERA-V1/ERA-V1-Session-17/BERT/names.tsv', 'w+', encoding='utf-8') as names_file:
    names_file.write('\n'.join(s))


print('end')

saving embeddings...
end
