In [1]:
!pip install GPUtil

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hBuilding wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l- \ done
[?25h  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7411 sha256=9e3d48d6bfda6b53ec2eff46bd881789569a669f2d121d618fb6c0b7be9f635c
  Stored in directory: /root/.cache/pip/wheels/6e/f8/83/534c52482d6da64622ddbf72cd93c35d2ef2881b78fd08ff0c
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
[0m

In [2]:
import logging
  
def getlogger(logfile):
    logger = logging.getLogger()
    logger.handlers = []
    logger.setLevel(logging.INFO)
    ch = logging.FileHandler(logfile, 'w')
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    return logger

In [3]:
from bpe_dataset_fft import BPEDataset
from bpe_tokenizer import BPETokenizer
from mixed_transformer_lm import TransformerLM
from utils import get_coverage, get_bleu_score
import torch.nn as nn
from torch.utils.data import random_split
import torch
from tqdm import tqdm
from datetime import datetime
import os
from sklearn.metrics import accuracy_score
from GPUtil import showUtilization as gpu_usage
import pickle
import numpy as np

In [4]:
max_vocab = 10000
max_len1 = 1024
max_len2 = 128

In [5]:
tokenizer = BPETokenizer(None, 
                         '../input/bpe-vocab/BPE_vocab_10000.pickle',
                         max_vocab,
                         drop=0.0)

In [6]:
bpedataset = BPEDataset('../input/onebigfile-train/onebigfile_train.npy',
                        '../input/onebigfile-validation/onebigfile_validation.npy',
                        max_len1,
                        max_len2,
                        batch_size = 32,
                        shuffle = True)
train, validation, train_loader, validation_loader = bpedataset.reset()

In [7]:
len(train)

116782

In [8]:
tokenizer.detokenize(train[0][0])

"early strikes from joel matip and eric maxim choupo-moting led schalke to a 2-1 victory over bitter rivals borussia dortmund in the ruhr derby .pierre-emerick aubameyang pulled one back for last season 's runners-ups but jurgen klopp 's men could not conjure up a crucial equaliser .dortmund captain mats hummels made his first start of the season after recovering from injury ,but the defender made a nightmare start as matip gave schalke a 10th-minute lead .joel matip (32 )celebrates his opening goal against borussia dortmund with his schalke team-mates .eric maxim choupo-moting celebrates after scoring in a huge win for the home side in gelsenkirchen .atsuto uchida is lifted into the air by matip and dennis aogo in among the wild celebrations .jens keller has been under pressure after a poor start at schalke but had reason to be happy here .schalke 's players ,led by man-of-the-match and keeper ralf faehrmann (centre ),jump for joy after victory .the schalke supporters salute their pla

In [9]:
tokenizer.detokenize(train[0][1])

"cameroonian duo matip and choupo-moting clinch derby victory for schalke .pierre-emerick aubameyang replies in vain for dortmund .a third defeat in six league games for jurgen klopp 's men already .schalke rise to seventh ahead of a champions league clash with maribor on tuesday .dortmund travel to anderlect in europe the following day ."

In [10]:
len(train[0][0]), len(train[0][0]), len(train[0][1])

(697, 697, 103)

In [11]:
# def getcounts(dataset, bos_idx, eos_idx):
#     counts = [1 for _ in range(max_vocab)]
#     counts = np.array(counts)
#     for i in range(len(dataset)):
#         for k in dataset[i][0]:
#             counts[k] += 1
#     counts[bos_idx] = len(dataset)
#     counts[eos_idx] = len(dataset)
#     return counts
# counts = getcounts(train, tokenizer.bos_idx, tokenizer.eos_idx)
# counts

In [12]:
# counts[:30], tokenizer.vocab[:30]

In [13]:
# from matplotlib import pyplot as plt
# plt.plot(counts[:100])

In [14]:
device = 'cuda'

In [15]:
# class LDAMLoss(nn.Module):
#     def __init__(self, num_list, device, max_m=0.5):
#         super().__init__()
#         delta = 1.0 / np.sqrt(np.sqrt(num_list))
#         delta = delta / np.max(delta) * max_m
#         self.delta = torch.tensor(delta, device=device)
#         self.loss = nn.CrossEntropyLoss()
    
#     def forward(self, x, target):
#         '''
#           x = (N * L) * vocab_size
#           target = (N * L)
#         '''
#         N, _ = x.shape
#         x_m = torch.zeros_like(x, dtype=torch.float64, device=x.device)
#         x_m.scatter_(-1, target.reshape(N, -1), 
#                      self.delta[target].reshape(N, -1))
#         x = x - x_m
#         return self.loss(x, target)

In [16]:
# ldamloss = LDAMLoss(counts, device)

In [17]:
model = TransformerLM(vocab_size = max_vocab,
                      max_len1 = max_len1,
                      max_len2 = max_len2,
                      dmodel = 256,
                      dk = 64,
                      dhead = 4,
                      dff = 512,
                      p = 0.1,
                      nlayer = 3,
                      kernel_type = 'vanilla')

In [18]:
class Trainer():
    def __init__(self, config, model, device):
        self.config = config
        self.device = device
        self.model = model.to(device)
        self.opt = torch.optim.Adam(self.model.parameters(), lr=config['lr'])
        self.log_dir = config['log_dir']
        self.ckpt_dir = config['ckpt_dir']
        os.makedirs(self.log_dir, exist_ok=True)
        os.makedirs(self.ckpt_dir, exist_ok=True)
    
    def evalulate(self, validation_loader):
        self.model.eval()
        with torch.no_grad():
            
            output_all = []
            z_all = []
            w_all = []
            
            for x, y, w in validation_loader:
                z = y[:, 1:]
                x = x.to(self.device)
                y = y.to(self.device)
                output = self.model(x, y[:,:-1])                
                output = torch.argmax(output, dim=-1).detach().cpu().numpy()
                torch.cuda.empty_cache()
                
                for i in range(output.shape[0]):
                    output_all.append(output[i][w[i]])
                    z_all.append(z[i][w[i]])
             
            score = get_bleu_score(tokenizer, output_all, z_all)
            print(tokenizer.detokenize(output_all[0]),
                  tokenizer.detokenize(z_all[0]))
        return score
    
    def train(self, bpedataset, ldamloss=None):
        time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        ckpt_dir = os.path.join(self.ckpt_dir, f"_{time}")
        os.makedirs(ckpt_dir)
        
        log_dir = os.path.join(self.log_dir, f"_{time}")
        os.makedirs(log_dir)
        logger = getlogger(os.path.join(log_dir, 'log.txt'))
        with open(os.path.join(log_dir,"config.pkl"),'wb') as fid:
            pickle.dump(self.config, fid)
        
        # determine whether we use ldam
        if ldamloss is None:
            criteria = nn.CrossEntropyLoss()
        else:
            criteria = ldamloss
            
        for epoch in range(self.config['epoches']):
            
            print('starting epoch {epoch}')
            _, _, train_loader, validation_loader = bpedataset.reset()
            
            # disable progress for kaggle, it's not very friendly
            #progress_bar = tqdm(train_loader)
            #progress_bar.set_description(f"epoch {epoch}")
            self.model.train()
            
            train_loss_sum = 0
            #for i, (x, y, w) in enumerate(progress_bar):
            for i, (x, y, w) in enumerate(train_loader):
                x = x.to(self.device)
                y = y.to(self.device)
                w = w.to(self.device)
                z = y[:, 1:]  # this is the label
                
                self.opt.zero_grad()
                output = self.model(x, y[:,:-1])
                train_loss = criteria(output[w], z[w])
                train_loss.backward()
                self.opt.step()                
                train_loss = train_loss.item()                
                torch.cuda.empty_cache()
                
                train_loss_sum += train_loss
                if (i % 100 == 0):
                    #progress_bar.set_postfix({"train_loss":train_loss})
                    print({"train_loss":train_loss})
                    torch.cuda.empty_cache()
            
            validation_accu = self.evalulate(validation_loader)
            print(f"{epoch}: validation {validation_accu}")
            
            train_loss_sum /= len(train_loader)
            logger.info(f"epoch{epoch} train loss: {train_loss_sum} vali accu: {validation_accu}")
#             if epoch % 10 == 0 or epoch == self.config['epoches'] - 1:
            torch.save({'model': self.model.state_dict(),
                        'opt': self.opt.state_dict()},
                       os.path.join(ckpt_dir, f"model_{epoch}.pickle"))

In [19]:
for name, p in model.named_parameters():
    print(name, p.shape)

encoder_layers.0.ff.fc1.weight torch.Size([512, 256])
encoder_layers.0.ff.fc1.bias torch.Size([512])
encoder_layers.0.ff.fc2.weight torch.Size([256, 512])
encoder_layers.0.ff.fc2.bias torch.Size([256])
encoder_layers.0.norm1.weight torch.Size([256])
encoder_layers.0.norm1.bias torch.Size([256])
encoder_layers.0.norm2.weight torch.Size([256])
encoder_layers.0.norm2.bias torch.Size([256])
encoder_layers.1.ff.fc1.weight torch.Size([512, 256])
encoder_layers.1.ff.fc1.bias torch.Size([512])
encoder_layers.1.ff.fc2.weight torch.Size([256, 512])
encoder_layers.1.ff.fc2.bias torch.Size([256])
encoder_layers.1.norm1.weight torch.Size([256])
encoder_layers.1.norm1.bias torch.Size([256])
encoder_layers.1.norm2.weight torch.Size([256])
encoder_layers.1.norm2.bias torch.Size([256])
encoder_layers.2.ff.fc1.weight torch.Size([512, 256])
encoder_layers.2.ff.fc1.bias torch.Size([512])
encoder_layers.2.ff.fc2.weight torch.Size([256, 512])
encoder_layers.2.ff.fc2.bias torch.Size([256])
encoder_layers.2.n

In [20]:
config = {'epoches':50, 'log_dir':'log_dir', 'ckpt_dir':'ckpt_dir', 'lr':0.5e-3}

In [21]:
torch.cuda.empty_cache()
trainer = Trainer(config, model, device)
trainer.train(bpedataset, ldamloss=None)

starting epoch {epoch}


  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass


{'train_loss': 9.87405014038086}
{'train_loss': 8.025614738464355}
{'train_loss': 7.65108585357666}
{'train_loss': 7.597559452056885}
{'train_loss': 7.429440498352051}
{'train_loss': 7.382816791534424}
{'train_loss': 7.387852191925049}
{'train_loss': 7.407773971557617}
{'train_loss': 7.225297451019287}
{'train_loss': 7.119295597076416}
{'train_loss': 7.094257354736328}
{'train_loss': 7.006314277648926}
{'train_loss': 6.9828009605407715}
{'train_loss': 6.910526275634766}
{'train_loss': 6.807522296905518}
{'train_loss': 6.800410270690918}
{'train_loss': 6.760197639465332}
{'train_loss': 6.771595478057861}
{'train_loss': 6.6427435874938965}
{'train_loss': 6.594768524169922}
{'train_loss': 6.774539947509766}
{'train_loss': 6.497011184692383}
{'train_loss': 6.549858093261719}
{'train_loss': 6.6190032958984375}
{'train_loss': 6.532458782196045}
{'train_loss': 6.607478141784668}
{'train_loss': 6.539410591125488}
{'train_loss': 6.506620407104492}
{'train_loss': 6.537595272064209}
{'train_loss'