In [1]:
import os
import sys
import argparse
from typing import Tuple
from tensorboardX import SummaryWriter
import copy
import time
import math
import numpy as np
import pickle

# Progressbar
from tqdm import tqdm

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Highlevel from Pytorch
import torch as T
from torch import nn, Tensor
import torch.optim as opt

# Neural Network parts from Pytorch
from torch.nn import TransformerEncoder, TransformerEncoderLayer, init
import torch.nn.functional as F

# Pytorch's Dataset and Dataloader
from torch.utils.data import dataset
from torch.utils.data import DataLoader

# Dataset used
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Custom Libraries
import utils
device = T.device('cuda' if T.cuda.is_available() else 'cpu')
batch_size = 20
eval_batch_size = 10

train_iter, test_iter, val_iter = WikiText2.iters(batch_size = batch_size, device = device)


In [2]:
training_batch = next(iter(train_iter))
print(training_batch.text.size())
print(training_batch.target.size())

torch.Size([35, 20])
torch.Size([35, 20])


In [3]:
min_list = []
max_list = []
for data in train_iter:
    for index in range(data.text.size()[1]):
        min_list.append(min(data.text[index]))
        max_list.append(max(data.text[index]))

In [4]:
print(max(max_list))
print(min(min_list))

tensor(33278)
tensor(0)


In [5]:

# Use a Parser to specify Hyperparams etc.
parser = argparse.ArgumentParser()
# TODO: Think about adding the seed or experiment number
# TODO: Change the {utils.checkdir(f"{os.getcwd()}/some/path/")} expressions to something like
# TODO: {utils.checkdir(f"{os.getcwd()}/some/path/Seed{seed}/")} or {utils.checkdir(f"{os.getcwd()}/some/path/Experiment{experiment}/")}
# TODO: Obviously change the save commands as well
# Set Hyperparams for Batches
parser.add_argument("--batch_size", type=int, default=20, help="The Batchsize used for Training")
parser.add_argument("--eval_batch_size", type=int, default=10, help="The Batchsize used for Evaluation")
parser.add_argument("--bptt", type=int, default=35, help="The Length of Backpropagation through Time")
# Set Hyperparams specifying the Model
parser.add_argument("--ntokens", type=int, default=33280, help="The Number of Tokens used by the Model")
parser.add_argument("--emsize", type=int, default=200, help="The Embedding Dimension used by the Model")
parser.add_argument("--d_hid", type=int, default=200, help="The Dimension of the FFN Model used in the Encoder")
parser.add_argument("--nlayers", type=int, default=2, help="The Number of Encoderlayers used in the Encoder")
parser.add_argument("--nhead", type=int, default=2, help="The Number of Heads used in the Multihead-Attention")
parser.add_argument("--dropout", type=float, default=0.2, help="The Dropout Probability used in the Model")
# Set Hyperparams defining the Pruning Procedure
# TODO: Think about adding rewind option and number of warmup steps
# Facebook Paper uses num_prune_cycles = 20 and prune_percent = 20. as well as 50,000 updates (overall?)
parser.add_argument("--num_prune_cycles", type=int, default=20, help="The Number of Pruning Cycles")
parser.add_argument("--num_epochs_prune", type=int, default=50, help="The Number of Epochs per Pruning Cycle")
parser.add_argument("--prune_percent", type=float, default=20., help="The Percentage of remaining Weights to be pruned in each Iteration")
parser.add_argument("--print_freq_prune", type=int, default=1, help="The Printing-Frequency of Train- and Test Loss during Pruning")
parser.add_argument("--test_freq_prune", type=int, default=1, help="The Testing Frequency during Pruning")
# Set Hyperparams defining the Reintroduction Procedure
# TODO: Think about adding choice option (selecting reintroduction scheme)
parser.add_argument("--num_epochs_reint", type=int, default=50, help="The Number of Epochs per Reintialisation")
parser.add_argument("--print_freq_reint", type=int, default=1, help="The Printing Frequency of Train- and Test Loss durinig Reinitialisation")
parser.add_argument("--test_freq_reint", type=int, default=1, help="The Testing Frequency during Reinitialisation")
# TODO: Think about adding LR, the Factor used in scheduler, etc.
parser.add_argument("-v", "--verbosity", action="count", default=0)
args = parser.parse_args()


usage: ipykernel_launcher.py [-h] [--batch_size BATCH_SIZE]
                             [--eval_batch_size EVAL_BATCH_SIZE] [--bptt BPTT]
                             [--ntokens NTOKENS] [--emsize EMSIZE]
                             [--d_hid D_HID] [--nlayers NLAYERS]
                             [--nhead NHEAD] [--dropout DROPOUT]
                             [--num_prune_cycles NUM_PRUNE_CYCLES]
                             [--num_epochs_prune NUM_EPOCHS_PRUNE]
                             [--prune_percent PRUNE_PERCENT]
                             [--print_freq_prune PRINT_FREQ_PRUNE]
                             [--test_freq_prune TEST_FREQ_PRUNE]
                             [--num_epochs_reint NUM_EPOCHS_REINT]
                             [--print_freq_reint PRINT_FREQ_REINT]
                             [--test_freq_reint TEST_FREQ_REINT] [-v]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\Luca\AppData\Roaming\jupyter\runtime\kernel-3d2e9205-ec99-4d27-a2b4

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [6]:

# Defining the Architecture
class TransformerModel(nn.Module):
    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type          = 'Transformer'
        self.pos_encoder         = PositionalEncoding(d_model, dropout)
        encoder_layers           = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder             = nn.Embedding(ntoken, d_model)
        self.d_model             = d_model
        self.decoder             = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]
        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src    = self.encoder(src) * math.sqrt(self.d_model)  # Wordembeddings
        src    = self.pos_encoder(src)  # Positional Encoding
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return T.triu(T.ones(sz, sz) * float('-inf'), diagonal=1)


# Implementing Positional Encoding, i.e. where are the words in the Sentence
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout   = nn.Dropout(p=dropout)
        position       = T.arange(max_len).unsqueeze(1)
        div_term       = T.exp(T.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe             = T.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = T.sin(position * div_term)
        pe[:, 0, 1::2] = T.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


# Building the Model to be trained
model = TransformerModel(33280, 200, 2, 200, 2, 0.2).to(device)
# Finished defining the Model

# Specify the objective Function
criterion = nn.CrossEntropyLoss()
lr        = 5.0  # learning rate
# Stated in Successfully applying ... (Aachen) Adafactor gives better results than Adam, they include warmup after reset
optimizer = T.optim.SGD(model.parameters(), lr=lr)

In [11]:
def warmup(num_warmup: int = 5) -> None:
    # Progressbar
    bar = tqdm(range(num_warmup))
    for epoch in bar:
        total_loss   = 0.
        comp_loss    = 0.  # Used for comparison down below
        log_interval = 200
        start_time   = time.time()
        src_mask     = generate_square_subsequent_mask(35).to(device)
        model.train()  # turn on train mode
        for batch_num, batch in enumerate(train_iter):
            optimizer.zero_grad()
            data_pts, targets = batch.text, batch.target
            batch_size_local = data_pts.size(0)
            if batch_size_local != 35:  # only on last batch
                src_mask = src_mask[:batch_size_local, :batch_size_local]
            output = model(data_pts, src_mask)
            t_loss = criterion(output.view(-1, 33280), targets.view(output.view(-1, 33280).size()[0]))
            t_loss.backward()
            # Clipping Gradients
            T.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            total_loss += t_loss.item()
            if batch_num % log_interval == 0 and batch_num > 0:
                ms_per_batch = (time.time() - start_time) * 1000 / log_interval
                cur_loss = (total_loss - comp_loss) / log_interval
                # ppl          = math.exp(cur_loss)
                comp_loss = total_loss
                print(f'| epoch {epoch:3d} | {batch_num:5d}/{len(train_iter):5d} batches | '
                      f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                      f'loss {cur_loss:5.2f}')  # | ppl {ppl:8.2f}')
                start_time = time.time()
    # Copying and Saving State after Warm-Up
    #utils.checkdir(f"{os.getcwd()}/saves/model_state_dicts/")
    #T.save(model, f"{os.getcwd()}/saves/model_state_dicts/warmup_state_dict.pth.tar")
    pass

In [12]:
warmup()

  0%|          | 0/5 [00:00<?, ?it/s]

| epoch   0 |   200/ 2984 batches | lr 5.00 | ms/batch 882.09 | loss  8.27
| epoch   0 |   400/ 2984 batches | lr 5.00 | ms/batch 954.36 | loss  7.00
| epoch   0 |   600/ 2984 batches | lr 5.00 | ms/batch 915.79 | loss  6.57
| epoch   0 |   800/ 2984 batches | lr 5.00 | ms/batch 913.77 | loss  6.39
| epoch   0 |  1000/ 2984 batches | lr 5.00 | ms/batch 889.67 | loss  6.28
| epoch   0 |  1200/ 2984 batches | lr 5.00 | ms/batch 926.50 | loss  6.25
| epoch   0 |  1400/ 2984 batches | lr 5.00 | ms/batch 691.58 | loss  6.18
| epoch   0 |  1600/ 2984 batches | lr 5.00 | ms/batch 688.07 | loss  6.20
| epoch   0 |  1800/ 2984 batches | lr 5.00 | ms/batch 704.14 | loss  6.06
| epoch   0 |  2000/ 2984 batches | lr 5.00 | ms/batch 692.15 | loss  6.08
| epoch   0 |  2200/ 2984 batches | lr 5.00 | ms/batch 669.33 | loss  5.99
| epoch   0 |  2400/ 2984 batches | lr 5.00 | ms/batch 664.36 | loss  6.01
| epoch   0 |  2600/ 2984 batches | lr 5.00 | ms/batch 663.57 | loss  5.99
| epoch   0 |  2800/ 2984

 20%|██        | 1/5 [38:46<2:35:04, 2326.20s/it]

| epoch   1 |   200/ 2984 batches | lr 5.00 | ms/batch 715.18 | loss  5.92
| epoch   1 |   400/ 2984 batches | lr 5.00 | ms/batch 707.80 | loss  5.87
| epoch   1 |   600/ 2984 batches | lr 5.00 | ms/batch 703.15 | loss  5.72
| epoch   1 |   800/ 2984 batches | lr 5.00 | ms/batch 688.56 | loss  5.73
| epoch   1 |  1000/ 2984 batches | lr 5.00 | ms/batch 690.31 | loss  5.70
| epoch   1 |  1200/ 2984 batches | lr 5.00 | ms/batch 692.69 | loss  5.72
| epoch   1 |  1400/ 2984 batches | lr 5.00 | ms/batch 703.42 | loss  5.74
| epoch   1 |  1600/ 2984 batches | lr 5.00 | ms/batch 697.62 | loss  5.77
| epoch   1 |  1800/ 2984 batches | lr 5.00 | ms/batch 692.82 | loss  5.66
| epoch   1 |  2000/ 2984 batches | lr 5.00 | ms/batch 692.68 | loss  5.71
| epoch   1 |  2200/ 2984 batches | lr 5.00 | ms/batch 693.35 | loss  5.61
| epoch   1 |  2400/ 2984 batches | lr 5.00 | ms/batch 690.57 | loss  5.66
| epoch   1 |  2600/ 2984 batches | lr 5.00 | ms/batch 695.81 | loss  5.66
| epoch   1 |  2800/ 2984

 40%|████      | 2/5 [1:13:23<1:48:59, 2179.87s/it]

| epoch   2 |   200/ 2984 batches | lr 5.00 | ms/batch 693.06 | loss  5.64
| epoch   2 |   400/ 2984 batches | lr 5.00 | ms/batch 688.71 | loss  5.63
| epoch   2 |   600/ 2984 batches | lr 5.00 | ms/batch 689.21 | loss  5.47
| epoch   2 |   800/ 2984 batches | lr 5.00 | ms/batch 684.80 | loss  5.50
| epoch   2 |  1000/ 2984 batches | lr 5.00 | ms/batch 685.17 | loss  5.49
| epoch   2 |  1200/ 2984 batches | lr 5.00 | ms/batch 684.77 | loss  5.52
| epoch   2 |  1400/ 2984 batches | lr 5.00 | ms/batch 682.13 | loss  5.55
| epoch   2 |  1600/ 2984 batches | lr 5.00 | ms/batch 683.54 | loss  5.57
| epoch   2 |  1800/ 2984 batches | lr 5.00 | ms/batch 680.93 | loss  5.48
| epoch   2 |  2000/ 2984 batches | lr 5.00 | ms/batch 680.25 | loss  5.53
| epoch   2 |  2200/ 2984 batches | lr 5.00 | ms/batch 677.81 | loss  5.44
| epoch   2 |  2400/ 2984 batches | lr 5.00 | ms/batch 684.22 | loss  5.47
| epoch   2 |  2600/ 2984 batches | lr 5.00 | ms/batch 675.18 | loss  5.49
| epoch   2 |  2800/ 2984

 60%|██████    | 3/5 [1:47:20<1:10:28, 2114.42s/it]

| epoch   3 |   200/ 2984 batches | lr 5.00 | ms/batch 678.29 | loss  5.48
| epoch   3 |   400/ 2984 batches | lr 5.00 | ms/batch 676.16 | loss  5.50
| epoch   3 |   600/ 2984 batches | lr 5.00 | ms/batch 675.22 | loss  5.33
| epoch   3 |   800/ 2984 batches | lr 5.00 | ms/batch 675.80 | loss  5.36
| epoch   3 |  1000/ 2984 batches | lr 5.00 | ms/batch 672.54 | loss  5.35
| epoch   3 |  1200/ 2984 batches | lr 5.00 | ms/batch 674.46 | loss  5.36
| epoch   3 |  1400/ 2984 batches | lr 5.00 | ms/batch 674.71 | loss  5.41
| epoch   3 |  1600/ 2984 batches | lr 5.00 | ms/batch 673.34 | loss  5.45
| epoch   3 |  1800/ 2984 batches | lr 5.00 | ms/batch 668.65 | loss  5.35
| epoch   3 |  2000/ 2984 batches | lr 5.00 | ms/batch 696.57 | loss  5.41
| epoch   3 |  2200/ 2984 batches | lr 5.00 | ms/batch 699.95 | loss  5.30
| epoch   3 |  2400/ 2984 batches | lr 5.00 | ms/batch 708.91 | loss  5.35
| epoch   3 |  2600/ 2984 batches | lr 5.00 | ms/batch 699.69 | loss  5.38
| epoch   3 |  2800/ 2984

 80%|████████  | 4/5 [2:21:18<34:44, 2084.50s/it]  

| epoch   4 |   200/ 2984 batches | lr 5.00 | ms/batch 679.14 | loss  5.39
| epoch   4 |   400/ 2984 batches | lr 5.00 | ms/batch 668.24 | loss  5.38
| epoch   4 |   600/ 2984 batches | lr 5.00 | ms/batch 669.59 | loss  5.21
| epoch   4 |   800/ 2984 batches | lr 5.00 | ms/batch 669.26 | loss  5.26
| epoch   4 |  1000/ 2984 batches | lr 5.00 | ms/batch 665.86 | loss  5.25
| epoch   4 |  1200/ 2984 batches | lr 5.00 | ms/batch 666.06 | loss  5.29
| epoch   4 |  1400/ 2984 batches | lr 5.00 | ms/batch 664.84 | loss  5.32
| epoch   4 |  1600/ 2984 batches | lr 5.00 | ms/batch 666.14 | loss  5.35
| epoch   4 |  1800/ 2984 batches | lr 5.00 | ms/batch 663.28 | loss  5.31
| epoch   4 |  2000/ 2984 batches | lr 5.00 | ms/batch 671.55 | loss  5.32
| epoch   4 |  2200/ 2984 batches | lr 5.00 | ms/batch 677.42 | loss  5.22
| epoch   4 |  2400/ 2984 batches | lr 5.00 | ms/batch 668.50 | loss  5.27
| epoch   4 |  2600/ 2984 batches | lr 5.00 | ms/batch 666.26 | loss  5.27
| epoch   4 |  2800/ 2984

100%|██████████| 5/5 [2:54:34<00:00, 2094.85s/it]
