In [4]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP




In [None]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy so that every replica owns its own parameters
        replicas.append(copy.deepcopy(module))
    return replicas

In [None]:
class Generator(nn.Module):
    """Linear projection to vocabulary size followed by a log-softmax.

    Args:
        d_model: size of the last dimension of the input.
        vocab: number of output classes (size of the projection).
    """

    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension of ``proj(x)``."""
        # `log_softmax` is imported at the top of the notebook; the alias `F`
        # was never imported, so `F.log_softmax` raised NameError.
        return log_softmax(self.proj(x), dim=-1)

In [None]:
class Layer_Norm(nn.Module):
    """Normalize the last dimension, then apply a learned scale and shift.

    Args:
        features: length of the learned scale (``a_2``) and shift (``b_2``) vectors.
        eps: constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last dim."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [None]:
class Sublayer_Connection(nn.Module):
    """Residual connection around a sublayer: ``x + dropout(sublayer(norm(x)))``.

    Args:
        size: feature size handed to ``Layer_Norm``.
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout):
        super(Sublayer_Connection, self).__init__()
        # Honour the caller's rate; it used to be hard-coded to 0.1, which
        # silently ignored the `dropout` argument.
        self.dropout = nn.Dropout(p=dropout)
        self.norm = Layer_Norm(size)

    def forward(self, x, sublayer):
        """Normalize ``x``, run ``sublayer`` on it, apply dropout and add ``x`` back."""
        return x + self.dropout(sublayer(self.norm(x)))

In [None]:
class Encoding(nn.Module):
    """Stack of N copies of ``layer`` followed by a final ``Layer_Norm``.

    Args:
        layer: module to replicate; must expose a ``size`` attribute.
        N: number of copies to stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = Layer_Norm(layer.size)

    def forward(self, x):
        """Feed ``x`` through every layer in order, then normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)

In [None]:
class Encoding_Layer(nn.Module):
    """One encoder layer: a self-attention sublayer followed by a feed-forward sublayer.

    Args:
        size: feature size, stored as ``self.size`` and passed to each
            ``Sublayer_Connection``.
        self_attention: callable invoked as ``self_attention(x, x, x)``.
        feed_forward: callable invoked as ``feed_forward(x)``.
        dropout: dropout probability passed to the sublayer connections.
    """

    def __init__(self, size, self_attention, feed_forward, dropout):
        super(Encoding_Layer, self).__init__()
        # Use the caller's rate instead of a hard-coded 0.1.
        self.dropout = nn.Dropout(p=dropout)
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        self.sublayers = clones(Sublayer_Connection(size, dropout), 2)
        self.size = size

    def forward(self, x):
        """Apply self-attention, then feed-forward, each wrapped in its sublayer connection."""
        # The attribute is `sublayers`; `self.sublayer` raised AttributeError.
        x = self.sublayers[0](x, lambda x: self.self_attention(x, x, x))
        return self.sublayers[1](x, self.feed_forward)

In [None]:
class Decoding(nn.Module):
    """Stack of N copies of ``layer`` followed by a final ``Layer_Norm``.

    Args:
        layer: module to replicate; must expose a ``size`` attribute.
        N: number of copies to stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = Layer_Norm(layer.size)

    def forward(self, x):
        """Feed ``x`` through every layer in order, then normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)

In [None]:
class Decoding_Layer(nn.Module):
    """One decoder layer: two attention sublayers followed by a feed-forward sublayer.

    Args:
        size: feature size, stored as ``self.size`` and passed to each
            ``Sublayer_Connection``.
        self_attention: callable invoked as ``self_attention(x, x, x)``.
        feed_forward: callable invoked as ``feed_forward(x)``.
        dropout: dropout probability passed to the sublayer connections.
    """

    def __init__(self, size, self_attention, feed_forward, dropout):
        super(Decoding_Layer, self).__init__()
        # Use the caller's rate instead of a hard-coded 0.1.
        self.dropout = nn.Dropout(p=dropout)
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        self.sublayers = clones(Sublayer_Connection(size, dropout), 3)
        self.size = size

    def forward(self, x):
        """Apply the three sublayers in order, each wrapped in its sublayer connection."""
        # The attribute is `sublayers`; `self.sublayer` raised AttributeError.
        x = self.sublayers[0](x, lambda x: self.self_attention(x, x, x))
        # NOTE(review): this second sublayer reuses the same self-attention on x;
        # no encoder output is passed in, so it is not attention over a source
        # sequence. Confirm whether an encoder-memory input is intended here.
        x = self.sublayers[1](x, lambda x: self.self_attention(x, x, x))
        return self.sublayers[2](x, self.feed_forward)

In [None]:
def Attention(q, k, v):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    The last two dimensions of ``k`` are swapped, so any number of leading
    batch dimensions is supported (e.g. an extra per-head dimension), not
    just 3-D input.

    Args:
        q: query tensor; its last dimension is ``d_k``.
        k: key tensor whose last dimension matches ``q``.
        v: value tensor whose second-to-last dimension matches ``k``.

    Returns:
        Tensor with the leading/sequence dimensions of ``q`` and the last
        dimension of ``v``.
    """
    d_k = q.size(-1)
    # transpose(-2, -1) equals transpose(1, 2) for 3-D input but also works
    # for inputs with more (or fewer) batch dimensions.
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    p_attention = scores.softmax(dim=-1)
    return torch.matmul(p_attention, v)

In [None]:
# Smoke test: run Attention on three random (32, 72, 512) tensors and print
# the size of the result.
a, b, c = (torch.randn(32, 72, 512) for _ in range(3))
d = Attention(a, b, c)
print(d.size())

torch.Size([32, 72, 512])


In [None]:
class multihead(nn.Module):
    """Multi-head attention built on the module-level ``Attention`` function.

    Args:
        h: number of heads; must divide ``d_model``.
        d_model: feature size of the inputs and of the output.
        dropout: probability for ``self.dropout`` (the module is created here
            but not applied in ``forward``).
    """

    def __init__(self, h, d_model, dropout=0.1):
        super(multihead, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        # Three input projections (q, k, v) plus one output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, x, nbatches):
        """Reshape (nbatches, seq, h * d_k) into (nbatches * h, seq, d_k)."""
        per_head = x.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
        # Heads are folded into the batch axis so a 3-D attention kernel can be used.
        return per_head.reshape(nbatches * self.h, -1, self.d_k)

    def forward(self, q, k, v):
        """Project q/k/v, attend independently per head, merge heads and project.

        Inputs are expected as (nbatches, seq, d_model); the result is
        (nbatches, seq_q, d_model).
        """
        nbatches = q.size(0)

        # Each of q, k, v goes through its own linear layer. The previous loop
        # zipped the linears with the batch rows of a single tensor, so the
        # projections were applied to the wrong data.
        q, k, v = [
            self._split_heads(lin(x), nbatches)
            for lin, x in zip(self.linears, (q, k, v))
        ]

        x = Attention(q, k, v)

        # Undo the head folding: (nbatches * h, seq_q, d_k) -> (nbatches, seq_q, h * d_k).
        x = (
            x.reshape(nbatches, self.h, -1, self.d_k)
            .transpose(1, 2)
            .reshape(nbatches, -1, self.h * self.d_k)
        )
        return self.linears[-1](x)

In [None]:
class FFN(nn.Module):
    """Two-layer feed-forward network: ``fc2(relu(fc1(x)))``.

    Args:
        d_model: input and output feature size.
        d_ff: hidden feature size.
    """

    def __init__(self, d_model, d_ff):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the first linear layer, ReLU, then the second linear layer."""
        # `F` was never imported in this notebook, so `F.relu` raised
        # NameError; `torch.relu` is the same function and is in scope.
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
class embeddings(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(d_model)``.

    Args:
        d_model: embedding dimension.
        vocab: number of rows in the embedding table.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.layer = nn.Embedding(vocab, d_model)

    def forward(self, x):
        """Look up the indices in ``x`` and scale the vectors by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return self.layer(x) * scale

In [None]:
class positional_encoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: feature size of the input (last dimension).
        dropout: dropout probability applied after the addition.
        max_len: number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(positional_encoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model

        pe = torch.zeros(max_len, d_model)

        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim div_term; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        # Buffer: saved and moved with the module, but not a trainable parameter.
        self.register_buffer("pe", pe)

    # `forward` must be a method of the class. It used to be indented inside
    # `__init__`, which left the module without a forward implementation.
    def forward(self, x):
        """Add the first ``x.size(1)`` position encodings to ``x`` and apply dropout."""
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [None]:
"""
class Label_Smoothing(nn.Module):
  # Label smoothing is a regularization technique that adds noise to the labels.
  # It accounts for the fact that datasets may contain labeling mistakes, so directly maximizing the likelihood log p(y | x) can be harmful.
  # Each label y is assumed to be correct with probability (1 - epsilon).
  # For a softmax with k output values, label smoothing replaces the hard 0 and 1
  # classification targets with epsilon / (k - 1) and (1 - epsilon), respectively.
 
  def __init__(self, k, padding_index, epsilon):
    super(Label_Smoothing, self).__init__()
    self.epsilon = epsilon
    self.other = 1 - self.epsilon
    self.k = k
    self.criterion = nn.KLDivLoss(reduction = "sum")
    self.padding_index = padding_index

  def forward(self, x, target):
    return self.criterion(x)
"""

In [None]:
def rate(step, model_size, factor, warmup):
    """Learning-rate multiplier with linear warmup and inverse-square-root decay.

    Returns ``factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)``.
    A ``step`` of 0 is treated as 1 so that 0 is never raised to a negative power.
    """
    effective_step = 1 if step == 0 else step
    decay_term = effective_step ** (-0.5)
    warmup_term = effective_step * (warmup ** -1.5)
    return factor * ((model_size ** (-0.5)) * min(decay_term, warmup_term))

In [None]:
# We used the Adam optimizer with β1 = 0.9, β2 = 0.98 and epsilon = (10 ** (-9))

In [7]:
# Load spacy tokenizer models, download them if they haven't been
# downloaded already


def load_tokenizers():
    """Load the German and English spaCy pipelines, downloading any that are missing.

    Returns:
        Tuple ``(spacy_de, spacy_en)`` of the loaded pipelines.
    """
    pipelines = []
    for model_name in ("de_core_news_sm", "en_core_web_sm"):
        try:
            pipeline = spacy.load(model_name)
        except IOError:
            # Model not available yet: fetch it through the spaCy CLI, then retry once.
            os.system("python -m spacy download " + model_name)
            pipeline = spacy.load(model_name)
        pipelines.append(pipeline)

    spacy_de, spacy_en = pipelines
    return spacy_de, spacy_en

In [8]:
def tokenize(text, tokenizer):
    """Split ``text`` with ``tokenizer.tokenizer`` and return the token strings as a list."""
    pieces = []
    for token in tokenizer.tokenizer(text):
        pieces.append(token.text)
    return pieces


def yield_tokens(data_iter, tokenizer, index):
    """Lazily yield ``tokenizer(item[index])`` for every item of ``data_iter``."""
    yield from (tokenizer(record[index]) for record in data_iter)

In [9]:
# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")
# Switch read by show_example / execute_example: when False they do not call
# the function they are given.
RUN_EXAMPLES = True

In [5]:
# Some convenience helper functions used throughout the notebook


def is_interactive_notebook():
    """Return True when this module's ``__name__`` is ``"__main__"``."""
    running_as_main = __name__ == "__main__"
    return running_as_main


def show_example(fn, args=[]):
    """Call ``fn(*args)`` and return its result, or return None when examples are off.

    The call only happens when the module runs as ``__main__`` and
    ``RUN_EXAMPLES`` is truthy.
    """
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return None
    return fn(*args)


def execute_example(fn, args=[]):
    """Call ``fn(*args)`` for its side effects and discard the result.

    The call only happens when the module runs as ``__main__`` and
    ``RUN_EXAMPLES`` is truthy.
    """
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return
    fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing."""

    def __init__(self):
        # Optimizer.__init__ is not called; only `param_groups` is populated,
        # with a single group whose learning rate is 0.
        self.param_groups = [{"lr": 0}]

    def step(self):
        """No-op."""
        return None

    def zero_grad(self, set_to_none=False):
        """No-op; ``set_to_none`` is accepted and ignored."""
        return None


class DummyScheduler:
    """Scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """No-op."""
        return None

In [10]:
def build_vocabulary(spacy_de, spacy_en):
    """Build source (German) and target (English) vocabularies from Multi30k.

    Tokens are collected from the train, validation and test splits together;
    tokens seen fewer than two times are dropped, and unknown tokens map to
    the index of ``"<unk>"``.

    Args:
        spacy_de: German pipeline, passed to ``tokenize``.
        spacy_en: English pipeline, passed to ``tokenize``.

    Returns:
        Tuple ``(vocab_src, vocab_tgt)``.
    """
    special_tokens = ("<s>", "</s>", "<blank>", "<unk>")

    def vocab_for(pipeline, column):
        # The dataset is loaded afresh for each language, as iterating the
        # splits consumes them.
        train, val, test = datasets.Multi30k(language_pair=("de", "en"))
        return build_vocab_from_iterator(
            yield_tokens(
                train + val + test,
                lambda text: tokenize(text, pipeline),
                index=column,
            ),
            min_freq=2,
            specials=list(special_tokens),
        )

    print("Building German Vocabulary ...")
    vocab_src = vocab_for(spacy_de, 0)

    print("Building English Vocabulary ...")
    vocab_tgt = vocab_for(spacy_en, 1)

    for vocab in (vocab_src, vocab_tgt):
        vocab.set_default_index(vocab["<unk>"])

    return vocab_src, vocab_tgt


def load_vocab(spacy_de, spacy_en):
    """Return ``(vocab_src, vocab_tgt)``, using ``vocab.pt`` as an on-disk cache.

    If ``vocab.pt`` exists in the working directory it is loaded; otherwise the
    vocabularies are built with ``build_vocabulary`` and saved to that file.
    Both vocabulary sizes are printed before returning.
    """
    cache_path = "vocab.pt"
    if exists(cache_path):
        vocab_src, vocab_tgt = torch.load(cache_path)
    else:
        vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
        torch.save((vocab_src, vocab_tgt), cache_path)

    print("Finished.\nVocabulary sizes:")
    for vocab in (vocab_src, vocab_tgt):
        print(len(vocab))
    return vocab_src, vocab_tgt


# Load tokenizers and vocabularies only when running interactively with
# examples enabled. The function must be *called*: the bare name
# `is_interactive_notebook` is a function object and therefore always truthy.
# RUN_EXAMPLES is checked here as well because show_example returns None when
# it is False, and None cannot be unpacked into two names.
if is_interactive_notebook() and RUN_EXAMPLES:
    # global variables used later in the script
    spacy_de, spacy_en = show_example(load_tokenizers)
    vocab_src, vocab_tgt = show_example(load_vocab, args=[spacy_de, spacy_en])

Building German Vocabulary ...


RuntimeError: ignored

In [21]:
# %pip installs into the running kernel's environment (unlike `!pip`).
# The version is pinned to the one resolved in the recorded output below so
# that re-runs are reproducible.
%pip install -q torchdata==0.5.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 5.0 MB/s 
[?25hCollecting portalocker>=2.0.0
  Downloading portalocker-2.6.0-py2.py3-none-any.whl (15 kB)
Collecting urllib3>=1.25
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 65.4 MB/s 
Collecting torch==1.13.1
  Downloading torch-1.13.1-cp38-cp38-manylinux1_x86_64.whl (887.4 MB)
[K     |██████████████████████████████  | 834.1 MB 1.2 MB/s eta 0:00:47tcmalloc: large alloc 1147494400 bytes == 0x6648e000 @  0x7fcdc6a8c615 0x5d6f4c 0x51edd1 0x51ef5b 0x4f750a 0x4997a2 0x4fd8b5 0x4997c7 0x4fd8b5 0x49abe4 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x5d8868 0x5da092 0x587116 0x5d8d8c 0x55dc1e 0x55cd91 0x5d8941 0x49abe4 0x55cd91 0x5d8941 0x4990ca 0x5d8868 