In [1]:
import torch
from torch import nn
from torch import Tensor
from torch.nn import Transformer
import math
from typing import Iterable, List
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to token embeddings.

    A table of shape ``(max_len, 1, emb_size)`` is computed once in the
    constructor and registered as the buffer ``pos_embedding``. Even feature
    columns hold sines and odd feature columns hold cosines.

    Args:
        emb_size: Size of the last (feature) dimension. Odd sizes are
            supported; the last sine column then has no cosine partner.
        dropout: Dropout probability applied after the table is added.
        max_len: Number of positions to precompute.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 max_len: int = 2000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, max_len).reshape(max_len, 1)
        angles = pos * den  # (max_len, ceil(emb_size / 2))

        pos_embedding = torch.zeros((max_len, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # There are only emb_size // 2 odd columns, so drop the surplus angle
        # column when emb_size is odd (a no-op slice for even emb_size).
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over dimension 1 of
        # the input in forward().
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.register_buffer('pos_embedding', pos_embedding)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + table[:token_embedding.size(0)])``.

        The first dimension of ``token_embedding`` is used as the position
        axis, so the input is expected to be laid out position-first.
        """
        x = self.pos_embedding[:token_embedding.size(0), :]
        x = token_embedding + x

        x = self.dropout(x)

        return x

In [3]:
class TokenEmbedding(nn.Module):
    """Embedding lookup whose result is multiplied by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Look up ``tokens`` (cast to int64) and scale the vectors."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale


class Seq2Seq(nn.Module):
    """Encoder-decoder translation model wrapped around ``torch.nn.Transformer``.

    Source and target token ids are embedded with separate ``TokenEmbedding``
    tables, passed through one shared ``PositionalEncoding``, run through the
    transformer, and projected to ``tgt_vocab_size`` logits by ``generator``.

    Args:
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        emb_size: Embedding width, used as the transformer's ``d_model``.
        nhead: Number of attention heads.
        src_vocab_size: Size of the source embedding table.
        tgt_vocab_size: Size of the target embedding table and output layer.
        dim_feedforward: Hidden width of the feed-forward sublayers.
        dropout: Dropout probability for the transformer and the positional
            encoding.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()

        # Submodules are created in this fixed order so parameter ordering
        # (and therefore seeded initialisation) stays stable.
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask:Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        No memory (cross-attention) mask is applied; the padding masks are
        forwarded to the transformer as key-padding masks.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        trg_emb = self.positional_encoding(self.tgt_tok_emb(trg))

        decoded = self.transformer(
            src_emb,
            trg_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(decoded)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and return the encoder output (the decoder's memory)."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and return decoder states attending over ``memory``.

        The result is not passed through ``generator``; callers do that.
        """
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

In [4]:
def generate_square_subsequent_mask(sz):
    """Build an ``sz x sz`` float causal mask on ``DEVICE``.

    Entries on and below the main diagonal are ``0.0``; entries strictly above
    it are ``-inf``, so row ``i`` can only see columns ``0..i``.
    """
    visible = torch.ones((sz, sz), device=DEVICE).tril().bool()
    causal = torch.zeros((sz, sz), dtype=torch.float, device=DEVICE)
    return causal.masked_fill(~visible, float('-inf'))

In [5]:
from torchtext.vocab import vocab
from collections import OrderedDict

In [6]:
import json
# Read the vocabulary file produced by preprocessing; the handle is opened
# read-only and closed as soon as the JSON has been parsed.
with open("../data/processed/vocab.json", "r") as vocab_file:
    tokens = json.load(vocab_file)

In [7]:
# Keep only the token strings. list(tokens) yields the keys when `tokens` is
# the mapping loaded from vocab.json, and is a plain copy when this cell is
# re-run on the resulting list, so the cell no longer fails on a second run.
tokens = list(tokens)

In [8]:
# Special tokens are placed first, so their indices follow this list's order.
special_tokens = ["<unk>", "<bos>", "<eos>", "<pad>", "<eng>", "<swa>", "<lug>", "<kik>"]

# Every token gets a count of 1; only the insertion order matters here.
token_counts = OrderedDict((token, 1) for token in tokens)

v2 = vocab(token_counts, specials=special_tokens, special_first=True)
# Index 0 is "<unk>", so out-of-vocabulary lookups fall back to it.
v2.set_default_index(0)


In [9]:
# Indices of the first four special tokens, in the order they are listed when
# the vocabulary is built: "<unk>", "<bos>", "<eos>", "<pad>".
UNK_IDX = 0
BOS_IDX = 1
EOS_IDX = 2
PAD_IDX = 3

In [10]:
def create_mask(src, tgt):
    """Build the attention and padding masks for one source/target batch.

    Both inputs are indexed with the sequence axis first (``shape[0]``).

    Returns:
        A 4-tuple ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
        an all-False boolean square mask for the source, the causal mask from
        ``generate_square_subsequent_mask`` for the target, and boolean
        padding masks (True where the token equals ``PAD_IDX``) with the first
        two axes swapped relative to the inputs.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # Source positions may attend to each other freely.
    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_is_pad = src == PAD_IDX
    tgt_is_pad = tgt == PAD_IDX
    src_padding_mask = src_is_pad.transpose(0, 1)
    tgt_padding_mask = tgt_is_pad.transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [11]:
class TrainDataset(Dataset):
    """Parallel-sentence dataset backed by a CSV file and a JSON vocabulary.

    The CSV must provide ``source`` and ``target`` columns. Each item is a
    pair of 1-D int64 tensors of token ids wrapped in ``BOS_IDX`` / ``EOS_IDX``.

    Args:
        data_path: CSV file with the sentence pairs.
        vocab_path: JSON file whose entries are the vocabulary tokens.

    Attributes:
        filename: The loaded DataFrame (missing cells replaced by ``""``).
        tokens: The object parsed from ``vocab_path``.
        v2: torchtext vocabulary; unknown tokens map to index 0 (``<unk>``).
    """

    def __init__(self,
                 data_path: str = "../data/processed/processed_data.csv",
                 vocab_path: str = "../data/processed/vocab.json"):
        super(TrainDataset, self).__init__()
        frame = pd.read_csv(data_path)
        # Missing cells become empty strings so tokenize_text can call .split().
        self.filename = frame.fillna("")

        with open(vocab_path, "r") as vocab_file:
            self.tokens = json.load(vocab_file)

        # Build the vocabulary from the tokens loaded just above rather than
        # from a notebook-level `tokens` variable, so the class works on a
        # fresh kernel regardless of which cells were run before it.
        self.v2 = vocab(OrderedDict([(token, 1) for token in self.tokens]),
                        specials=["<unk>", "<bos>", "<eos>",
                                  "<pad>", "<eng>", "<swa>",
                                  "<lug>", "<kik>"],
                        special_first=True)

        self.v2.set_default_index(0)

    def tokenize_text(self, text):
        """Split ``text`` on whitespace and map each piece to its vocab index."""
        return [self.v2[token] for token in text.split()]

    def tensor_transform(self, token_ids: List[int]):
        """Wrap ``token_ids`` in BOS/EOS and return a 1-D int64 tensor.

        The dtype is forced to int64 because ``torch.tensor([])`` would
        otherwise be a float tensor, which happens for empty sentences.
        """
        return torch.cat((torch.tensor([BOS_IDX]),
                          torch.tensor(token_ids, dtype=torch.long),
                          torch.tensor([EOS_IDX])))

    def __len__(self):
        """Number of sentence pairs (rows in the CSV)."""
        return len(self.filename)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` id tensors for row label ``idx``."""
        source = self.filename.loc[idx, "source"]
        target = self.filename.loc[idx, "target"]

        source = self.tensor_transform(self.tokenize_text(source))
        target = self.tensor_transform(self.tokenize_text(target))

        return source, target

    def collate_fn(self, batch):
        """Pad a list of ``(source, target)`` pairs into two batch tensors.

        Shorter sequences are right-padded with ``PAD_IDX``; ``pad_sequence``
        is called with its default layout (sequence axis first).
        """
        src_batch, tgt_batch = [], []
        for src_tensor, tgt_tensor in batch:
            src_batch.append(src_tensor)
            tgt_batch.append(tgt_tensor)

        src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
        tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
        return src_batch, tgt_batch


In [12]:
# Instantiate the dataset and wrap it in a shuffling loader; batches are
# padded by the dataset's own collate function.
train_data = TrainDataset()
train_loader = DataLoader(
    train_data,
    batch_size=8,
    shuffle=True,
    collate_fn=train_data.collate_fn,
)

In [13]:
# Sanity check: print the first (source, target) batch and stop.
for s, t in train_loader:
    print(s)
    print(t)
    break

tensor([[    1,     1,     1,     1,     1,     1,     1,     1],
        [    6,     4,     6,     6,     5,     6,     6,     6],
        [   72,  5277,     0,     0,     0,    72,     0, 12062],
        [    0,  5690,     0,    37,     0,     0,  5393,     0],
        [    0,  5605,     0,    72,     0,     0,     0,  1151],
        [    0,  7005,     0,     0,     0,     0,     0,    72],
        [    0,     2,     0,     0,     0,  1151, 12062,     0],
        [   73,     3,     0,     0,     0,    72,   130,     0],
        [ 2238,     3,  1179,    72,     2,     0,     0,    31],
        [ 1379,     3,     0,     0,     3,    37,     0,    72],
        [    2,     3,     0,  9747,     3,    75,     2,     0],
        [    3,     3,     2,     2,     3,     0,     3,     0],
        [    3,     3,     3,     3,     3,     2,     3,     0],
        [    3,     3,     3,     3,     3,     3,     3,     2]])
tensor([[    1,     1,     1,     1,     1,     1,     1,     1],
        [

In [14]:
# t[:-1,:]

In [15]:
# t[1:]

In [16]:
def train_epoch(model, optimizer):
    """Train ``model`` for one pass over the module-level ``train_loader``.

    Uses the module-level ``loss_fn``, ``DEVICE`` and ``create_mask``.

    Args:
        model: Model called as ``model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    losses = 0

    for src, tgt in train_loader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # Teacher forcing: the decoder is fed every target token but the last
        # and is scored against every target token but the first.
        tgt_input = tgt[:-1, :]
        # .long() keeps the tensor on its current device, so this also works
        # when DEVICE is the CPU.
        tgt_out = tgt[1:, :].long()

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # The source padding mask doubles as the memory key-padding mask.
        logits = model(src, tgt_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    # len() of the loader is the batch count; no need to iterate it again.
    return losses / len(train_loader)


In [17]:
# train_epoch(transformer, optimizer)

In [46]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily generate target token ids for a single source sequence.

    Starting from ``start_symbol``, the highest-scoring next token is appended
    step by step until ``EOS_IDX`` is produced or the output holds ``max_len``
    tokens. ``.item()` is called on the prediction, so exactly one sequence
    (batch size 1) is supported.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids; moved to ``DEVICE``.
        src_mask: Source attention mask; moved to ``DEVICE``.
        max_len: Maximum number of output tokens, including the start symbol.
        start_symbol: Id of the first output token.

    Returns:
        Integer tensor of shape ``(generated_len, 1)`` on ``DEVICE``.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Decoding never back-propagates, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        # The encoder output is the same for every step; compute and place it once.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)

        for _ in range(max_len - 1):
            # Boolean causal mask: True marks positions that must not be attended.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the most recent position.
            prob = model.generator(out[:, -1])

            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

In [69]:
def translate(model: torch.nn.Module, src_sentence: str):
    """Translate one whitespace-tokenised sentence with greedy decoding.

    The sentence is encoded with the module-level ``train_data`` helpers,
    decoded for at most ``num_tokens + 5`` tokens, and mapped back to text
    through the module-level vocabulary ``v2``. The model is switched to eval
    mode and left there.

    Args:
        model: Trained model accepted by ``greedy_decode``.
        src_sentence: Source text.

    Returns:
        The generated tokens joined by spaces, with the literal ``<bos>`` and
        ``<eos>`` markers removed.
    """
    model.eval()

    src = train_data.tokenize_text(src_sentence)
    src = train_data.tensor_transform(src)

    # Add a batch axis of size one: (num_tokens,) -> (num_tokens, 1).
    src = src.reshape(-1, 1)

    num_tokens = src.shape[0]

    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()

    # Use the decoded ids themselves (previously an undefined name `f` was
    # referenced here) and hand the vocabulary a plain list of ints.
    return " ".join(v2.lookup_tokens(tgt_tokens.cpu().tolist())).replace("<eos>", "").replace("<bos>", "")

In [20]:
# Seed before the model is built so weight initialisation is repeatable.
torch.manual_seed(0)

# Hyper-parameters. Source and target share the dataset's single vocabulary.
SRC_VOCAB_SIZE = len(train_data.v2)
TGT_VOCAB_SIZE = len(train_data.v2)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
# NOTE(review): BATCH_SIZE is not referenced by the DataLoader created
# earlier, which hard-codes batch_size=8.
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2Seq(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
)

# Xavier-initialise every parameter with more than one dimension; parameters
# with a single dimension keep their default initialisation.
for param in transformer.parameters():
    if param.dim() <= 1:
        continue
    nn.init.xavier_uniform_(param)

transformer = transformer.to(DEVICE)

# Padding positions do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

  from .autonotebook import tqdm as notebook_tqdm


In [73]:
NUM_EPOCHS = 18

# After each epoch, report the mean training loss and translate one fixed
# probe sentence as a qualitative progress check.
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(transformer, optimizer)
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}")

    translated = translate(transformer, "<eng> wano tuli wana lkjnasf klnsaf")
    print(translated)
    


Epoch: 1, Train loss: 3.411
 <unk> <unk> <unk> <unk> <unk> <unk> 


KeyboardInterrupt: 