# Implementation of the paper Data2Vis ([link to paper](https://arxiv.org/abs/1804.03126))

In [None]:
import tqdm
import time
import os
import random

import numpy as np 
import pandas as pd 

import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import TensorDataset, DataLoader


import matplotlib.pyplot as plt
# Use matplotlib's built-in 'default' style for any plots drawn later.
plt.style.use('default')
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Load every file in the dataset directory into `data`, keyed by file name.
# Each value is the list of that file's lines with newline characters and all
# square brackets removed.
path = "/kaggle/input/dltpa3-3"
data = dict()
for filename in os.listdir(path):
    # os.path.join replaces manual '/' concatenation; the explicit encoding
    # keeps decoding independent of the platform's locale default.
    with open(os.path.join(path, filename), encoding='utf-8') as file:
        # Iterate the file object directly instead of materialising
        # readlines() first.
        data[filename] = [
            line.replace('\n', '').replace('[', '').replace(']', '')
            for line in file
        ]

# Report how many lines were read from each file.
for key, lines in data.items():
    print(key, ':', len(lines))

In [None]:
# Deduplicate the training pairs by source line: keep the first occurrence of
# each distinct source and the target stored at the same position.
first_seen = {}
for position, source_line in enumerate(data['train.sources']):
    # setdefault records the index only the first time a line appears.
    first_seen.setdefault(source_line, position)
idx = list(first_seen.values())
print(len(first_seen), len(idx))
del first_seen

x_train = [data['train.sources'][i] for i in idx]
y_train = [data['train.targets'][i] for i in idx]
print(len(x_train), len(y_train))

def yield_tokens(data_iter, s):
    """Yield each text in `data_iter` as a list of its characters.

    `s` is accepted so existing call sites keep working, but it is not used.
    """
    yield from map(list, data_iter)

# Build character-level vocabularies for the source and target sides. Both are
# given the same three special tokens; `pad` relies on "<pad>" ending up at
# index 2 (specials are placed first by default).
specials = ["<sos>", "<eos>", "<pad>"]
s_iter, t_iter = iter(x_train), iter(y_train)
source_vocab = build_vocab_from_iterator(yield_tokens(s_iter, True), specials=specials)
target_vocab = build_vocab_from_iterator(yield_tokens(t_iter, False), specials=specials)
# Show each token-to-index mapping together with its size.
for _vocab in (source_vocab, target_vocab):
    _stoi = _vocab.get_stoi()
    print(_stoi, len(_stoi))

In [None]:
# Index-to-token list for the target vocabulary, used by vec2str to decode
# predicted index sequences back into text.
idx2chr = target_vocab.get_itos()
print(idx2chr)

In [None]:
def vec2str(vec, itos=idx2chr):
    """Decode a sequence of token indices into a string.

    Args:
        vec: iterable of indices into `itos`.
        itos: index-to-token lookup; defaults to the module-level `idx2chr`
            as it was bound when this function was defined.

    Returns:
        The tokens for `vec` concatenated in order ('' for an empty `vec`).
    """
    # str.join builds the result in one pass instead of repeated `+`.
    return ''.join(itos[i] for i in vec)

In [None]:
def pad(t, max_len, pad_idx=2):
    """Right-pad a list of token indices to `max_len` entries.

    Args:
        t: list of token indices.
        max_len: desired length of the returned list.
        pad_idx: index appended as padding. The default of 2 matches the
            position of "<pad>" in the specials used to build the vocabularies.

    Returns:
        A new list: `t` followed by enough `pad_idx` entries to reach
        `max_len`. A list already longer than `max_len` is returned as an
        unpadded, untruncated copy.
    """
    # A negative repeat count yields an empty list, so over-long inputs pass
    # through without padding.
    return t + [pad_idx] * (max_len - len(t))

def build_array(lines, vocab, max_len=500):
    """Encode text lines as fixed-length lists of token indices.

    Each line is split into characters, mapped through `vocab`, wrapped in
    "<sos>" / "<eos>" and right-padded with the vocabulary's "<pad>" index.

    Args:
        lines: iterable of strings.
        vocab: callable mapping a list of tokens to a list of indices; it must
            know the tokens "<sos>", "<eos>" and "<pad>".
        max_len: length every encoded sequence is padded to (default 500, the
            value previously hard-coded here).

    Returns:
        A list with one list of indices per line. Sequences that already
        exceed `max_len` once the two markers are added are left unpadded and
        untruncated.
    """
    # Look the special tokens up once instead of once per line.
    sos = vocab(['<sos>'])
    eos = vocab(['<eos>'])
    pad_idx = vocab(['<pad>'])[0]

    vecs = []
    for line in lines:
        vec = sos + vocab(list(line)) + eos
        # A negative repeat count yields no padding for over-long sequences.
        vecs.append(vec + [pad_idx] * (max_len - len(vec)))
    return vecs

In [None]:
# Encode both sides of the de-duplicated training set as index tensors.
train_inputs = torch.tensor(build_array(x_train, source_vocab))
train_targets = torch.tensor(build_array(y_train, target_vocab))

# Pair the tensors up and serve them in shuffled mini-batches of 8.
traindataset = TensorDataset(train_inputs, train_targets)
trainloader = DataLoader(traindataset, batch_size=8, shuffle=True)

# The dataset keeps its own references; drop the temporary names.
del train_inputs, train_targets

In [None]:
class encoder(nn.Module):
    """Bidirectional GRU encoder over embedded source tokens.

    Args:
        num_embeddings: size of the source vocabulary.
        embedding_dim: width of the token embeddings.
        hidden_size: hidden width of the GRU in each direction.
        num_layers: number of stacked GRU layers.
        dp: dropout probability applied to the embeddings.
    """

    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, dp):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.dropout = nn.Dropout(dp)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.rnn = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Projects the concatenated forward/backward final states back down to
        # a single hidden_size vector.
        self.fc = nn.Linear(2 * hidden_size, hidden_size)

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: integer tensor of token indices; the GRU is not batch-first, so
                this is expected as (seq_len, batch).

        Returns:
            A pair `(outputs, hidden)`: the GRU outputs for every position and
            a (batch, hidden_size) summary built from the final states.
        """
        embedded = self.dropout(self.embedding(x))
        outputs, final_states = self.rnn(embedded)
        # The last two entries of the final-state tensor are the two
        # directions of the top GRU layer.
        merged = torch.cat((final_states[-2, :, :], final_states[-1, :, :]), dim=1)
        return outputs, torch.tanh(self.fc(merged))
    
class attention(nn.Module):
    """Additive attention scoring encoder outputs against a decoder state.

    Args:
        hidden_size: decoder hidden width; encoder outputs are expected to be
            twice this wide.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Input is the decoder state (hidden_size) concatenated with one
        # encoder output (2 * hidden_size).
        self.a = nn.Linear(hidden_size * 3, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, d_hidden, e_outputs):
        """Return attention weights over the source positions.

        Args:
            d_hidden: decoder state, (batch, hidden_size).
            e_outputs: encoder outputs, (seq_len, batch, 2 * hidden_size).

        Returns:
            A (batch, seq_len) tensor whose rows are softmax-normalised.
        """
        seq_len = e_outputs.shape[0]
        # Copy the decoder state once per source position.
        query = d_hidden.unsqueeze(1).repeat(1, seq_len, 1)
        keys = e_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.a(torch.cat((query, keys), dim=2)))
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)
        
class decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        num_embeddings: size of the target vocabulary (also the output width).
        embedding_dim: width of the token embeddings.
        hidden_size: hidden width of the GRU.
        num_layers: accepted for symmetry with the encoder but not used; the
            GRU here is always single-layer.
        dp: dropout probability applied to the embeddings.
        attention: callable taking `(hidden, e_outputs)` and returning
            (batch, seq_len) attention weights.
    """

    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, dp, attention):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.attention = attention
        self.dropout = nn.Dropout(dp)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        # Encoder outputs are bidirectional, hence twice the hidden width.
        context_size = 2 * hidden_size
        self.rnn = nn.GRU(context_size + embedding_dim, hidden_size)
        self.fc = nn.Linear(context_size + hidden_size + embedding_dim, num_embeddings)

    def forward(self, inp, h, e_outputs):
        """Run one decoding step.

        Args:
            inp: (batch,) tensor of current input token indices.
            h: (batch, hidden_size) decoder state.
            e_outputs: (seq_len, batch, 2 * hidden_size) encoder outputs.

        Returns:
            A pair `(p, h)`: (batch, num_embeddings) unnormalised scores and
            the updated (batch, hidden_size) decoder state.
        """
        # Add a leading length-1 time axis for the GRU.
        emb = self.dropout(self.embedding(inp.unsqueeze(0)))
        weights = self.attention(h, e_outputs).unsqueeze(1)
        # Attention-weighted sum of the encoder outputs, moved back to
        # time-major layout.
        context = torch.bmm(weights, e_outputs.permute(1, 0, 2)).permute(1, 0, 2)
        rnn_out, new_h = self.rnn(torch.cat((emb, context), dim=2), h.unsqueeze(0))

        # Drop the time axis again and predict from the GRU output, the
        # context and the embedding together.
        features = torch.cat(
            (rnn_out.squeeze(0), context.squeeze(0), emb.squeeze(0)), dim=1
        )
        return self.fc(features), new_h.squeeze(0)
    
class seq2seq(nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time.

    Args:
        encoder: module returning `(e_outputs, hidden)` for a source batch.
        decoder: module with a `num_embeddings` attribute that maps
            `(inp, hidden, e_outputs)` to `(scores, hidden)`.
        device: device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, trg_len, t_f):
        """Decode `trg_len` steps for a batch.

        Args:
            src: source indices, (src_len, batch).
            trg: target indices, indexed as `trg[t]` per step.
            trg_len: number of time steps in the returned tensor.
            t_f: probability, per step, of feeding the ground-truth token
                rather than the model's own prediction as the next input.

        Returns:
            A (trg_len, batch, num_embeddings) tensor of scores; row 0 is left
            as zeros because decoding starts at step 1.
        """
        batch_size = src.shape[1]
        vocab_size = self.decoder.num_embeddings
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        e_outputs, hidden = self.encoder(src)
        # The first decoder input is the first row of the target.
        inp = trg[0, :]

        for t in range(1, trg_len):
            output, hidden = self.decoder(inp, hidden, e_outputs)
            outputs[t] = output
            # One coin flip per step decides the whole batch's next input.
            if random.random() < t_f:
                inp = trg[t, :]
            else:
                inp = output.argmax(1)
        return outputs
        

In [None]:
# Vocabulary sizes determine the embedding-table and output-layer dimensions.
num_src_embeddings = len(source_vocab)
num_trg_embeddings = len(target_vocab)

In [None]:
# First model: 128-dimensional embeddings and hidden state.
embedding_dim = hidden_size = 128
dp = 0.5
num_layers_enc = 2
num_layers_dec = 1
# Draw one batch from the loader; x and y are not used again in this cell.
x, y = next(iter(trainloader))
e1 = encoder(
    num_embeddings=num_src_embeddings,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers_enc,
    dp=dp,
)
attn = attention(hidden_size)
d1 = decoder(
    num_embeddings=num_trg_embeddings,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers_dec,
    dp=dp,
    attention=attn,
)
model1 = seq2seq(e1, d1, device).to(device)

In [None]:
def init_weights(m):
    """Re-initialise every parameter of `m` in place.

    Parameters whose name contains 'weight' are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; all other
    parameters are set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
            
# model1.apply(init_weights)

In [None]:
def count_parameters(model):
    """Return the total number of elements in `model`'s trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# print(f'The model has {count_parameters(model1):,} trainable parameters')

In [None]:
# Padding positions in the target must not contribute to the loss.
trg_pad_idx = target_vocab['<pad>']
loss_fn = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
# Adam over all of model1's parameters.
optimizer = torch.optim.Adam(model1.parameters(), lr=1e-3)

In [None]:
def train(model, dataloader, optimizer, loss_fn, t_f=0.2):
    """Train `model` for one pass over `dataloader`.

    Args:
        model: seq2seq-style module exposing a `device` attribute and called
            as `model(src, trg, trg_len, t_f)`.
        dataloader: sized iterable of `(x, y)` index tensors laid out as
            (batch, seq_len).
        optimizer: optimizer over the model's parameters.
        loss_fn: criterion taking (N, classes) scores and (N,) targets.
        t_f: teacher-forcing ratio passed to the model (default 0.2, the value
            previously hard-coded here).

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    epoch_loss = 0
    for x, y in dataloader:
        # The model works time-major: (batch, seq_len) -> (seq_len, batch).
        x, y = x.permute(1, 0).to(model.device), y.permute(1, 0).to(model.device)
        optimizer.zero_grad()
        # Take the target length from the batch itself rather than assuming
        # the 500-step padding used when the data was encoded.
        output = model(x, y, y.shape[0], t_f)
        # Skip step 0 (the start token, never predicted) and flatten time and
        # batch so the criterion sees one row per token.
        output = output[1:].reshape(-1, output.shape[-1])
        y = y[1:].reshape(-1)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss/len(dataloader)

def evaluate(model, dataloader, loss_fn):
    """Compute the mean per-batch loss of `model` over `dataloader`.

    The model is switched to eval mode, gradients are disabled, and decoding
    runs with a teacher-forcing ratio of 0.

    Args:
        model: seq2seq-style module exposing a `device` attribute and called
            as `model(src, trg, trg_len, t_f)`.
        dataloader: sized iterable of `(x, y)` index tensors laid out as
            (batch, seq_len).
        loss_fn: criterion taking (N, classes) scores and (N,) targets.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for x, y in dataloader:
            # The model works time-major: (batch, seq_len) -> (seq_len, batch).
            x, y = x.permute(1, 0).to(model.device), y.permute(1, 0).to(model.device)
            # Take the target length from the batch itself rather than
            # assuming the 500-step padding used when the data was encoded.
            output = model(x, y, y.shape[0], 0)
            # Skip step 0 (the start token, never predicted) and flatten time
            # and batch so the criterion sees one row per token.
            output = output[1:].reshape(-1, output.shape[-1])
            y = y[1:].reshape(-1)
            loss = loss_fn(output, y)
            epoch_loss += loss.item()
    return epoch_loss/len(dataloader)

In [None]:
# Train model1 for a fixed number of epochs, reporting the mean batch loss
# and the wall-clock duration of each epoch.
epochs = 4
for i in range(epochs):
    start = time.time()
    epoch_loss = train(model1, trainloader, optimizer, loss_fn)
    end = time.time()
    print(f'Epoch:{i+1}, Loss:{epoch_loss}, Time:{end-start}s')

In [None]:
# Persist model1's weights (state dict only, not the module object).
# NOTE(review): this path has no '.pth' suffix, unlike the other checkpoint
# paths in this notebook — confirm that is intended.
torch.save(model1.state_dict(), '/kaggle/working/q3-2')

In [None]:
# Encode the test split with the vocabularies built from the training data.
# NOTE(review): the vocabularies are built without an unknown-token fallback,
# so a test character never seen in training would presumably fail the
# lookup — verify against the data.
xv_test = build_array(data['test.sources'], source_vocab)
yv_test = build_array(data['test.targets'], target_vocab)

testdataset = TensorDataset(torch.tensor(xv_test), torch.tensor(yv_test))
# No shuffling for evaluation: evaluate() averages per-batch losses, so a
# fixed batch composition keeps the reported test loss reproducible.
testloader = DataLoader(testdataset, shuffle=False, batch_size=16)

# The dataset holds the tensors; release the intermediate Python lists.
del xv_test, yv_test

In [None]:
# Mean per-batch loss of model1 on the test split (no teacher forcing).
test_loss = evaluate(model1, testloader, loss_fn)
print(test_loss)

In [None]:
# Rebuild the 128-dimensional architecture and load previously saved weights
# into it; the hyperparameters must match those used when the checkpoint was
# written.
embedding_dim = 128
hidden_size = 128
dp = 0.5
num_layers_enc = 2
num_layers_dec = 1
e1 = encoder(num_embeddings=num_src_embeddings, embedding_dim=embedding_dim, hidden_size=hidden_size, num_layers=num_layers_enc, dp=dp)
attn = attention(hidden_size)
d1 = decoder(num_embeddings=num_trg_embeddings, embedding_dim=embedding_dim, hidden_size=hidden_size, num_layers=num_layers_dec, dp=dp, attention=attn)
model_inf = seq2seq(e1, d1, device).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
model_inf.load_state_dict(torch.load('/kaggle/input/tdlq3-1-1/q3-1.pth', map_location=device))

In [None]:
# Second model: same architecture with 256-dimensional embeddings and hidden
# state.
embedding_dim = hidden_size = 256
dp = 0.5
num_layers_enc = 2
num_layers_dec = 1
# Draw one batch from the loader; x and y are not used again in this cell.
x, y = next(iter(trainloader))
e2 = encoder(
    num_embeddings=num_src_embeddings,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers_enc,
    dp=dp,
)
attn = attention(hidden_size)
d2 = decoder(
    num_embeddings=num_trg_embeddings,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers_dec,
    dp=dp,
    attention=attn,
)
model2 = seq2seq(e2, d2, device).to(device)

# Fresh loss and optimizer bound to model2; padding positions are ignored.
trg_pad_idx = target_vocab['<pad>']
loss_fn = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
optimizer = torch.optim.Adam(model2.parameters(), lr=1e-3)

print(f'The model has {count_parameters(model2):,} trainable parameters')
# Unlike model1, model2 has its parameters re-initialised by init_weights.
model2.apply(init_weights)

In [None]:
# Train model2 for a fixed number of epochs, reporting the mean batch loss
# and the wall-clock duration of each epoch.
epochs = 4
for i in range(epochs):
    start = time.time()
    epoch_loss = train(model2, trainloader, optimizer, loss_fn)
    end = time.time()
    print(f'Epoch:{i+1}, Loss:{epoch_loss}, Time:{end-start}s')

In [None]:
# Persist model2's weights (state dict only, not the module object).
torch.save(model2.state_dict(), '/kaggle/working/q3-3.pth')

In [None]:
# Mean per-batch loss of model2 on the test split (no teacher forcing).
test_loss = evaluate(model2, testloader, loss_fn)
print(test_loss)

In [None]:
# Rebuild the 256-dimensional architecture and load previously saved weights
# into it; the hyperparameters must match those used when the checkpoint was
# written.
embedding_dim = 256
hidden_size = 256
dp = 0.5
num_layers_enc = 2
num_layers_dec = 1
e3 = encoder(num_embeddings=num_src_embeddings, embedding_dim=embedding_dim, hidden_size=hidden_size, num_layers=num_layers_enc, dp=dp)
attn = attention(hidden_size)
d3 = decoder(num_embeddings=num_trg_embeddings, embedding_dim=embedding_dim, hidden_size=hidden_size, num_layers=num_layers_dec, dp=dp, attention=attn)
model_inf2 = seq2seq(e3, d3, device).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
model_inf2.load_state_dict(torch.load('/kaggle/input/dltp3-3/q3-3.pth', map_location=device))