In [74]:
import pandas as pd
import numpy as np
import math
import ast

import torch
from torch import nn
import torch.nn.functional as F

torch.manual_seed(1337)

<torch._C.Generator at 0x7f0a23040f90>

In [75]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [76]:
test_data = pd.read_csv('test_data.csv')
train_data = pd.read_csv('train_data.csv')
valid_data = pd.read_csv('valid_data.csv')
word_vectors = pd.read_csv('wv.csv')

In [126]:
dictionary = {}
dictionary_size = word_vectors['vectors'].shape[0]

for i in range(dictionary_size):
    wv = np.fromstring(word_vectors['vectors'][i][1:-1], sep=' ')
    dictionary[word_vectors['word'][i]] = torch.from_numpy(wv)

In [203]:
X_train = []
y_train = []

X_test = []
y_test = []

X_valid = []
y_valid = []

for i in range(len(train_data)):
    tokens = ast.literal_eval(train_data['tokens'][i])
    pos_tags = ast.literal_eval(train_data['pos_tags'][i])

    for j in range(len(tokens)):
        if (tokens[j] not in dictionary):
            X_train += [torch.zeros(64)]
        else:
            X_train += [dictionary[tokens[j]]]
        y_train += [torch.tensor(pos_tags[j])]

for i in range(len(valid_data)):
    tokens = ast.literal_eval(valid_data['tokens'][i])
    pos_tags = ast.literal_eval(valid_data['pos_tags'][i])

    for j in range(len(tokens)):
        if (tokens[j] not in dictionary):
            X_valid += [torch.zeros(64)]
        else:
            X_valid += [dictionary[tokens[j]]]
        y_valid += [torch.tensor(pos_tags[j])]

for i in range(len(test_data)):
    tokens = ast.literal_eval(test_data['tokens'][i])
    pos_tags = ast.literal_eval(test_data['pos_tags'][i])

    for j in range(len(tokens)):
        if (tokens[j] not in dictionary):
            X_test += [torch.zeros(64)]
        else:
            X_test += [dictionary[tokens[j]]]
        y_test += [torch.tensor(pos_tags[j])]

In [204]:
block_size = 10
batch_size = 32

def get_batch(split):
    if split == 'train':
        ix = torch.randint(len(X_train) - block_size, (batch_size, ))
        x = torch.stack([torch.stack(X_train[i:i+block_size]) for i in ix])
        y = torch.stack([torch.stack(y_train[i:i+block_size]) for i in ix])
    elif split == 'valid':
        ix = torch.randint(len(X_valid) - block_size, (batch_size, ))
        x = torch.stack([torch.stack(X_valid[i:i+block_size]) for i in ix])
        y = torch.stack([torch.stack(y_valid[i:i+block_size]) for i in ix])
    else:
        ix = torch.randint(len(X_test) - block_size, (batch_size, ))
        x = torch.stack([torch.stack(X_test[i:i+block_size]) for i in ix])
        y = torch.stack([torch.stack(y_test[i:i+block_size]) for i in ix])
    x, y = x.type(torch.FloatTensor), y
    x, y = x.to(device), y.to(device)
    return x, y

In [205]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / (d_k ** 0.5)
    if mask is not None:
        scaled += mask
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        batch_size, max_sequence_length, d_model = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, max_sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.reshape(batch_size, max_sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out


class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y  + self.beta
        return out


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.tanh = nn.Tanh()

    def forward(self, x):
        x = self.linear1(x)
        x = self.tanh(x)
        x = self.linear2(x)
        return x


class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])

    def forward(self, x):
        x = self.norm1(x + self.attention(x, mask=None))
        x = self.norm2(x + self.ffn(x))
        return x

class Encoder(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers, block_size, d_input, d_output):
        super().__init__()
        self.position_embedding_table = nn.Embedding(block_size, d_model)
        self.layers = nn.Sequential(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])
        self.linear = nn.Linear(d_model, d_output)

    def forward(self, x, target=None):
        B, T, C = x.shape

        pos_embds = self.position_embedding_table(torch.arange(T, device=device))
        x = x + pos_embds

        x = self.layers(x)
        logits = self.linear(x)

        if target is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            target = target.view(B*T)
            loss = F.cross_entropy(logits, target)

        return logits, loss

In [216]:
d_model = 64
num_heads = 4
drop_prob = 0.0
batch_size = 30
max_sequence_length = 64
ffn_hidden = 256
num_layers = 4
learning_rate = 1e-3
max_iters = 30000
eval_interval = 500
eval_iters = 10

encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, block_size, 64, 47)

In [217]:
encoder = encoder.to(device)
print(sum(p.numel() for p in encoder.parameters())/1e6, 'M parameters')

0.203631 M parameters


In [220]:
@torch.no_grad()
def estimate_loss(model):
    out = {}
    accuracy = {}
    model.eval()
    for split in ['train', 'valid']:
        losses = torch.zeros(eval_iters)
        total = 0
        correct = 0
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()

            N, _ = logits.shape
            Y = Y.view(N)
            total += N
            for i in range(N):
                probs = F.softmax(logits[i], -1)
                idx = torch.multinomial(probs, num_samples=1)
                if (idx in [21, 22, 23, 24, 25, 28, 29] and Y[i] in [21, 22, 23, 24, 25, 28, 29]):
                    correct += 1
                elif (idx in [37, 38, 39, 40, 41, 42] and Y[i] in [37, 38, 39, 40, 41, 42]):
                    correct += 1
                elif (idx in [16, 17, 18, 30, 31, 32] and Y[i] in [16, 17, 18, 30, 31, 32]):
                    correct += 1
                elif (idx not in [21, 22, 23, 24, 25, 28, 29, 37, 38, 39, 40, 41, 42, 16, 17, 18, 30, 31, 32] and Y[i] not in [21, 22, 23, 24, 25, 28, 29, 37, 38, 39, 40, 41, 42, 16, 17, 18, 30, 31, 32]):
                    correct += 1

        out[split] = losses.mean()
        accuracy[split] = (float(correct)/total)
    model.train()
    return out, accuracy

In [221]:
optimizer = torch.optim.AdamW(encoder.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01)

for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses, accuracy = estimate_loss(encoder)
        print(f"step {iter}: train loss {losses['train']:.4f} accuracy: {accuracy['train']:.4f} valid loss {losses['valid']:.4f} accuracy: {accuracy['valid']:.4f}")

    Xb, yb = get_batch('train')
    _, loss = encoder(Xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 0.4422 accuracy: 0.8713 valid loss 0.4606 accuracy: 0.8730
step 500: train loss 0.4458 accuracy: 0.8853 valid loss 0.5097 accuracy: 0.8590
step 1000: train loss 0.4349 accuracy: 0.8733 valid loss 0.5195 accuracy: 0.8680
step 1500: train loss 0.4231 accuracy: 0.8830 valid loss 0.5006 accuracy: 0.8727
step 2000: train loss 0.4764 accuracy: 0.8760 valid loss 0.4928 accuracy: 0.8770
step 2500: train loss 0.4212 accuracy: 0.8817 valid loss 0.5215 accuracy: 0.8550
step 3000: train loss 0.4145 accuracy: 0.8897 valid loss 0.4301 accuracy: 0.8827
step 3500: train loss 0.4362 accuracy: 0.8730 valid loss 0.4335 accuracy: 0.8840
step 4000: train loss 0.3926 accuracy: 0.8930 valid loss 0.4677 accuracy: 0.8787
step 4500: train loss 0.4429 accuracy: 0.8797 valid loss 0.4439 accuracy: 0.8843
step 5000: train loss 0.4366 accuracy: 0.8797 valid loss 0.4667 accuracy: 0.8863
step 5500: train loss 0.4028 accuracy: 0.8893 valid loss 0.4855 accuracy: 0.8827
step 6000: train loss 0.4233 acc