In [1]:
import json
import csv
import random
import torch
import torch.nn as nn
import numpy as np
# use this library https://github.com/facebookresearch/fastText/tree/master/python
import fastText
import time

In [18]:
trainset = []
with open('data/snips_processed/snips_train.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        trainset.append(line)
trainset = np.array(trainset)

testset = []
with open('data/snips_processed/snips_test.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        testset.append(line)
testset = np.array(testset)

labels = list(set(trainset[:,0]))
lab2id = {}
id2lab = {}

for i in range(len(labels)):
    lab2id[labels[i]] = i
    id2lab[i] = labels[i]

In [3]:
sv_model = fastText.load_model('data/cc.sv.300.bin')
en_model = fastText.load_model('data/cc.en.300.bin')

In [4]:
SOS_VEC = np.zeros((1, 300))
EOS_VEC = np.zeros((1, 300))
SOS_VEC[0,0] = 1
EOS_VEC[0,1] = 1


In [15]:
def sentence2vecs(sentence, model):
    result = [SOS_VEC]
    sentence = sentence.strip()
    for word in sentence:
        result.append([model.get_word_vector(word.lower())])
    result.append(EOS_VEC)
    return result
    

In [19]:
def prepare_pair(label, sentence, model):
    return (lab2id[label]),(sentence2vecs(sentence, model))

def prepare_pairs(data, lang = 'en'):
    if lang == 'en':
        model = en_model
        slab = 1
    elif lang == 'sv':
        model = sv_model
        slab = 2
    else:
        raise RuntimeError('lang is not supported')
    labels = []
    vectors = []
    for sample in data:
        l, v = prepare_pair(sample[0], sample[slab], model)
        labels.append(l)
        vectors.append(v)
        
    return labels, vectors

In [20]:
p = prepare_pairs(trainset, 'en')

In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size=300):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        output, hidden = self.gru(input, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)
    
en_enc = EncoderRNN()

In [22]:
def comp_sent_vec(sentence_as_vecs, enc):
    hidden = enc.initHidden()
    for v in sentence_as_vecs:
        input = torch.tensor([v]).float()
        out, hidden = enc.forward(input, hidden)
    return out

    

In [23]:
class Baseline(nn.Module):
    def __init__(self, in_size = 300, out_size = 10):
        super(Baseline, self).__init__()

        self.W = nn.Linear(300, 10)
        self.out = nn.LogSoftmax(2)
        
    def forward(self, x):
        x = self.W(x)
        return self.out(x)

In [24]:
def train(model, criterion, en_optimizer,net_optimizer, labels, vectors, enc):
    model.zero_grad()
    loss = 0
    
    
    vectors = torch.cat([comp_sent_vec(x,enc) for x in vectors], 0)
    labels = torch.tensor(labels)

    model_out = model.forward(vectors)
    loss += criterion(model_out[:,0], labels)
    
    loss.backward()
    en_optimizer.step()
    net_optimizer.step()
    
    return loss.item()/len(labels)

In [25]:
def evaluate(model, labels, vectors, enc):
    with torch.no_grad():
        vectors = torch.cat([comp_sent_vec(x,enc) for x in vectors], 0)
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1

                
        loss = criterion(model_out[:,0], labels)
        return loss.item(), right/len(model_out)

In [26]:
net = Baseline()
enc_optimizer = torch.optim.Adam(en_enc.parameters())
net_optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()
labs, vecs = prepare_pairs(trainset, lang = 'en')
labst, vecst = prepare_pairs(testset, lang = 'en')
t =  time.time()

for i in range(1,1000):
    loss = train(net, criterion, enc_optimizer,net_optimizer, labs, vecs, en_enc)
    if not i% 5:
        eval_loss, acc = evaluate(net, labst, vecst, en_enc)
        _, train_acc = evaluate(net, labs, vecs, en_enc)
        print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}, train_acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc, train_acc))
        
    

#  5, 24682 sec. train loss: 0.0001263, eval loss: 1.0497, acc = 0.851, train_acc = 0.856
# 10, 55418 sec. train loss: 0.0000606, eval loss: 0.6883, acc = 0.851, train_acc = 0.856


KeyboardInterrupt: 

In [27]:
evaluate(net, labst, vecst, en_enc)

(0.6516625881195068, 0.8510255487585462)

In [24]:
'{:4d}'.format(int(time.time() - t))

'  68'