In [28]:
import json
import csv
import random
import time
import torch
import torch.nn as nn
import numpy as np
# use this library https://github.com/facebookresearch/fastText/tree/master/python
import fastText



In [68]:
dataset = []
with open('data/snips_processed/snips.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        dataset.append(line)
dataset = np.array(dataset)

testset = dataset[10000 :]

trainset = dataset[:10000]

train_sent_en = trainset[:,1]
train_sent_sv = trainset[:,2]

train_lab = trainset[:,0]

test_sent_en = testset[:,1]
test_sent_sv = testset[:,2]

test_lab = testset[:,0]

labels = list(set(test_lab))
lab2id = {}
id2lab = {}

for i in range(len(labels)):
    lab2id[labels[i]] = i
    id2lab[i] = labels[i]

In [43]:
sv_model = fastText.load_model('data/cc.sv.300.bin')
en_model = fastText.load_model('data/cc.en.300.bin')

In [22]:
def sentence_vec(sentence, model):
    result = np.zeros((1, 300))
    sentence = sentence.strip()
    for word in sentence:
        result += model.get_word_vector(word.lower())
    return result/len(sentence)
    

In [23]:
def prepare_pair(label, sentence, model):
    return (lab2id[label]),(sentence_vec(sentence, model))

def prepare_pairs(data, lang = 'en'):
    if lang == 'en':
        model = en_model
        slab = 1
    elif lang == 'sv':
        model = sv_model
        slab = 2
    else:
        raise RuntimeError('lang is not supported')
    labels = []
    vectors = []
    for sample in data:
        l, v = prepare_pair(sample[0], sample[slab], model)
        labels.append(l)
        vectors.append(v)
        
    return labels, vectors

In [24]:
class Baseline(nn.Module):
    def __init__(self, in_size = 300, out_size = 7):
        super(Baseline, self).__init__()

        self.W = nn.Linear(300, 7)
        self.out = nn.LogSoftmax(2)
        
    def forward(self, x):
        x = self.W(x)
        return self.out(x)



In [25]:
def train(model, criterion, optimizer, labels, vectors):
    model.zero_grad()
    loss = 0
    
    vectors = torch.tensor(vectors).float()
    labels = torch.tensor(labels)
    
    model_out = model.forward(vectors)
    loss += criterion(model_out[:,0], labels)
    
    loss.backward()
    optimizer.step()
    
    return loss.item()/len(labels)

In [26]:
def eval(model, labels, vectors):
    with torch.no_grad():
        vectors = torch.tensor(vectors).float()
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1

                
        loss = criterion(model_out[:,0], labels)
        return loss.item(), right/len(model_out)
    
        

In [27]:
net = Baseline()
optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()
labs, vecs = prepare_pairs(trainset, lang = 'sv')
labst, vecst = prepare_pairs(testset, lang = 'sv')

t = time.time()
for i in range(4001):
    loss = train(net, criterion, optimizer, labs, vecs)
    if not i% 100:
        eval_loss, acc = eval(net, labst, vecst)
        print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc))
        
    

#   0, train loss: 0.000177, eval loss: 1.944315, acc = 0.177484
# 100, train loss: 0.000164, eval loss: 1.804358, acc = 0.575724
# 200, train loss: 0.000154, eval loss: 1.685173, acc = 0.594426
# 300, train loss: 0.000144, eval loss: 1.585696, acc = 0.603227
# 400, train loss: 0.000137, eval loss: 1.502525, acc = 0.613861
# 500, train loss: 0.000130, eval loss: 1.432555, acc = 0.625229
# 600, train loss: 0.000125, eval loss: 1.373170, acc = 0.633297
# 700, train loss: 0.000120, eval loss: 1.322265, acc = 0.640631
# 800, train loss: 0.000116, eval loss: 1.278196, acc = 0.648331
# 900, train loss: 0.000113, eval loss: 1.239691, acc = 0.654565
#1000, train loss: 0.000109, eval loss: 1.205762, acc = 0.658966
#1100, train loss: 0.000107, eval loss: 1.175633, acc = 0.663733
#1200, train loss: 0.000104, eval loss: 1.148698, acc = 0.668867
#1300, train loss: 0.000102, eval loss: 1.124469, acc = 0.672167
#1400, train loss: 0.000100, eval loss: 1.102557, acc = 0.672901
#1500, train loss: 0.0000

In [29]:
net = Baseline()
optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()
labs, vecs = prepare_pairs(trainset, lang = 'en')
labst, vecst = prepare_pairs(testset, lang = 'en')

t = time.time()
for i in range(4001):
    loss = train(net, criterion, optimizer, labs, vecs)
    if not i% 100:
        eval_loss, acc = eval(net, labst, vecst)
        print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc))
        
    

#  0,     0 sec. train loss: 0.0001774, eval loss: 1.9437, acc = 0.150
#100,    31 sec. train loss: 0.0001611, eval loss: 1.7688, acc = 0.579
#200,    73 sec. train loss: 0.0001481, eval loss: 1.6255, acc = 0.612
#300,   105 sec. train loss: 0.0001375, eval loss: 1.5095, acc = 0.635
#400,   137 sec. train loss: 0.0001290, eval loss: 1.4151, acc = 0.646
#500,   179 sec. train loss: 0.0001220, eval loss: 1.3376, acc = 0.653
#600,   210 sec. train loss: 0.0001162, eval loss: 1.2733, acc = 0.662
#700,   244 sec. train loss: 0.0001114, eval loss: 1.2194, acc = 0.669
#800,   276 sec. train loss: 0.0001074, eval loss: 1.1740, acc = 0.675
#900,   309 sec. train loss: 0.0001039, eval loss: 1.1352, acc = 0.680
#1000,   339 sec. train loss: 0.0001009, eval loss: 1.1020, acc = 0.681
#1100,   369 sec. train loss: 0.0000984, eval loss: 1.0732, acc = 0.685
#1200,   398 sec. train loss: 0.0000961, eval loss: 1.0481, acc = 0.689
#1300,   427 sec. train loss: 0.0000941, eval loss: 1.0261, acc = 0.695
#1

In [302]:
def eval_visual(model, labels, vectors):
    with torch.no_grad():
        vectors = torch.tensor(vectors).float()
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        wrong = 0
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1
            else:
                print(id2lab[predicted], id2lab[true])
                wrong +=1
                
        print('{} out of {} = {}'.format(right, right+wrong, right/(right+wrong)))

In [269]:
eval_visual(net, labst, vecst)

ComparePlaces GetPlaceDetails
SearchPlace RequestRide
BookRestaurant GetWeather
GetPlaceDetails GetWeather
RequestRide GetTrafficInformation
SearchPlace GetPlaceDetails
GetTrafficInformation GetDirections
GetPlaceDetails GetWeather
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections SearchPlace
SearchPlace ShareETA
SearchPlace GetPlaceDetails
GetPlaceDetails GetTrafficInformation
GetTrafficInformation GetWeather
GetWeather ShareCurrentLocation
GetPlaceDetails ComparePlaces
SearchPlace BookRestaurant
GetPlaceDetails GetTrafficInformation
SearchPlace RequestRide
ComparePlaces GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetPlaceDetails GetWeather
RequestRide GetWeather
BookRestaurant GetWeather
BookRestaurant GetTrafficInformation
50 out of 78 = 0.6410256410256411


In [50]:
trainset

array([['BookRestaurant',
        'I need a table for a party of 2 at Co-operative Block Building in Old Glory',
        'Jag behöver ett bord för en part 2 på Co-operative Blockera Byggnad i Gamla Ära'],
       ['BookRestaurant',
        'I need a table for a party of 2 at Co-operative Block Building in Old Glory',
        'Jag behöver ett bord för en part 2 på Co-operative Blockera Byggnad i Gamla Ära'],
       ['RateBook', 'I think this saga only deserves a 0 out of 6.',
        'Jag tror att detta bara saga förtjänar en 0 av 6.'],
       ...,
       ['PlayMusic', 'play the best album from the seventies',
        'spela bästa album från sjuttiotalet'],
       ['PlayMusic', 'Play a 1991 song by Anila Mirza',
        'Spela en 1991 låt av Anila Mirza'],
       ['PlayMusic',
        'Play a top five Jonny Buckland symphony from around 1989',
        'Spela en topp fem Jonny Buckland symphony från omkring 1989']],
      dtype='<U187')

In [82]:
labs, vecs = prepare_pairs(trainset, lang = 'sv')
labst, vecst = prepare_pairs(testset, lang = 'sv')

In [69]:
labs, vecs = prepare_pairs(dataset, lang = 'sv')


In [70]:
len(list(trainset[:,2])),len(set(trainset[:,2]))

(10000, 9880)

In [71]:
np.save('data/snips_processed/W2V-sv',np.array(vecs))

In [41]:
npvecs = np.squeeze(np.array(vecs), axis=1)

In [66]:
np.linalg.matrix_rank(npvecs)

70

In [83]:
eval(net, labs[:300], vecs[:300]), eval(net, labst, vecst)

((9.434146881103516, 0.13), (9.236837387084961, 0.10914376321353066))

In [81]:
labst[:10],'\n', labs[10000:10010], '\n', testset[:10,0], prepare_pairs(testset[:10], lang = 'sv')

([5, 0, 6, 4, 6, 3, 1, 0, 0, 6],
 '\n',
 [2, 2, 4, 6, 2, 3, 4, 6, 0, 4],
 '\n',
 array(['BookRestaurant', 'BookRestaurant', 'RateBook', 'GetWeather',
        'BookRestaurant', 'AddToPlaylist', 'RateBook', 'GetWeather',
        'SearchCreativeWork', 'RateBook'], dtype='<U187'),
 ([2, 2, 4, 6, 2, 3, 4, 6, 0, 4],
  [array([[ 4.46218504e-02, -5.82794164e-02,  3.71881625e-02,
            4.30101094e-02,  4.80079499e-02,  2.42398227e-02,
           -6.10103861e-02, -7.48268379e-02,  5.00496890e-02,
           -1.22885271e-01,  6.81696118e-02, -1.68878107e-01,
           -4.91728637e-02,  7.21554516e-02, -1.52816554e-01,
            2.79369355e-02,  8.14167189e-02,  6.50745787e-02,
            3.53033417e-02,  2.21541326e-02, -8.14735925e-02,
            1.37550606e-02, -1.88545898e-02,  3.75311122e-02,
            4.05550914e-02,  6.63977120e-03,  4.50894047e-03,
            5.88019016e-02, -1.97608137e-03,  5.69817123e-02,
           -8.99550740e-02,  2.38793726e-02, -9.29237566e-02,
      