In [23]:
import json
import csv
import random
import torch
import torch.nn as nn
import numpy as np
# use this library https://github.com/facebookresearch/fastText/tree/master/python
import fastText


In [33]:
trainset = []
with open('data/snips_processed/snips_train.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        trainset.append(line)
trainset = np.array(trainset)

testset = []
with open('data/snips_processed/snips_test.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        testset.append(line)
testset = np.array(testset)

labels = list(set(trainset[:,0]))
lab2id = {}
id2lab = {}

for i in range(len(labels)):
    lab2id[labels[i]] = i
    id2lab[i] = labels[i]

In [34]:
sv_model = fastText.load_model('data/cc.sv.300.bin')
en_model = fastText.load_model('data/cc.en.300.bin')

In [35]:
def sentence_vec(sentence, model):
    result = np.zeros((1, 300))
    sentence = sentence.strip()
    for word in sentence:
        result += model.get_word_vector(word.lower())
    return result/len(sentence)
    

In [36]:
def prepare_pair(label, sentence, model):
    return (lab2id[label]),(sentence_vec(sentence, model))

def prepare_pairs(data, lang = 'en'):
    if lang == 'en':
        model = en_model
        slab = 1
    elif lang == 'sv':
        model = sv_model
        slab = 2
    else:
        raise RuntimeError('lang is not supported')
    labels = []
    vectors = []
    for sample in data:
        l, v = prepare_pair(sample[0], sample[slab], model)
        labels.append(l)
        vectors.append(v)
        
    return labels, vectors

In [37]:
class Baseline(nn.Module):
    def __init__(self, in_size = 300, out_size = 10):
        super(Baseline, self).__init__()

        self.W = nn.Linear(300, 10)
        self.out = nn.LogSoftmax(2)
        
    def forward(self, x):
        x = self.W(x)
        return self.out(x)



In [38]:
def train(model, criterion, optimizer, labels, vectors):
    model.zero_grad()
    loss = 0
    
    vectors = torch.tensor(vectors).float()
    labels = torch.tensor(labels)
    
    model_out = model.forward(vectors)
    loss += criterion(model_out[:,0], labels)
    
    loss.backward()
    optimizer.step()
    
    return loss.item()/len(labels)

In [39]:
def eval(model, labels, vectors):
    with torch.no_grad():
        vectors = torch.tensor(vectors).float()
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1

                
        loss = criterion(model_out[:,0], labels)
        return loss.item(), right/len(model_out)
    
        

In [40]:
net = Baseline()
optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()
labs, vecs = prepare_pairs(trainset, lang = 'sv')
labst, vecst = prepare_pairs(testset, lang = 'sv')


for i in range(4001):
    loss = train(net, criterion, optimizer, labs, vecs)
    if not i% 100:
        eval_loss, acc = eval(net, labst, vecst)
        print('#{:4d}, train loss: {:3f}, eval loss: {:3f}, acc = {:3f}'.format(i, loss, eval_loss, acc))
        
    

#   0, train loss: 0.000209, eval loss: 2.249134, acc = 0.697373
# 100, train loss: 0.000052, eval loss: 0.572726, acc = 0.851026
# 200, train loss: 0.000040, eval loss: 0.440538, acc = 0.851026
# 300, train loss: 0.000036, eval loss: 0.396479, acc = 0.851026
# 400, train loss: 0.000033, eval loss: 0.369428, acc = 0.851745
# 500, train loss: 0.000031, eval loss: 0.349387, acc = 0.853544
# 600, train loss: 0.000030, eval loss: 0.333423, acc = 0.857143
# 700, train loss: 0.000029, eval loss: 0.320255, acc = 0.861821
# 800, train loss: 0.000028, eval loss: 0.309168, acc = 0.863980
# 900, train loss: 0.000027, eval loss: 0.299699, acc = 0.869377
#1000, train loss: 0.000026, eval loss: 0.291517, acc = 0.874775
#1100, train loss: 0.000026, eval loss: 0.284377, acc = 0.879813
#1200, train loss: 0.000025, eval loss: 0.278093, acc = 0.882332
#1300, train loss: 0.000025, eval loss: 0.272518, acc = 0.885570
#1400, train loss: 0.000024, eval loss: 0.267535, acc = 0.889529
#1500, train loss: 0.0000

In [41]:
net = Baseline()
optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()
labs, vecs = prepare_pairs(trainset, lang = 'en')
labst, vecst = prepare_pairs(testset, lang = 'en')


for i in range(4001):
    loss = train(net, criterion, optimizer, labs, vecs)
    if not i% 100:
        eval_loss, acc = eval(net, labst, vecst)
        print('#{:4d}, train loss: {:3f}, eval loss: {:3f}, acc = {:3f}'.format(i, loss, eval_loss, acc))
        
    

#   0, train loss: 0.000214, eval loss: 2.296857, acc = 0.148974
# 100, train loss: 0.000048, eval loss: 0.526288, acc = 0.851026
# 200, train loss: 0.000037, eval loss: 0.406894, acc = 0.851026
# 300, train loss: 0.000033, eval loss: 0.360803, acc = 0.853185
# 400, train loss: 0.000030, eval loss: 0.331020, acc = 0.862540
# 500, train loss: 0.000028, eval loss: 0.309052, acc = 0.870817
# 600, train loss: 0.000027, eval loss: 0.292021, acc = 0.879813
# 700, train loss: 0.000026, eval loss: 0.278468, acc = 0.890248
# 800, train loss: 0.000025, eval loss: 0.267494, acc = 0.898884
# 900, train loss: 0.000024, eval loss: 0.258484, acc = 0.902123
#1000, train loss: 0.000023, eval loss: 0.251004, acc = 0.903922
#1100, train loss: 0.000023, eval loss: 0.244733, acc = 0.908960
#1200, train loss: 0.000022, eval loss: 0.239429, acc = 0.911479
#1300, train loss: 0.000022, eval loss: 0.234910, acc = 0.912199
#1400, train loss: 0.000021, eval loss: 0.231031, acc = 0.914358
#1500, train loss: 0.0000

In [302]:
def eval_visual(model, labels, vectors):
    with torch.no_grad():
        vectors = torch.tensor(vectors).float()
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        wrong = 0
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1
            else:
                print(id2lab[predicted], id2lab[true])
                wrong +=1
                
        print('{} out of {} = {}'.format(right, right+wrong, right/(right+wrong)))

In [269]:
eval_visual(net, labst, vecst)

ComparePlaces GetPlaceDetails
SearchPlace RequestRide
BookRestaurant GetWeather
GetPlaceDetails GetWeather
RequestRide GetTrafficInformation
SearchPlace GetPlaceDetails
GetTrafficInformation GetDirections
GetPlaceDetails GetWeather
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections SearchPlace
SearchPlace ShareETA
SearchPlace GetPlaceDetails
GetPlaceDetails GetTrafficInformation
GetTrafficInformation GetWeather
GetWeather ShareCurrentLocation
GetPlaceDetails ComparePlaces
SearchPlace BookRestaurant
GetPlaceDetails GetTrafficInformation
SearchPlace RequestRide
ComparePlaces GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetPlaceDetails GetWeather
RequestRide GetWeather
BookRestaurant GetWeather
BookRestaurant GetTrafficInformation
50 out of 78 = 0.6410256410256411


In [21]:
trainset

array([['RateBook', 'rate the current textbook zero out of 6 points'],
       ['BookRestaurant',
        'restaurant in Elberta for alma, deana and olga at 18:49:20 that serves tsipouro'],
       ['SearchScreeningEvent',
        'What movies are currently at Star Theatres?'],
       ...,
       ['PlayMusic', 'Please play Different Slanguages by Fred Labour.'],
       ['PlayMusic', 'play some King Tubby from the eighties'],
       ['AddToPlaylist',
        'add M-CABI to the playlist named Pre-Party R&B Jams']],
      dtype='<U186')

In [32]:
lab2id

{'what films are on the movie schedule for ten at a close by theater': 0,
 'play new music by Elmore James with google music': 1,
 'What is the weather going to be like in Flint Hill in 9 years  ': 2,
 'add the artist Tomohisa Yamashita to the conexiones playlist': 3,
 'Rate The Hollow Man 5 points': 4,
 'Give A History of the Mind a 2 out of 6 points.': 5,
 'add spanish castle magic to Daily Lift': 6,
 'Rate the current novel a 3. ': 7,
 'Can you locate the novel, Ismol Family': 8,
 'Will it be rainy this Monday in Le Center, Thailand': 9,
 'What will the weather be in Grand Coteau UT at six pm?': 10,
 'whats the closest movie theatre showing animated movies': 11,
 'My group of seven wants to eat at Yogurt Mountain in Slick tonight.': 12,
 "I want to hear Sarban's greatest hits": 13,
 'book a pub for pepperoni near their house and not far': 14,
 'Book me a reestaurant that is close in the country of Cocos Islands': 15,
 'show Chocolate Rain creativity': 16,
 'Add another album to my k