In [2]:
import json
import csv
import random
import time
import torch
import torch.nn as nn
import numpy as np
# use this library https://github.com/facebookresearch/fastText/tree/master/python
import fastText

import contractions
import unicodedata
import re

from utils import *


In [5]:
dataset, lab2id, id2lab =import_data('data/snips_processed/snips.csv')

In [6]:
sv_model = fastText.load_model('data/cc.sv.300.bin')
en_model = fastText.load_model('data/cc.en.300.bin')

In [7]:
def sentence_vec(sentence, model):
    result = np.zeros((1, 300))
    for word in sentence.split():
        result += model.get_word_vector(word)
    return result/len(sentence)
    

In [8]:
def prepare_sentence_vecs(data, lang = 'en', preprocess=False):
    
    if lang == 'en':
        model = en_model
        slab = 1
    elif lang == 'sv':
        model = sv_model
        slab = 2
    else:
        raise RuntimeError('lang is not supported')
    vectors = []
    
    sents = pre_process_text(data[:, slab])  if preprocess else data[:, slab]
    vecs = list(map(lambda x:sentence_vec(x, model), sents))
        
    return vecs

In [9]:
lab2id

{'BookRestaurant': 0,
 'GetWeather': 1,
 'SearchScreeningEvent': 2,
 'RateBook': 3,
 'SearchCreativeWork': 4,
 'AddToPlaylist': 5,
 'PlayMusic': 6}

In [10]:
vecs_en = prepare_sentence_vecs(dataset, lang='en', preprocess=True)
print('en-done')
vecs_sv = prepare_sentence_vecs(dataset, lang='sv', preprocess=True)
print('sv-done')
labs = prepare_labs(dataset[:,0], lab2id)



en-done
sv-done


In [77]:
def full_train_model(train_vecs, train_labs, test_vecs, test_labs, verbose = False, runs = 4001):
    net = Baseline(in_size=300)
    optimizer = torch.optim.Adam(net.parameters())
    criterion = torch.nn.NLLLoss()
    
    
    tvecs = torch.tensor(train_vecs).float()
    tvecst = torch.tensor(test_vecs).float()
    tlabs = torch.tensor(train_labs)
    tlabst = torch.tensor(test_labs)
        
    t = time.time()
    for i in range(6001):
        loss = train(net, criterion, optimizer, tlabs, tvecs)
        if verbose and not i% 100:
            eval_loss, acc = evaluate(net, tlabst, tvecst, criterion)
            print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc))
    
    eval_loss, acc = evaluate(net, tlabst, tvecst, criterion)
    return acc


In [78]:
def cv(vecs, labs, folds = 5):
    
    delims = np.arange(0, len(vecs), len(vecs)//folds)
    results = []
    t = time.time()
    for i in range(folds):
        results.append(
            full_train_model(vecs[:delims[i]] + vecs[delims[i+1]:],
                             labs[:delims[i]] + labs[delims[i+1]:],
                             vecs[delims[i] : delims[i+1]],
                             labs[delims[i] : delims[i+1]],
                                  False, runs = 6001))
        print('#{:3d}, {:5d} sec. acc = {:.3f}'.format(i, int(time.time() - t), results[-1]))

    return(sum(results)/len(results))   
    

In [79]:
cv(vecs_en, labs)

#  0,    68 sec. acc = 0.914
#  1,   135 sec. acc = 0.930
#  2,   202 sec. acc = 0.919
#  3,   267 sec. acc = 0.922
#  4,   347 sec. acc = 0.913


0.9195210449927431

In [80]:
cv(vecs_sv, labs)

#  0,   289 sec. acc = 0.882
#  1,  1332 sec. acc = 0.889
#  2,  1571 sec. acc = 0.888
#  3,  1660 sec. acc = 0.878
#  4,  1735 sec. acc = 0.881


0.8835994194484762

In [13]:
vecs_sv[0].shape

(1, 300)

In [76]:
np.linalg.matrix_rank(np.squeeze(np.array(vecs_en), axis=1))

300

In [81]:
np.save('data/snips_processed/FastText-en',np.array(vecs_en))
np.save('data/snips_processed/FastText-sv',np.array(vecs_sv))