In [6]:
import json
import csv
import random
import torch
import torch.nn as nn
import numpy as np
import contractions
import unicodedata
import re
import time

from utils import *


In [7]:
# Load the processed SNIPS CSV together with the label<->id lookup tables.
# `import_data` is not defined in this notebook; presumably it arrives via
# `from utils import *` -- TODO confirm.
# Later cells read dataset[:, 0] for labels, column 1 for English text and
# column 2 for Swedish text.
dataset, lab2id, id2lab = import_data('data/snips_processed/snips.csv')

In [8]:
# NOTE(review): this import sits outside the notebook's import cell; consider
# moving it there so all dependencies are visible in one place.
from elmoformanylangs import Embedder

# Two embedders loaded from local model directories. prepare_sentence_vecs
# picks en_model for lang='en' and sv_model for lang='sv'.
en_model = Embedder('models/144')
sv_model = Embedder('models/173')

2019-05-09 19:15:17,096 INFO: char embedding size: 4939
2019-05-09 19:15:18,224 INFO: word embedding size: 167642
2019-05-09 19:15:25,790 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(167642, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(4939, 50, padding_idx=4936)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_fe

In [15]:
def prepare_sentence_vecs(data, lang = 'en', preprocess = True):
    """Embed one text column of ``data`` as mean-pooled ELMo sentence vectors.

    Parameters
    ----------
    data : 2-D indexable
        Must support ``data[:, col]``. Column 1 is read for ``lang='en'`` and
        column 2 for ``lang='sv'``; pass the whole table, not a single column.
    lang : {'en', 'sv'}
        Chooses both the text column and the embedder (``en_model`` or
        ``sv_model``, which must already exist in the notebook namespace).
    preprocess : bool
        When true, the selected column goes through ``pre_process_text``
        before whitespace tokenisation.

    Returns
    -------
    list
        One single-element list per sentence, holding the mean over axis 0 of
        that sentence's ``sents2elmo`` output.

    Raises
    ------
    RuntimeError
        If ``lang`` is neither ``'en'`` nor ``'sv'``.
    """
    if lang == 'en':
        model, column = en_model, 1
    elif lang == 'sv':
        model, column = sv_model, 2
    else:
        raise RuntimeError('lang is not supported')

    texts = data[:, column]
    if preprocess:
        texts = pre_process_text(texts)

    tokenised = [text.split() for text in texts]
    embedded = model.sents2elmo(tokenised)

    # Each pooled vector stays wrapped in its own list: downstream cells rely
    # on the resulting singleton axis (see the np.squeeze(..., axis=1) check).
    return [[sentence.mean(axis=0)] for sentence in embedded]

In [16]:
# Embed every sentence once per language. This is the slow step of the
# notebook: the recorded run below took roughly half an hour in total.
# NOTE(review): consider loading the arrays written by the np.save cell
# instead of recomputing on every fresh kernel.
vecs_en = prepare_sentence_vecs(dataset, lang='en', preprocess=True)
print('en-done')
vecs_sv = prepare_sentence_vecs(dataset, lang='sv', preprocess=True)
print('sv-done')
# Column 0 of the dataset holds the labels; prepare_labs is given lab2id to
# encode them (helper defined outside this notebook).
labs = prepare_labs(dataset[:,0], lab2id)


2019-05-09 19:17:22,582 INFO: 216 batches, avg len: 11.0
2019-05-09 19:18:16,419 INFO: Finished 1000 sentences.
2019-05-09 19:19:28,068 INFO: Finished 2000 sentences.
2019-05-09 19:21:03,398 INFO: Finished 3000 sentences.
2019-05-09 19:22:43,078 INFO: Finished 4000 sentences.
2019-05-09 19:25:07,442 INFO: Finished 5000 sentences.
2019-05-09 19:26:44,032 INFO: Finished 6000 sentences.
2019-05-09 19:27:39,693 INFO: Finished 7000 sentences.
2019-05-09 19:28:58,364 INFO: Finished 8000 sentences.
2019-05-09 19:30:53,596 INFO: Finished 9000 sentences.
2019-05-09 19:32:11,276 INFO: Finished 10000 sentences.
2019-05-09 19:33:47,273 INFO: Finished 11000 sentences.
2019-05-09 19:35:08,786 INFO: Finished 12000 sentences.
2019-05-09 19:36:02,834 INFO: Finished 13000 sentences.


en-done


2019-05-09 19:36:58,923 INFO: 216 batches, avg len: 10.7
2019-05-09 19:37:46,885 INFO: Finished 1000 sentences.
2019-05-09 19:38:41,387 INFO: Finished 2000 sentences.
2019-05-09 19:39:22,613 INFO: Finished 3000 sentences.
2019-05-09 19:40:10,497 INFO: Finished 4000 sentences.
2019-05-09 19:41:07,780 INFO: Finished 5000 sentences.
2019-05-09 19:41:56,874 INFO: Finished 6000 sentences.
2019-05-09 19:42:50,912 INFO: Finished 7000 sentences.
2019-05-09 19:43:36,157 INFO: Finished 8000 sentences.
2019-05-09 19:44:24,914 INFO: Finished 9000 sentences.
2019-05-09 19:45:13,743 INFO: Finished 10000 sentences.
2019-05-09 19:45:56,937 INFO: Finished 11000 sentences.
2019-05-09 19:46:52,175 INFO: Finished 12000 sentences.
2019-05-09 19:47:39,212 INFO: Finished 13000 sentences.


sv-done


In [40]:
def full_train_model(train_vecs, train_labs, test_vecs, test_labs, verbose = False, runs = 6001):
    """Fit a fresh ``Baseline`` classifier and report its held-out accuracy.

    A new ``Baseline(in_size=1024)`` is optimised with Adam against an NLL
    loss for ``runs`` calls to ``train`` on the whole training set. When
    ``verbose`` is true, the held-out set is scored every 100th iteration and
    a progress line is printed.

    ``Baseline``, ``train`` and ``evaluate`` are not defined in this cell;
    they are expected to be in the notebook namespace already.

    Returns
    -------
    tuple
        ``(accuracy, network)`` where accuracy is the second value returned
        by the final ``evaluate`` call on the held-out data.
    """
    classifier = Baseline(in_size=1024)
    adam = torch.optim.Adam(classifier.parameters())
    nll = torch.nn.NLLLoss()

    # Inputs are cast to float tensors; labels keep the dtype torch infers.
    train_x = torch.tensor(train_vecs).float()
    test_x = torch.tensor(test_vecs).float()
    train_y = torch.tensor(train_labs)
    test_y = torch.tensor(test_labs)

    started = time.time()
    for step in range(runs):
        loss = train(classifier, nll, adam, train_y, train_x)
        if not verbose or step % 100 != 0:
            continue
        eval_loss, acc = evaluate(classifier, test_y, test_x, nll)
        elapsed = int(time.time() - started)
        print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(step, elapsed, loss, eval_loss, acc))

    _, final_acc = evaluate(classifier, test_y, test_x, nll)
    return final_acc, classifier
    

In [21]:
def cv(vecs, labs, folds = 5, runs = 6001, train_fn = None):
    """Run k-fold cross-validation and return the mean held-out accuracy.

    The samples are cut, in their given order (no shuffling), into ``folds``
    contiguous parts whose sizes differ by at most one. Each part serves once
    as the test set while the rest is used for training.

    Parameters
    ----------
    vecs, labs : sequences of equal length
        Feature vectors and labels. Any sliceable sequence works; training
        slices are converted to lists before being joined.
    folds : int
        Number of folds; must satisfy ``2 <= folds <= len(vecs)``.
    runs : int
        Training iterations forwarded to the trainer (previously hard-coded).
    train_fn : callable, optional
        Called as ``train_fn(train_vecs, train_labs, test_vecs, test_labs,
        False, runs=runs)`` and must return ``(accuracy, model)``. Defaults
        to ``full_train_model`` from the notebook namespace.

    Returns
    -------
    float
        Arithmetic mean of the per-fold accuracies.

    Raises
    ------
    ValueError
        If the inputs differ in length or ``folds`` is out of range.
    """
    n_samples = len(vecs)
    if len(labs) != n_samples:
        raise ValueError('vecs and labs must have the same length')
    if folds < 2 or folds > n_samples:
        raise ValueError('folds must be between 2 and the number of samples')
    if train_fn is None:
        train_fn = full_train_model

    # Integer boundaries 0 = b0 <= b1 <= ... <= b_folds = n_samples.
    # The previous np.arange(0, n, n // folds) produced only `folds` boundaries
    # whenever n was divisible by folds (IndexError on the last fold), and
    # otherwise never tested the trailing remainder samples.
    bounds = [n_samples * k // folds for k in range(folds + 1)]

    results = []
    t = time.time()
    for i in range(folds):
        lo, hi = bounds[i], bounds[i + 1]
        # list(...) makes `+` a concatenation even for ndarray inputs, where
        # it would otherwise be an element-wise sum.
        acc, _ = train_fn(list(vecs[:lo]) + list(vecs[hi:]),
                          list(labs[:lo]) + list(labs[hi:]),
                          vecs[lo:hi],
                          labs[lo:hi],
                          False, runs = runs)

        results.append(acc)
        print('#{:3d}, {:5d} sec. acc = {:.3f}'.format(i, int(time.time() - t), results[-1]))

    return sum(results) / len(results)
    

In [22]:
# Cross-validate on the English sentence vectors with cv's default of 5 folds;
# the cell's value is the mean accuracy over the folds.
cv(vecs_en, labs)

#  0,   132 sec. acc = 0.978
#  1,   266 sec. acc = 0.982
#  2,   399 sec. acc = 0.980
#  3,   532 sec. acc = 0.975
#  4,   675 sec. acc = 0.975


0.9779390420899855

In [23]:
# Same cross-validation on the Swedish sentence vectors, reusing the same
# label list as the English run.
cv(vecs_sv, labs)

#  0,   134 sec. acc = 0.957
#  1,   269 sec. acc = 0.967
#  2,   402 sec. acc = 0.962
#  3,   535 sec. acc = 0.960
#  4,   666 sec. acc = 0.959


0.9609579100145137

In [41]:

# Single train/evaluate run on the English vectors: the first 10000 rows (in
# stored order, no shuffling) train the network, the remainder is held out.
# verbose=True prints train/eval loss and accuracy every 100 iterations.
# `model` is kept for the hand-written sentence predictions further down.
acc, model = full_train_model(vecs_en[:10000], labs[:10000],vecs_en[10000:], labs[10000:],verbose=True, runs = 2001)

#  0,     0 sec. train loss: 0.0001957, eval loss: 1.9207, acc = 0.296
#100,     2 sec. train loss: 0.0000438, eval loss: 0.4498, acc = 0.942
#200,     5 sec. train loss: 0.0000251, eval loss: 0.2686, acc = 0.957
#300,     7 sec. train loss: 0.0000183, eval loss: 0.2032, acc = 0.964
#400,    10 sec. train loss: 0.0000147, eval loss: 0.1689, acc = 0.969
#500,    12 sec. train loss: 0.0000124, eval loss: 0.1478, acc = 0.971
#600,    14 sec. train loss: 0.0000108, eval loss: 0.1334, acc = 0.972
#700,    17 sec. train loss: 0.0000096, eval loss: 0.1230, acc = 0.973
#800,    19 sec. train loss: 0.0000087, eval loss: 0.1151, acc = 0.972
#900,    21 sec. train loss: 0.0000079, eval loss: 0.1089, acc = 0.972
#1000,    23 sec. train loss: 0.0000073, eval loss: 0.1040, acc = 0.973
#1100,    26 sec. train loss: 0.0000067, eval loss: 0.1000, acc = 0.974
#1200,    29 sec. train loss: 0.0000062, eval loss: 0.0966, acc = 0.975
#1300,    31 sec. train loss: 0.0000058, eval loss: 0.0938, acc = 0.975
#1

In [51]:

# Ad-hoc sanity check: classify a few hand-written utterances with the network
# trained in the previous cell. The misspellings and transliterated titles in
# the sentences are part of the probe and are left exactly as written.
my_test_data = ['is it going to be sunny tomorrow?',
                'Ponedelnik nachinayensa v subboty is great, rate it five out of five on litres',
                'show paintings by Edward Munk',
                'get me a table at Molleys on thursday nigt',
                'I want to hear zvezda po imeny solntse',
                'Can you add this song to my list of terrible music ?',
                'whant to see avengers movie',
                'where can i see avengers movie',
                'where can i see avengers movie tomorrow',
                'give me schedule for avengers movie']

# Same pipeline as prepare_sentence_vecs: preprocess, whitespace-tokenise,
# embed, then mean-pool over axis 0, keeping each vector wrapped in a list.
sents = pre_process_text(my_test_data)
sents = [sent.split() for sent in sents]
mtv = en_model.sents2elmo(sents)
mtv = [[emb.mean(axis=0)] for emb in mtv]

# Stack into one ndarray first; building a tensor straight from nested lists
# of ndarrays is needlessly slow.
mtv = torch.tensor(np.array(mtv)).float()

# Call the module itself rather than .forward() so registered hooks run, and
# skip autograd bookkeeping because this is inference only.
with torch.no_grad():
    model_out = model(mtv)

for text, scores in zip(my_test_data, model_out):
    _, best = scores.topk(1)
    predicted = id2lab[best.item()]
    print(predicted.center(20), text)

2019-05-10 12:36:02,791 INFO: 1 batches, avg len: 9.8


     GetWeather      is it going to be sunny tomorrow?
      RateBook       Ponedelnik nachinayensa v subboty is great, rate it five out of five on litres
 SearchCreativeWork  show paintings by Edward Munk
   BookRestaurant    get me a table at Molleys on thursday nigt
     PlayMusic       I want to hear zvezda po imeny solntse
   AddToPlaylist     Can you add this song to my list of terrible music ?
 SearchCreativeWork  whant to see avengers movie
 SearchCreativeWork  where can i see avengers movie
SearchScreeningEvent where can i see avengers movie tomorrow
SearchScreeningEvent give me schedule for avengers movie


In [67]:
# NOTE(review): this displays the entire list of sentence vectors, one entry
# per dataset row; a slice such as vecs_en[:5] would keep the saved notebook
# small and readable.
vecs_en

[[array([ 0.04016051,  0.09717539, -0.34789434, ..., -0.00368522,
          0.12931311, -0.07019039], dtype=float32)],
 [array([-0.22952786,  0.43852776, -0.41361895, ...,  0.02657003,
          0.17530717,  0.09127494], dtype=float32)],
 [array([-0.16851561,  0.00349546, -0.13739517, ..., -0.17584783,
         -0.04614789, -0.08018098], dtype=float32)],
 [array([-0.01151042,  0.53884536, -0.47006974, ...,  0.26074356,
          0.12880903,  0.07174455], dtype=float32)],
 [array([ 0.11014143,  0.14462428, -0.3916171 , ..., -0.03997735,
          0.15999442,  0.11326421], dtype=float32)],
 [array([ 0.09576765,  0.1827814 , -0.00362419, ...,  0.21593502,
         -0.0611985 , -0.08854359], dtype=float32)],
 [array([-0.25031805,  0.13836421, -0.24467887, ...,  0.13460772,
          0.1628169 , -0.01556131], dtype=float32)],
 [array([-0.22694956,  0.18063396, -0.22006586, ..., -0.250987  ,
          0.1468881 ,  0.14770496], dtype=float32)],
 [array([-0.00519705,  0.30699226, -0.32516843, 

In [212]:
# prepare_sentence_vecs selects the language column itself (data[:, 2] for
# lang='sv'), so it must receive the whole dataset. Passing the already
# sliced dataset[:, 2] makes that internal lookup index a column that is no
# longer there (assuming dataset is 2-D, as its other uses suggest).
sv_vecs = prepare_sentence_vecs(dataset, lang = 'sv')

2019-05-08 14:49:54,488 INFO: 216 batches, avg len: 10.7
2019-05-08 14:50:54,030 INFO: Finished 1000 sentences.
2019-05-08 14:51:58,782 INFO: Finished 2000 sentences.
2019-05-08 14:52:51,576 INFO: Finished 3000 sentences.
2019-05-08 14:53:41,265 INFO: Finished 4000 sentences.
2019-05-08 14:54:32,148 INFO: Finished 5000 sentences.
2019-05-08 14:55:29,312 INFO: Finished 6000 sentences.
2019-05-08 14:56:20,994 INFO: Finished 7000 sentences.
2019-05-08 14:57:04,597 INFO: Finished 8000 sentences.
2019-05-08 14:58:07,532 INFO: Finished 9000 sentences.
2019-05-08 14:59:51,630 INFO: Finished 10000 sentences.
2019-05-08 15:01:10,891 INFO: Finished 11000 sentences.
2019-05-08 15:02:34,081 INFO: Finished 12000 sentences.
2019-05-08 15:03:32,505 INFO: Finished 13000 sentences.


In [192]:
# NOTE(review): neither `train_labs` nor `sv_vecs_train` is assigned anywhere
# in the visible notebook, so this cell fails on a fresh kernel. It looks like
# a leftover from an earlier session -- define the split explicitly or drop it.
len(train_labs), len(sv_vecs_train)

(10000, 10000)

In [56]:
# Persist the computed embeddings and labels so the slow ELMo step does not
# have to be repeated. The sentence vectors are stored with the singleton
# axis 1 left in place (the rank check squeezes it out before use).
np.save('data/snips_processed/ELMO-en',np.array(vecs_en))
np.save('data/snips_processed/ELMO-sv',np.array(vecs_sv))
np.save('data/snips_processed/labs',np.array(labs))

In [53]:
# Rank of the Swedish sentence-vector matrix after dropping the singleton
# axis 1. The recorded result (1024) equals the in_size passed to Baseline.
np.linalg.matrix_rank(np.squeeze(np.array(vecs_sv), axis=1))

1024