In [166]:
import json
import csv
import random
import torch
import torch.nn as nn
import numpy as np
import contractions
import unicodedata
import re
import time


In [173]:
dataset = []
with open('data/snips_processed/snips.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        dataset.append(line)
dataset = np.array(dataset)

testset = dataset[10000 :]

trainset = dataset[:10000]

train_sent_en = trainset[:,1]
train_sent_sv = trainset[:,2]

train_lab = trainset[:,0]

test_sent_en = testset[:,1]
test_sent_sv = testset[:,2]

test_lab = testset[:,0]

labels = list(set(test_lab))
lab2id = {}
id2lab = {}

for i in range(len(labels)):
    lab2id[labels[i]] = i
    id2lab[i] = labels[i]

In [175]:
id2lab

{0: 'SearchScreeningEvent',
 1: 'RateBook',
 2: 'AddToPlaylist',
 3: 'PlayMusic',
 4: 'GetWeather',
 5: 'SearchCreativeWork',
 6: 'BookRestaurant'}

In [176]:
from elmoformanylangs import Embedder

en_model = Embedder('models/144')
sv_model = Embedder('models/173')

2019-04-30 11:50:20,047 INFO: char embedding size: 4939
2019-04-30 11:50:21,354 INFO: word embedding size: 167642
2019-04-30 11:50:31,628 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(167642, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(4939, 50, padding_idx=4936)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_fe

In [10]:
sents = [['what', 'is','my','name'],['my','name','is','Jan']]
# the list of lists which store the sentences 
# after segment if necessary.
a = en_model.sents2elmo(sents)
a[0].shape

2019-04-17 13:14:28,443 INFO: 1 batches, avg len: 6.0


(4, 1024)

In [34]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


def expand_contractions(text):
    return contractions.fix(text)



def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

@np.vectorize
def pre_process_text(document):
    
    # lower case
    document = document.lower()
    
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    
    # remove accented characters
    document = remove_accented_chars(document)
    
    # expand contractions    
    document = expand_contractions(document)
               
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  
        
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document

In [35]:
pre_process_text(test_sent_en)

array(['i want to rate the turbulent term of tyke tiler a',
       'when is the great question playing at the closest movie house',
       'will mondamin be hot on july', ...,
       'give me the movie times for fox theatres',
       'can i get the movie times for fox theatres',
       'what movies are scheduled in the neighbourhood'], dtype='<U127')

In [35]:
def sentence_vec(sentence, model):
    result = np.zeros((1, 1024))
    sentence = pre_process_te(sentence)
    sentence = list(map(lambda x: x.split(), sentence
    
    
    for word in sentence:
        result += model.get_word_vector(word.lower())
    return result/len(sentence)
    

In [136]:
def prepare_labs(labs):
    out = []
    for lab in labs:
        out.append(lab2id[lab])
    return out

def prepare_sentence_vecs(sents, lang = 'en'):
    
    if lang == 'en':
        model = en_model
        slab = 1
    elif lang == 'sv':
        model = sv_model
        slab = 2
    else:
        raise RuntimeError('lang is not supported')
    vectors = []
    
    sents = pre_process_text(sents)
    sents = list(map(lambda x: x.split(), sents))
    vecs = model.sents2elmo(sents)
    vecs = list(map(lambda x:[x.mean(axis=0)], vecs))
        
    return vecs

In [133]:
class Baseline(nn.Module):
    def __init__(self, in_size = 1024, out_size = 7):
        super(Baseline, self).__init__()

        self.W = nn.Linear(in_size, 7)
        self.out = nn.LogSoftmax(2)
        
    def forward(self, x):
        x = self.W(x)
        return self.out(x)



In [194]:
def train(model, criterion, optimizer, labels, vectors):
    model.zero_grad()
    loss = 0
    
    #vectors = torch.tensor(vectors).float()
    #labels = torch.tensor(labels)
    
    model_out = model.forward(vectors)
    loss += criterion(model_out[:,0], labels)
    
    loss.backward()
    optimizer.step()
    
    return loss.item()/len(labels)

In [195]:
def eval(model, labels, vectors):
    with torch.no_grad():
        #vectors = torch.tensor(vectors).float()
        #labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1

                
        loss = criterion(model_out[:,0], labels)
        return loss.item(), right/len(model_out)
    
        

In [151]:
#vecs, labss = prepare_sentence_vecs(train_sent_en), prepare_labs(train_lab)
#vecst, labst = prepare_sentence_vecs(test_sent_en), prepare_labs(test_lab)


In [201]:
net = Baseline()
optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()

tvecs = torch.tensor(vecs).float()
tvecst = torch.tensor(vecst).float()
tlabs = torch.tensor(labs)
tlabst = torch.tensor(labst)

t = time.time()
for i in range(4001):
    loss = train(net, criterion, optimizer, tlabs, tvecs)
    if not i% 100:
        eval_loss, acc = eval(net, tlabst, tvecst)
        print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc))

    

#  0,     0 sec. train loss: 0.0002009, eval loss: 1.9417, acc = 0.141
#100,     2 sec. train loss: 0.0000289, eval loss: 2.0385, acc = 0.607
#200,     4 sec. train loss: 0.0000158, eval loss: 2.0120, acc = 0.691
#300,     6 sec. train loss: 0.0000112, eval loss: 2.0577, acc = 0.699
#400,     9 sec. train loss: 0.0000088, eval loss: 2.1031, acc = 0.702
#500,    11 sec. train loss: 0.0000074, eval loss: 2.1404, acc = 0.705
#600,    13 sec. train loss: 0.0000064, eval loss: 2.1697, acc = 0.706
#700,    15 sec. train loss: 0.0000056, eval loss: 2.1916, acc = 0.708
#800,    18 sec. train loss: 0.0000050, eval loss: 2.2073, acc = 0.709
#900,    20 sec. train loss: 0.0000046, eval loss: 2.2176, acc = 0.710
#1000,    22 sec. train loss: 0.0000042, eval loss: 2.2231, acc = 0.710
#1100,    24 sec. train loss: 0.0000038, eval loss: 2.2246, acc = 0.711
#1200,    26 sec. train loss: 0.0000035, eval loss: 2.2226, acc = 0.712
#1300,    29 sec. train loss: 0.0000033, eval loss: 2.2176, acc = 0.712
#1

In [199]:
net_sv = Baseline()
optimizer = torch.optim.Adam(net_sv.parameters())
criterion = torch.nn.NLLLoss()
#labs, vecs = prepare_pairs(trainset, lang = 'en')
#labst, vecst = prepare_pairs(testset, lang = 'en')
sv_vecs_train = torch.tensor(sv_vecs[:10000]).float()
sv_vecs_test = torch.tensor(sv_vecs[10000:]).float()
train_labs = torch.tensor(prepare_labs(train_lab))
test_labs = torch.tensor(prepare_labs(test_lab))

t = time.time()
for i in range(4001):
    loss = train(net_sv, criterion, optimizer, train_labs, sv_vecs_train)
    if not i% 100:
        eval_loss, acc = eval(net_sv, test_labs, sv_vecs_test)
        print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc))
        
    

#  0,     0 sec. train loss: 0.0001956, eval loss: 1.9371, acc = 0.193
#100,     2 sec. train loss: 0.0001027, eval loss: 1.0330, acc = 0.743
#200,     4 sec. train loss: 0.0000766, eval loss: 0.7820, acc = 0.792
#300,     6 sec. train loss: 0.0000642, eval loss: 0.6639, acc = 0.818
#400,     8 sec. train loss: 0.0000566, eval loss: 0.5932, acc = 0.832
#500,    10 sec. train loss: 0.0000514, eval loss: 0.5451, acc = 0.848
#600,    13 sec. train loss: 0.0000476, eval loss: 0.5098, acc = 0.856
#700,    15 sec. train loss: 0.0000445, eval loss: 0.4824, acc = 0.864
#800,    17 sec. train loss: 0.0000420, eval loss: 0.4602, acc = 0.870
#900,    19 sec. train loss: 0.0000399, eval loss: 0.4419, acc = 0.873
#1000,    22 sec. train loss: 0.0000381, eval loss: 0.4264, acc = 0.877
#1100,    24 sec. train loss: 0.0000365, eval loss: 0.4130, acc = 0.880
#1200,    26 sec. train loss: 0.0000351, eval loss: 0.4014, acc = 0.883
#1300,    29 sec. train loss: 0.0000338, eval loss: 0.3911, acc = 0.886
#1

In [302]:
def eval_visual(model, labels, vectors):
    with torch.no_grad():
        vectors = torch.tensor(vectors).float()
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        wrong = 0
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1
            else:
                print(id2lab[predicted], id2lab[true])
                wrong +=1
                
        print('{} out of {} = {}'.format(right, right+wrong, right/(right+wrong)))

In [269]:
eval_visual(net, labst, vecst)

ComparePlaces GetPlaceDetails
SearchPlace RequestRide
BookRestaurant GetWeather
GetPlaceDetails GetWeather
RequestRide GetTrafficInformation
SearchPlace GetPlaceDetails
GetTrafficInformation GetDirections
GetPlaceDetails GetWeather
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections SearchPlace
SearchPlace ShareETA
SearchPlace GetPlaceDetails
GetPlaceDetails GetTrafficInformation
GetTrafficInformation GetWeather
GetWeather ShareCurrentLocation
GetPlaceDetails ComparePlaces
SearchPlace BookRestaurant
GetPlaceDetails GetTrafficInformation
SearchPlace RequestRide
ComparePlaces GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetPlaceDetails GetWeather
RequestRide GetWeather
BookRestaurant GetWeather
BookRestaurant GetTrafficInformation
50 out of 78 = 0.6410256410256411


In [21]:
trainset

array([['RateBook', 'rate the current textbook zero out of 6 points'],
       ['BookRestaurant',
        'restaurant in Elberta for alma, deana and olga at 18:49:20 that serves tsipouro'],
       ['SearchScreeningEvent',
        'What movies are currently at Star Theatres?'],
       ...,
       ['PlayMusic', 'Please play Different Slanguages by Fred Labour.'],
       ['PlayMusic', 'play some King Tubby from the eighties'],
       ['AddToPlaylist',
        'add M-CABI to the playlist named Pre-Party R&B Jams']],
      dtype='<U186')

In [170]:
np.linalg.matrix_rank(np.squeeze(np.array(vecs), axis=1))

1024

In [179]:
svp = pre_process_text(dataset[:, 2])

In [181]:
sv_vecs = prepare_sentence_vecs(dataset[:,2])

2019-04-30 11:54:03,067 INFO: 216 batches, avg len: 10.4
2019-04-30 11:55:08,015 INFO: Finished 1000 sentences.
2019-04-30 11:56:04,033 INFO: Finished 2000 sentences.
2019-04-30 11:56:57,281 INFO: Finished 3000 sentences.
2019-04-30 11:58:15,155 INFO: Finished 4000 sentences.
2019-04-30 11:59:09,727 INFO: Finished 5000 sentences.
2019-04-30 11:59:59,573 INFO: Finished 6000 sentences.
2019-04-30 12:01:17,737 INFO: Finished 7000 sentences.
2019-04-30 12:02:22,889 INFO: Finished 8000 sentences.
2019-04-30 12:03:17,264 INFO: Finished 9000 sentences.
2019-04-30 12:04:13,081 INFO: Finished 10000 sentences.
2019-04-30 12:05:06,521 INFO: Finished 11000 sentences.
2019-04-30 12:06:10,445 INFO: Finished 12000 sentences.
2019-04-30 12:07:10,576 INFO: Finished 13000 sentences.


In [192]:
len(train_labs), len(sv_vecs_train)

(10000, 10000)

In [183]:
#np.save('data/snips_processed/ELMO-sv',np.array(sv_vecs))

In [204]:
np.linalg.matrix_rank(np.squeeze(np.array(sv_vecs), axis=1))

694