In [166]:
import json
import csv
import random
import torch
import torch.nn as nn
import numpy as np
import contractions
import unicodedata
import re
import time


In [172]:
trainset = []
with open('data/snips_processed/snips.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        trainset.append(line)
trainset = np.array(trainset)

testset = trainset[10000 :]

trainset = trainset[:10000]

train_sent_en = trainset[:,1]
#train_sent_sv = trainset[:,2]
train_lab = trainset[:,0]

testset = []
with open('data/snips_processed/snips.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        testset.append(line)
testset = np.array(testset)

test_sent_en = testset[:,1]
#test_sent_sv = testset[:,2]
test_lab = testset[:,0]




labels = list(set(test_lab))
lab2id = {}
id2lab = {}

for i in range(len(labels)):
    lab2id[labels[i]] = i
    id2lab[i] = labels[i]

In [95]:
id2lab

{0: 'SearchScreeningEvent',
 1: 'RateBook',
 2: 'AddToPlaylist',
 3: 'PlayMusic',
 4: 'GetWeather',
 5: 'SearchCreativeWork',
 6: 'BookRestaurant'}

In [31]:
from elmoformanylangs import Embedder

en_model = Embedder('models/144')

2019-04-17 13:25:04,017 INFO: char embedding size: 4939
2019-04-17 13:25:04,831 INFO: word embedding size: 167642
2019-04-17 13:25:12,257 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(167642, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(4939, 50, padding_idx=4936)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_fe

In [10]:
sents = [['what', 'is','my','name'],['my','name','is','Jan']]
# the list of lists which store the sentences 
# after segment if necessary.
a = en_model.sents2elmo(sents)
a[0].shape

2019-04-17 13:14:28,443 INFO: 1 batches, avg len: 6.0


(4, 1024)

In [34]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


def expand_contractions(text):
    return contractions.fix(text)



def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

@np.vectorize
def pre_process_text(document):
    
    # lower case
    document = document.lower()
    
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    
    # remove accented characters
    document = remove_accented_chars(document)
    
    # expand contractions    
    document = expand_contractions(document)
               
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  
        
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document

In [35]:
pre_process_text(test_sent_en)

array(['i want to rate the turbulent term of tyke tiler a',
       'when is the great question playing at the closest movie house',
       'will mondamin be hot on july', ...,
       'give me the movie times for fox theatres',
       'can i get the movie times for fox theatres',
       'what movies are scheduled in the neighbourhood'], dtype='<U127')

In [35]:
def sentence_vec(sentence, model):
    result = np.zeros((1, 1024))
    sentence = pre_process_te(sentence)
    sentence = list(map(lambda x: x.split(), sentence
    
    
    for word in sentence:
        result += model.get_word_vector(word.lower())
    return result/len(sentence)
    

In [136]:
def prepare_labs(labs):
    out = []
    for lab in labs:
        out.append(lab2id[lab])
    return out

def prepare_sentence_vecs(sents, lang = 'en'):
    
    if lang == 'en':
        model = en_model
        slab = 1
    elif lang == 'sv':
        model = sv_model
        slab = 2
    else:
        raise RuntimeError('lang is not supported')
    vectors = []
    
    sents = pre_process_text(sents)
    sents = list(map(lambda x: x.split(), sents))
    vecs = model.sents2elmo(sents)
    vecs = list(map(lambda x:[x.mean(axis=0)], vecs))
        
    return vecs

In [133]:
class Baseline(nn.Module):
    def __init__(self, in_size = 1024, out_size = 7):
        super(Baseline, self).__init__()

        self.W = nn.Linear(in_size, 7)
        self.out = nn.LogSoftmax(2)
        
    def forward(self, x):
        x = self.W(x)
        return self.out(x)



In [149]:
def train(model, criterion, optimizer, labels, vectors):
    model.zero_grad()
    loss = 0
    
    vectors = torch.tensor(vectors).float()
    labels = torch.tensor(labels)
    
    model_out = model.forward(vectors)
    loss += criterion(model_out[:,0], labels)
    
    loss.backward()
    optimizer.step()
    
    return loss.item()/len(labels)

In [150]:
def eval(model, labels, vectors):
    with torch.no_grad():
        vectors = torch.tensor(vectors).float()
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1

                
        loss = criterion(model_out[:,0], labels)
        return loss.item(), right/len(model_out)
    
        

In [151]:
#vecs, labss = prepare_sentence_vecs(train_sent_en), prepare_labs(train_lab)
#vecst, labst = prepare_sentence_vecs(test_sent_en), prepare_labs(test_lab)


In [168]:
net = Baseline()
optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()

t = time.time()
for i in range(4001):
    loss = train(net, criterion, optimizer, labs, vecs)
    if not i% 100:
        eval_loss, acc = eval(net, labst, vecst)
        print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc))

    

#  0,     2 sec. train loss: 0.0002024, eval loss: 1.9434, acc = 0.156
#100,   116 sec. train loss: 0.0000291, eval loss: 2.0364, acc = 0.601
#200,   222 sec. train loss: 0.0000158, eval loss: 2.0020, acc = 0.691
#300,   324 sec. train loss: 0.0000112, eval loss: 2.0452, acc = 0.699
#400,   425 sec. train loss: 0.0000088, eval loss: 2.0888, acc = 0.702
#500,   557 sec. train loss: 0.0000074, eval loss: 2.1245, acc = 0.705
#600,   672 sec. train loss: 0.0000064, eval loss: 2.1521, acc = 0.707
#700,   778 sec. train loss: 0.0000056, eval loss: 2.1725, acc = 0.708
#800,   883 sec. train loss: 0.0000050, eval loss: 2.1866, acc = 0.709
#900,   986 sec. train loss: 0.0000046, eval loss: 2.1952, acc = 0.710
#1000,  1090 sec. train loss: 0.0000042, eval loss: 2.1992, acc = 0.710
#1100,  1193 sec. train loss: 0.0000038, eval loss: 2.1992, acc = 0.711
#1200,  1297 sec. train loss: 0.0000035, eval loss: 2.1957, acc = 0.712
#1300,  3699 sec. train loss: 0.0000033, eval loss: 2.1894, acc = 0.712
#1

In [41]:
net = Baseline()
optimizer = torch.optim.Adam(net.parameters())
criterion = torch.nn.NLLLoss()
labs, vecs = prepare_pairs(trainset, lang = 'en')
labst, vecst = prepare_pairs(testset, lang = 'en')


for i in range(4001):
    loss = train(net, criterion, optimizer, labs, vecs)
    if not i% 100:
        eval_loss, acc = eval(net, labst, vecst)
        print('#{:4d}, train loss: {:3f}, eval loss: {:3f}, acc = {:3f}'.format(i, loss, eval_loss, acc))
        
    

#   0, train loss: 0.000214, eval loss: 2.296857, acc = 0.148974
# 100, train loss: 0.000048, eval loss: 0.526288, acc = 0.851026
# 200, train loss: 0.000037, eval loss: 0.406894, acc = 0.851026
# 300, train loss: 0.000033, eval loss: 0.360803, acc = 0.853185
# 400, train loss: 0.000030, eval loss: 0.331020, acc = 0.862540
# 500, train loss: 0.000028, eval loss: 0.309052, acc = 0.870817
# 600, train loss: 0.000027, eval loss: 0.292021, acc = 0.879813
# 700, train loss: 0.000026, eval loss: 0.278468, acc = 0.890248
# 800, train loss: 0.000025, eval loss: 0.267494, acc = 0.898884
# 900, train loss: 0.000024, eval loss: 0.258484, acc = 0.902123
#1000, train loss: 0.000023, eval loss: 0.251004, acc = 0.903922
#1100, train loss: 0.000023, eval loss: 0.244733, acc = 0.908960
#1200, train loss: 0.000022, eval loss: 0.239429, acc = 0.911479
#1300, train loss: 0.000022, eval loss: 0.234910, acc = 0.912199
#1400, train loss: 0.000021, eval loss: 0.231031, acc = 0.914358
#1500, train loss: 0.0000

In [302]:
def eval_visual(model, labels, vectors):
    with torch.no_grad():
        vectors = torch.tensor(vectors).float()
        labels = torch.tensor(labels)
    
        model_out = model.forward(vectors)
        right = 0
        wrong = 0
        for i  in range(len(model_out)):
            k, v = model_out[i].topk(1)
            predicted, true = v.item(), labels[i].item()
            if predicted == true:
                right +=1
            else:
                print(id2lab[predicted], id2lab[true])
                wrong +=1
                
        print('{} out of {} = {}'.format(right, right+wrong, right/(right+wrong)))

In [269]:
eval_visual(net, labst, vecst)

ComparePlaces GetPlaceDetails
SearchPlace RequestRide
BookRestaurant GetWeather
GetPlaceDetails GetWeather
RequestRide GetTrafficInformation
SearchPlace GetPlaceDetails
GetTrafficInformation GetDirections
GetPlaceDetails GetWeather
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections SearchPlace
SearchPlace ShareETA
SearchPlace GetPlaceDetails
GetPlaceDetails GetTrafficInformation
GetTrafficInformation GetWeather
GetWeather ShareCurrentLocation
GetPlaceDetails ComparePlaces
SearchPlace BookRestaurant
GetPlaceDetails GetTrafficInformation
SearchPlace RequestRide
ComparePlaces GetPlaceDetails
GetWeather GetPlaceDetails
GetDirections GetPlaceDetails
GetWeather GetPlaceDetails
GetPlaceDetails GetWeather
RequestRide GetWeather
BookRestaurant GetWeather
BookRestaurant GetTrafficInformation
50 out of 78 = 0.6410256410256411


In [21]:
trainset

array([['RateBook', 'rate the current textbook zero out of 6 points'],
       ['BookRestaurant',
        'restaurant in Elberta for alma, deana and olga at 18:49:20 that serves tsipouro'],
       ['SearchScreeningEvent',
        'What movies are currently at Star Theatres?'],
       ...,
       ['PlayMusic', 'Please play Different Slanguages by Fred Labour.'],
       ['PlayMusic', 'play some King Tubby from the eighties'],
       ['AddToPlaylist',
        'add M-CABI to the playlist named Pre-Party R&B Jams']],
      dtype='<U186')

In [160]:
net.forward(torch.tensor(vecs))[:,0].shape

torch.Size([10000, 7])

In [170]:
np.linalg.matrix_rank(np.squeeze(np.array(vecs), axis=1))

1024