In [67]:
import json
import csv
import random
import torch
import torch.nn as nn
import numpy as np
import contractions
import unicodedata
import re
import time
from collections import defaultdict, Counter


In [59]:
dataset = []
with open('data/snips_processed/snips.csv', 'r') as f:
    reader = csv.reader(x.replace('\0', '') for x in f)
    for line in reader:
        dataset.append(line)
dataset = np.array(dataset)
topics = ['BookRestaurant','GetWeather', 'SearchScreeningEvent','RateBook', 'SearchCreativeWork', 'AddToPlaylist', 'PlayMusic']

In [60]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


def expand_contractions(text):
    return contractions.fix(text)



def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

@np.vectorize
def pre_process_text(document):
    
    # lower case
    document = document.lower()
    
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    
    # remove accented characters
    document = remove_accented_chars(document)
    
    # expand contractions    
    document = expand_contractions(document)
               
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  
        
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document

In [70]:
en_words = pre_process_text(dataset[:, 1])
sv_words = pre_process_text(dataset[:, 2])
t_stats = Counter(dataset[:, 0])

In [203]:
def get_util(lprop, lload, n):
    a = list(map(lambda x:(x - 1/7)**2, lprop))
    b = lload[lprop.index(max(lprop))]
    
    return sum([aa * bb for aa, bb in zip(a, lload)]) * (7/6)

In [204]:
def compute_per_word_label(labels, sentences):
    en_stats = defaultdict(dict)

    for label, line in zip(labels, sentences):
        seen = set()
        for w in line.split():
            if w not in seen:
                en_stats[w][label] = en_stats[w].get(label, 0) + 1
                en_stats[w]['n'] = en_stats[w].get('n', 0) + 1
                seen.add(w)
    utils = []
    for k in en_stats:
        label_prop = []
        label_load = []
        for t in topics:
            en_stats[k][t] = en_stats[k].get(t, 0)
            label_prop.append(en_stats[k][t]/en_stats[k]['n'])
            label_load.append(en_stats[k][t]/t_stats[t])
        en_stats[k]['lprop'] = label_prop
        en_stats[k]['lload'] = label_load
        
        utility = get_util(label_prop, label_load, en_stats[k]['n'])
        utils.append(utility)
        en_stats[k]['u'] = utility
        
    return en_stats, utils


In [205]:
en_stats, en_u = compute_per_word_label(dataset[:,0], en_words)

sv_stats,sv_u = compute_per_word_label(dataset[:,0], sv_words)



In [206]:
def get_top_util(en_stats, n = 20):
    return sorted(en_stats.items(), key = lambda k : k[1]['u'], reverse = True)[:n]



In [207]:
def get_present_strint(w, i=''):
    return '{:2} {} {} {:5.3} {:5.3} {:5.3}'.format(i+1,
                                                    w[0].center(10),
                                                    topics[w[1]['lprop'].index(max(w[1]['lprop']))].center(20),
                                                    w[1]['u'],
                                                    max(w[1]['lprop']),
                                                    w[1]['lload'][w[1]['lprop'].index(max(w[1]['lprop']))])

header = '{:2} {:10} {:20} {:5} {:5} {:5}'.format(' #', 'word'.center(10), 'label'.center(20), 'u', 'mplp', 'mpll')
print(header, ' | ', header)
for i,w,s in zip(range(30),get_top_util(en_stats),get_top_util(sv_stats)):
    print(get_present_strint(w,i),' | ', get_present_strint(s,i))
       

 #    word           label         u     mplp  mpll   |   #    word           label         u     mplp  mpll 
 1    add        AddToPlaylist     0.686 0.996 0.809  |   1    lagg       AddToPlaylist      0.58 0.995 0.683
 2    play         PlayMusic       0.629 0.924 0.883  |   2    boka       BookRestaurant    0.508 0.966 0.642
 3  playlist     AddToPlaylist     0.464 0.931  0.64  |   3 spellista     AddToPlaylist     0.441 0.942 0.592
 4    rate          RateBook       0.436 0.997 0.512  |   4   spela         PlayMusic       0.415 0.782 0.867
 5  weather        GetWeather      0.376 0.999  0.44  |   5    till       AddToPlaylist     0.388 0.775 0.829
 6   movie    SearchScreeningEvent  0.36 0.917 0.514  |   6    bord       BookRestaurant    0.306 0.994 0.361
 7 restaurant    BookRestaurant    0.313 0.993 0.372  |   7 restaurang    BookRestaurant    0.302 0.993 0.358
 8    book       BookRestaurant     0.28 0.764 0.622  |   8   filmer   SearchScreeningEvent 0.277 0.995 0.327
 9     be 

In [57]:
en_labels_stats = defaultdict(list)

for k, v in en_stats.items():
    if v['n'] < 30:
        continue
    for kk, vv in v.items():
        if kk == 'n':
            continue
        en_labels_stats[kk].append((vv/v['n'],v['n'], k))


In [58]:
len(en_labels_stats['BookRestaurant'])

203

In [42]:
en_stats['at']

{'BookRestaurant': 1014,
 'n': 2105,
 'GetWeather': 274,
 'SearchScreeningEvent': 764,
 'RateBook': 19,
 'SearchCreativeWork': 24,
 'AddToPlaylist': 8,
 'PlayMusic': 2}

In [181]:
def count_stats(labels, sentences):
    total = 0
    unique = 0
    seen = set()

    for label, line in zip(labels, sentences):
        for w in line.split():
            total += 1
            if w not in seen:
                unique += 1
                seen.add(w)
    return total, unique

In [185]:
count_stats(dataset[:,0], en_words),count_stats(dataset[:,0], sv_words), len(dataset)

((119529, 11282), (115914, 12852), 13784)

In [202]:
12852/11282

1.1391597234532884

In [200]:
lens_en = list(map(lambda x:len(x.split()), en_words))
lens_sv = list(map(lambda x:len(x.split()), sv_words))

np.max(lens_sv), sorted(lens_en)

print(sorted(sv_words, key = lambda x:len(x.split())))

['hitta', 'spela escapada', 'spela sivamani', 'hitta gamble', 'spela eve', 'spela attiotalet', 'tabell vanligen', 'spela darude', 'hitta forskrackelsen', 'spela piano', 'spelar latinamerikansk', 'spela primus', 'hitta sapfo', 'hitta forsprang', 'spela pop', 'visa smaken', 'visa nightcall', 'spela iheart', 'spela musik', 'spela nittiotalet', 'fimd ara', 'spela rytmisk', 'spela tune', 'hitta ranka', 'spela dansmusik', 'hitta kistan', 'spela pandora', 'hitta phalcon', 'spela musik', 'oppna youtube', 'spela extatisk', 'hitta goodreader', 'spela deezer', 'hitta bruden', 'play vara', 'priser biskopen', 'spela zvooq', 'spela humor', 'hitta tyrannen', 'priser drunkning', 'spela stilleben', 'hitta gesall', 'spela ballad', 'spela satir', 'hitta starcross', 'hitta vattenfall', 'spela necromancer', 'ge tvmoralserien', 'betygsatt apemaninom', 'spela iheart', 'spela spotify', 'hitta laserlight', 'betygsatt stentangenten', 'spela latar', 'spela tribal', 'spela spotify', 'spela iheart', 'hitta sommers

In [184]:
dataset[10, 2]

'betygsätt det aktuella albumet 2 poäng av 6'