In [1]:
import json
import csv
import random
import time
import torch
import torch.nn as nn
import numpy as np
# use this library https://github.com/facebookresearch/fastText/tree/master/python
import fastText

import contractions
import unicodedata
import re
from collections import defaultdict,Counter
from utils import *

from sklearn.cluster import KMeans

In [7]:
dataset, lab2id, id2lab =import_data('data/snips_processed/snipsf.csv')

In [3]:
#sv_model = fastText.load_model('data/cc.sv.300.bin')
#en_model = fastText.load_model('data/cc.en.300.bin')
fi_model = fastText.load_model('data/cc.fi.300.bin')

In [4]:
def sentence_vec(sentence, model, stats):
    result = np.zeros((1, 300))
    
    norm = 0
    
    for word in sentence.split():
        
        wv =  model.get_word_vector(word)
        word_util = stats[word].get('u', 0)
        result += wv*word_util
        norm += word_util
    return result/norm
    

In [9]:
def prepare_sentence_vecs(data, lang = 'en', preprocess=False):

    if lang == 'en':
        model = en_model
        slab = 1
    elif lang == 'sv':
        model = sv_model
        slab = 2
    elif lang == 'fi':
        model = fi_model
        slab = 3
    else:
        raise RuntimeError('lang is not supported')
    
    vectors = []
    
    sents = pre_process_text(data[:, slab])  if preprocess else data[:, slab]
    
    stats, utils = compute_per_word_label(data[:,0], sents)

    vecs = list(map(lambda x:sentence_vec(x, model, stats), sents))
        
    return vecs

In [10]:
#vecs_en_uw = prepare_sentence_vecs(dataset, lang='en', preprocess=True)
#print('en-done')
#vecs_sv_uw = prepare_sentence_vecs(dataset, lang='sv', preprocess=True)
#print('sv-done')

vecs_fi_uw = prepare_sentence_vecs(dataset, lang='fi', preprocess=True)

labs = prepare_labs(dataset[:,0], lab2id)

vecs_en = list(np.load('data/snips_processed/FastText-en.npy'))
vecs_sv = list(np.load('data/snips_processed/FastText-sv.npy'))



In [11]:
def full_train_model(train_vecs, train_labs, test_vecs, test_labs, verbose = False, runs = 4001):
    net = Baseline(in_size=train_vecs[0].shape[1])
    optimizer = torch.optim.Adam(net.parameters())
    criterion = torch.nn.NLLLoss()
    
    
    tvecs = torch.tensor(train_vecs).float()
    tvecst = torch.tensor(test_vecs).float()
    tlabs = torch.tensor(train_labs)
    tlabst = torch.tensor(test_labs)
        
    t = time.time()
    for i in range(6001):
        loss = train(net, criterion, optimizer, tlabs, tvecs)
        if verbose and not i% 100:
            eval_loss, acc = evaluate(net, tlabst, tvecst, criterion)
            print('#{:3d}, {:5d} sec. train loss: {:.7f}, eval loss: {:.4f}, acc = {:.3f}'.format(i, int(time.time() - t), loss, eval_loss, acc))
    
    eval_loss, acc = evaluate(net, tlabst, tvecst, criterion)
    return acc


In [12]:
def cv(vecs, labs, folds = 5):
    
    delims = np.arange(0, len(vecs), len(vecs)//folds)
    results = []
    t = time.time()
    for i in range(folds):
        results.append(
            full_train_model(vecs[:delims[i]] + vecs[delims[i+1]:],
                             labs[:delims[i]] + labs[delims[i+1]:],
                             vecs[delims[i] : delims[i+1]],
                             labs[delims[i] : delims[i+1]],
                                  False, runs = 6001))
        print('#{:3d}, {:5d} sec. acc = {:.3f}'.format(i, int(time.time() - t), results[-1]))

    return(sum(results)/len(results))   
    

In [43]:
cv(vecs_en_uw, labs)

#  0,   179 sec. acc = 0.966
#  1,   469 sec. acc = 0.962
#  2,  1436 sec. acc = 0.964
#  3,  1678 sec. acc = 0.961
#  4,  1770 sec. acc = 0.959


0.9624818577648766

In [44]:
cv(vecs_sv_uw, labs)

#  0,    60 sec. acc = 0.940
#  1,   124 sec. acc = 0.944
#  2,   198 sec. acc = 0.948
#  3,   277 sec. acc = 0.941
#  4,   347 sec. acc = 0.939


0.9423802612481857

In [161]:
cv(vecs_en, labs)

#  0,    61 sec. acc = 0.914
#  1,   123 sec. acc = 0.930
#  2,   181 sec. acc = 0.919
#  3,   246 sec. acc = 0.921
#  4,   313 sec. acc = 0.912


0.9193033381712628

In [15]:
cv(vecs_fi_uw, labs)

#  0,    92 sec. acc = 0.942
#  1,   184 sec. acc = 0.947
#  2,   275 sec. acc = 0.944
#  3,   365 sec. acc = 0.943
#  4,   420 sec. acc = 0.941


0.9432510885341074

In [16]:
np.save('data/snips_processed/FastText-uw-fi',np.array(vecs_fi_uw))
#np.save('data/snips_processed/FastText-uw-en',np.array(vecs_en_uw))
#np.save('data/snips_processed/FastText-uw-sv',np.array(vecs_sv_uw))

Как написано в статье https://openreview.net/forum?id=SyK00v5xx

После получения взвешенной суммы можно отфильтровать составляющую главной компоненты. Но получается что не помогает. 


In [108]:
def remove_prim_component(vecs):
    u, s, v = np.linalg.svd(np.squeeze(np.array(vecs), axis=1), full_matrices=False)
    uut = np.array([u[0]]).T @ np.array([u[0]])
    new_v = []
    for el in vecs:
        new_v.append(el - (uut @ el.T).T)
    return new_v

In [110]:
vecs_en_pcr  = remove_prim_component(vecs_en)

In [111]:
cv(vecs_en_pcr, labs)

#  0,    65 sec. acc = 0.965
#  1,   124 sec. acc = 0.962
#  2,   182 sec. acc = 0.964
#  3,   242 sec. acc = 0.961
#  4,   302 sec. acc = 0.960


0.9624092888243831

#### Переход к пространству расстояний до центроидов


In [151]:
def transform_to_kmeans(vecs, clusters=50):
    kmeans_en = KMeans(n_clusters=clusters).fit(np.squeeze(np.array(vecs), axis=1))
    centroids = kmeans_en.cluster_centers_
    new_vecs = []
    for old_v in vecs:
        new_v = np.array([list(map(np.linalg.norm, centroids - old_v))])
        new_vecs.append(new_v)
        
    return new_vecs

In [156]:
vecs_en_kmeans  = transform_to_kmeans(vecs_en)

In [157]:
cv(vecs_en_kmeans, labs)

#  0,    27 sec. acc = 0.731
#  1,    50 sec. acc = 0.729
#  2,    89 sec. acc = 0.722
#  3,   113 sec. acc = 0.746
#  4,   139 sec. acc = 0.714


0.7285195936139333

In [164]:
vecs_sv_kmeans  = transform_to_kmeans(vecs_sv)

In [166]:
cv(vecs_sv_kmeans, labs)

#  0,    25 sec. acc = 0.602
#  1,    63 sec. acc = 0.613
#  2,    98 sec. acc = 0.621
#  3,   135 sec. acc = 0.603
#  4,   168 sec. acc = 0.621


0.6121190130624092

In [167]:
vecs_en_kmeans100  = transform_to_kmeans(vecs_sv, clusters=100)

In [168]:
cv(vecs_en_kmeans100, labs)

#  0,    44 sec. acc = 0.674
#  1,    89 sec. acc = 0.677
#  2,   125 sec. acc = 0.689
#  3,   157 sec. acc = 0.674
#  4,   188 sec. acc = 0.687


0.6800435413642961

In [186]:
max(np.linalg.norm(vecs_en_uw, axis  = -1))

array([3.61104783])