In [3]:
import pandas as pd
from helpers.word2vec import visualize_most_frequent, visualize_words, full_train_model, get_vector_representations, distance_sum_metric, p_at_k_metric
from gensim.models import Word2Vec

# Interleaved

## Trénování modelu

In [4]:
# Load the interleaved corpus and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call -- TODO confirm).
interleaved = pd.read_csv("corpus/eurlex/interleaved.csv").drop(columns=['Unnamed: 0'])

In [None]:
%%time
window = 5
vector_size = 100
model_type = 'cbow'

interleaved_model = Word2Vec(window=window, min_count=3, negative=1, workers=4, sg=1 if 'sg' else 0, vector_size=vector_size)
full_train_model(interleaved_model, interleaved)
print("trained_models/eurlex_simple_joined_{:s}_{:d}x{:d}".format(model_type, window, vector_size))
interleaved_model.save("trained_models/eurlex_simple_joined_{:s}_{:d}x{:d}".format(model_type, window, vector_size))

# Simple joined

## Trénování modelu

In [15]:
# Load the simple-joined corpus and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call -- TODO confirm).
simple_joined = pd.read_csv('corpus/eurlex/simple_joined.csv').drop(columns=['Unnamed: 0'])

In [28]:
%%time
window = 75
vector_size = 50
simple_joined_model = Word2Vec(window=window, min_count=1, negative=1, workers=4, sg=1, vector_size=vector_size)
full_train_model(simple_joined_model, simple_joined)
simple_joined_model.save("trained_models/eurlex_simple_joined_sg_{:d}x{:d}".format(window, vector_size))

Iteration 0/4
Iteration 1/4
Iteration 2/4
Iteration 3/4
CPU times: user 25min 16s, sys: 2.45 s, total: 25min 18s
Wall time: 8min 13s


# Vizualizace výsledků pomocí PCA a t-SNE

In [6]:
# Reload a previously saved interleaved model from disk so the cells below do
# not depend on the training cell having run in this kernel session.
# (The file name suggests CBOW, window 5, vector size 100 -- TODO confirm.)
interleaved_model = Word2Vec.load('trained_models/eurlex_interleaved_cbow_5x100')

In [7]:
# Export the model's vector representations (as returned by the project helper
# get_vector_representations) to a CSV file.
get_vector_representations(interleaved_model).to_csv('corpus/eurlex/vector_representations.csv')

In [8]:
# Evaluation vocabulary. The two lists are paired by position: all_tests looks
# up en_words[i] and cz_words[i] with the same index i and treats them as one
# English/Czech pair. Entries look like word stems rather than full words --
# presumably matching the corpus preprocessing; TODO confirm.
en_words = ['access', 'authoris', 'capit', 'committe', 'consum', 'content', 'deadlin', 'document', 'expenditur', 'full', 'geograph', 'group', 'growth', 'hear', 'instrument', 'investig', 'label', 'list', 'loan', 'network', 'opinion', 'partnership', 'pension', 'price', 'sale', 'secret', 'solut', 'trade', 'type', 'wast']
cz_words = ['přístup', 'povolen', 'kapitál', 'výbor', 'spotřebitel', 'obsah', 'lhůt', 'dokument', 'výdaj', 'pln', 'zeměpisn', 'skup', 'růst', 'slyšen', 'nástroj', 'šetřen', 'označen', 'seznam', 'úvěr', 'síť', 'stanovisk', 'partnerst', 'důchod', 'cen', 'prodej', 'tajemstv', 'řešen', 'obchod', 'typ', 'odpad']

In [10]:
# Frequency tables for the English and Czech parts of the corpus. all_tests
# reads the list of English tokens from the 'Unnamed: 0' column of en_freqs,
# so that column must be kept as loaded here.
en_freqs = pd.read_csv('corpus/eurlex/en_freqs.csv')
cz_freqs = pd.read_csv('corpus/eurlex/cz_freqs.csv')

In [14]:
# Plot the model's embeddings using the English and Czech frequency tables.
# NOTE(review): perplexity is presumably forwarded to t-SNE inside the helper
# -- confirm in helpers.word2vec.
visualize_most_frequent(interleaved_model, en_freqs, cz_freqs, perplexity=30)

In [13]:
# Plot the embeddings of the hand-picked English/Czech evaluation words.
# NOTE(review): perplexity is presumably forwarded to t-SNE inside the helper
# -- confirm in helpers.word2vec.
visualize_words(interleaved_model, en_words, cz_words, perplexity=15)

# Evaluace úspěšnosti modelu

In [5]:
# Evaluate every saved simple-joined model in the (model type, vector size,
# context window) grid and print both metrics for each configuration.
models = ['cbow']
for model_type, vector_size, window in (
    (m, v, w)
    for m in models
    for v in range(200, 300, 100)
    for w in range(100, 175, 25)
):
    model = Word2Vec.load(
        "trained_models/eurlex_simple_joined_{:s}_{:d}x{:d}".format(model_type, window, vector_size)
    )
    print(
        "-----Vector Size: {:d}, Context Window: {:d}, Model Type: {:s}-----".format(
            vector_size, window, model_type
        )
    )

    distance_sum = distance_sum_metric(model, en_words, cz_words, en_freqs)
    print("Distance sum: {:f}".format(distance_sum))

    p_at_k = p_at_k_metric(model, en_words, cz_words, en_freqs, k=10)
    print("P@10: {:f}".format(p_at_k))

-----Vector Size: 200, Context Window: 100, Model Type: cbow-----
P@10: 0.833333
-----Vector Size: 200, Context Window: 125, Model Type: cbow-----
P@10: 0.800000
-----Vector Size: 200, Context Window: 150, Model Type: cbow-----
P@10: 0.900000


In [7]:
# Evaluate one saved interleaved model, selected by the three settings below,
# and print both metrics for it.
model_type, window, vector_size = 'cbow', 8, 200

model = Word2Vec.load(
    "trained_models/eurlex_interleaved_{:s}_{:d}x{:d}".format(model_type, window, vector_size)
)
print(
    "-----Vector Size: {:d}, Context Window: {:d}, Model Type: {:s}-----".format(
        vector_size, window, model_type
    )
)

distance_sum = distance_sum_metric(model, en_words, cz_words, en_freqs)
print("Distance sum: {:f}".format(distance_sum))

p_at_k = p_at_k_metric(model, en_words, cz_words, en_freqs, k=10)
print("P@10: {:f}".format(p_at_k))

-----Vector Size: 200, Context Window: 8, Model Type: cbow-----
P@10: 0.800000


In [5]:
from helpers.word2vec import get_single_representation, cosine_distance
def all_tests(model, en_words, cz_words, en_freqs):
    """Print P@5, P@10 and a normalised distance sum for paired word lists.

    For every index i the Czech word cz_words[i] is compared, by cosine
    distance, against the representation of every English token listed in
    en_freqs. The pair counts as a hit for P@k when en_words[i] is among the
    k English tokens with the smallest distance to the Czech word. The
    distance sum adds, per pair, the distance between the two paired words
    divided by the largest distance from the Czech word to any English token.

    Args:
        model: Trained model; passed to the project helpers and read for
            ``vector_size``.
        en_words: English evaluation words.
        cz_words: Czech evaluation words, paired with ``en_words`` by index.
        en_freqs: DataFrame whose 'Unnamed: 0' column lists the English
            tokens to search among.

    Returns:
        None. The three metrics are printed.

    Raises:
        ValueError: If ``en_words`` is empty (the precision ratios would be
            undefined).
    """
    # Fail fast, before the expensive representation export, instead of
    # hitting a ZeroDivisionError at the very end.
    if len(en_words) == 0:
        raise ValueError("en_words must contain at least one word to evaluate")

    representations = get_vector_representations(model)
    all_en_words = en_freqs['Unnamed: 0'].values.tolist()
    en_representations = representations.loc[representations['token'].isin(all_en_words)]

    # Columns 1..vector_size are read as the vector components of each row
    # (assumes that is how get_vector_representations lays them out -- TODO
    # confirm). The list is the same for every row and word, so build it once.
    vector_columns = list(range(1, model.vector_size + 1))

    distance_sum = 0
    success_5 = 0
    success_10 = 0
    for i, en_word in enumerate(en_words):
        cz_repr = get_single_representation(model, cz_words[i])
        en_repr = get_single_representation(model, en_word)

        # Distance from the Czech word to every English token.
        distances = en_representations.apply(
            lambda row: cosine_distance(row[vector_columns].tolist(), cz_repr),
            axis=1,
        ).rename('distances')

        # Normalise by the farthest English token for this Czech word.
        distance_sum += cosine_distance(en_repr, cz_repr) / distances.max()

        merged = pd.concat([distances, en_representations], axis=1)
        # nsmallest returns rows in ascending order, so the 5 nearest tokens
        # are the first five of the 10 nearest; one selection serves both.
        neighbors_10 = merged.nsmallest(10, 'distances')['token'].tolist()
        neighbors_5 = neighbors_10[:5]
        if en_word in neighbors_5:
            success_5 += 1
        if en_word in neighbors_10:
            success_10 += 1

    print("P@5:", success_5 / len(en_words))
    print("P@10:", success_10 / len(en_words))
    print("distance_sum:", distance_sum)

In [6]:
# Run all_tests on every saved simple-joined model with vector size 50, for
# each context window in the range below.
vector_size = 50
models = ['sg']
for model_type, window in ((m, w) for m in models for w in range(50, 175, 25)):
    model = Word2Vec.load(
        "trained_models/eurlex_simple_joined_{:s}_{:d}x{:d}".format(model_type, window, vector_size)
    )
    print(
        "-----Vector Size: {:d}, Context Window: {:d}, Model Type: {:s}-----".format(
            vector_size, window, model_type
        )
    )
    all_tests(model, en_words, cz_words, en_freqs)

-----Vector Size: 50, Context Window: 50, Model Type: sg-----
P@5: 0.26666666666666666
P@10: 0.36666666666666664
distance_sum: 4.772666817319418
-----Vector Size: 50, Context Window: 75, Model Type: sg-----
P@5: 0.4
P@10: 0.43333333333333335
distance_sum: 5.331978549992691
-----Vector Size: 50, Context Window: 100, Model Type: sg-----
P@5: 0.36666666666666664
P@10: 0.43333333333333335
distance_sum: 5.264594803225473
-----Vector Size: 50, Context Window: 125, Model Type: sg-----
P@5: 0.3333333333333333
P@10: 0.36666666666666664
distance_sum: 4.998102937632998
-----Vector Size: 50, Context Window: 150, Model Type: sg-----
P@5: 0.3333333333333333
P@10: 0.36666666666666664
distance_sum: 5.443800120999057
