Embeddings (NILC word embeddings repository):
* http://www.nilc.icmc.usp.br/nilc/index.php/repositorio-de-word-embeddings-do-nilc

# Import Packages

In [1]:
from gensim.models.keyedvectors import KeyedVectors

import logging

# Surface INFO-and-above log records, timestamped, in the notebook output
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

# Declare variables

In [2]:
# Word-vector file passed to KeyedVectors.load_word2vec_format in the load cell
embeddings_path = "../datasets/glove_s300.txt"
# Analogy question file passed to embeddings.accuracy
analogies_test_path = "../datasets/LX-4WAnalogiesBr.txt"
# Word-pair similarity file passed to embeddings.evaluate_word_pairs
similarity_test_path = "../datasets/LX-WordSim-353.txt"

# Load data

In [3]:
%time embeddings = KeyedVectors.load_word2vec_format(embeddings_path, limit=100000)

2017-12-01 03:15:50,465 : INFO : loading projection weights from ../datasets/glove_s300.txt
2017-12-01 03:16:29,979 : INFO : loaded (100000, 300) matrix from ../datasets/glove_s300.txt


CPU times: user 39.4 s, sys: 200 ms, total: 39.6 s
Wall time: 39.5 s


# Intrinsic evaluation

In [4]:
# Score the embeddings on the analogy question file; per-section accuracy is written to the log.
# NOTE(review): later gensim releases deprecate KeyedVectors.accuracy in favour of
# evaluate_word_analogies, which has a different return shape — confirm against the installed
# gensim version before upgrading.
%time intrinsic_results = embeddings.accuracy(analogies_test_path)

2017-12-01 03:16:30,049 : INFO : precomputing L2-norms of word weight vectors
2017-12-01 03:16:31,737 : INFO : capital-common-countries: 87.6% (268/306)
2017-12-01 03:16:36,556 : INFO : capital-world: 77.0% (718/932)
2017-12-01 03:16:37,007 : INFO : currency: 3.5% (3/86)
2017-12-01 03:16:40,052 : INFO : city-in-state: 32.3% (191/591)
2017-12-01 03:16:41,815 : INFO : family: 67.3% (230/342)
2017-12-01 03:16:44,420 : INFO : gram1-adjective-to-adverb: 12.6% (64/506)
2017-12-01 03:16:45,367 : INFO : gram2-opposite: 27.5% (50/182)
2017-12-01 03:16:45,527 : INFO : gram3-comparative: 56.7% (17/30)
2017-12-01 03:16:45,569 : INFO : gram4-superlative: 83.3% (5/6)
2017-12-01 03:16:48,910 : INFO : gram5-present-participle: 86.2% (560/650)
2017-12-01 03:16:53,951 : INFO : gram6-nationality-adjective: 94.7% (917/968)
2017-12-01 03:16:59,058 : INFO : gram7-past-tense: 50.1% (497/992)
2017-12-01 03:17:04,138 : INFO : gram8-plural: 58.2% (577/992)
2017-12-01 03:17:06,974 : INFO : gram9-plural-verbs: 47

CPU times: user 1min 13s, sys: 452 ms, total: 1min 13s
Wall time: 37 s


In [5]:
# Evaluate the embeddings against the word-pair similarity file; the log reports the Pearson and
# Spearman correlations plus the ratio of pairs that contain unknown words.
%time intrinsic_results_2 = embeddings.evaluate_word_pairs(similarity_test_path)

2017-12-01 03:17:07,253 : INFO : Pearson correlation coefficient against ../datasets/LX-WordSim-353.txt: 0.4809
2017-12-01 03:17:07,254 : INFO : Spearman rank-order correlation coefficient against ../datasets/LX-WordSim-353.txt: 0.4631
2017-12-01 03:17:07,257 : INFO : Pairs with unknown words ratio: 2.8%


CPU times: user 356 ms, sys: 16 ms, total: 372 ms
Wall time: 279 ms


# Visual Evaluation

## Most similar words

### Country

In [6]:
# Three most similar words to "canadá" (Canada), with their similarity scores
embeddings.similar_by_word("canadá", topn=3)

[('austrália', 0.7295444011688232),
 ('zelândia', 0.7236131429672241),
 ('grã-bretanha', 0.6537486910820007)]

### Job

In [7]:
# Three most similar words to "analista" (analyst), with their similarity scores
embeddings.similar_by_word("analista", topn=3)

[('especialista', 0.5496712327003479),
 ('consultor', 0.530369222164154),
 ('economista', 0.5302985310554504)]

### Company

In [8]:
# Three most similar words to "empreiteira" (construction contractor), with their similarity scores
embeddings.similar_by_word("empreiteira", topn=3)

[('odebrecht', 0.7677455544471741),
 ('construtora', 0.7101113200187683),
 ('oas', 0.7029767632484436)]

In [9]:
# Three most similar words to "carpinteiro" (carpenter), with their similarity scores.
# NOTE(review): this query is a trade rather than a company, although the cell sits under the
# "Company" heading — consider moving it or giving it its own heading.
embeddings.similar_by_word("carpinteiro", topn=3)

[('pedreiro', 0.6096227169036865),
 ('marceneiro', 0.5527135133743286),
 ('serralheiro', 0.5456693768501282)]

### Course

In [10]:
# Three most similar words to "psicologia" (psychology), with their similarity scores
embeddings.similar_by_word("psicologia", topn=3)

[('sociologia', 0.7215724587440491),
 ('pedagogia', 0.6655964255332947),
 ('filosofia', 0.6611499190330505)]

### Job Hierarchy

In [11]:
# Three most similar words to "supervisor", with their similarity scores
embeddings.similar_by_word("supervisor", topn=3)

[('coordenador', 0.5576378703117371),
 ('diretor', 0.5296371579170227),
 ('gerente', 0.5212496519088745)]

In [12]:
# Three most similar words to "estagiário" (intern), with their similarity scores
embeddings.similar_by_word("estagiário", topn=3)

[('assistente', 0.4384298324584961),
 ('funcionário', 0.43782487511634827),
 ('aprendiz', 0.4022955596446991)]

## Semantic analogies

### Gender

In [13]:
# Analogy query: "homem" (man) is to "rei" (king) as "mulher" (woman) is to ?  Only the top hit is requested.
embeddings.most_similar(positive=['mulher', 'rei'], negative=['homem'], topn=1)

[('rainha', 0.7193284034729004)]

### Country-Capital

In [14]:
# Analogy query: "japão" (Japan) is to "tóquio" (Tokyo) as "canadá" (Canada) is to ?  Top two hits requested.
# NOTE(review): the displayed result lists "londres" and "toronto"; neither is Ottawa, Canada's
# capital, so this analogy is not solved — worth stating in the surrounding prose.
embeddings.most_similar(positive=['canadá', 'tóquio'], negative=['japão'], topn=2)

[('londres', 0.6558359861373901), ('toronto', 0.6119667291641235)]

### Country-Currency

In [15]:
# Analogy query: "eua" (USA) is to "dólar" (dollar) as "japão" (Japan) is to ?  Only the top hit is requested.
embeddings.most_similar(positive=['dólar', 'japão'], negative=['eua'], topn=1)

[('iene', 0.618338406085968)]

### Composition

In [16]:
# Composition query with no negative terms: words most similar to "roger" and "tênis" (tennis) combined; top two hits
embeddings.most_similar(positive=['roger', 'tênis'], topn=2)

[('torneio', 0.5396567583084106), ('federer', 0.5071016550064087)]

In [17]:
# Composition query with no negative terms: words most similar to "capital" and "inglaterra" (England) combined; top three hits
embeddings.most_similar(positive=['capital', 'inglaterra'], topn=3)

[('cidade', 0.6659485697746277),
 ('frança', 0.6098123788833618),
 ('londres', 0.6050515174865723)]