# word2vec evaluation


---
# Setups

In [1]:
import cProfile
import sys
import os
import re
from itertools import islice
from typing import Dict, List
import numpy as np
import tensorflow as tf

np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(linewidth=400) 

# Jupyter notebook setups

Auto reolaod causes an error in Jupyter notebooks. Restart the Jupyter kernel for the error:
```TypeError: super(type, obj): obj must be an instance or subtype of type```
See
- https://stackoverflow.com/a/52927102/4281353
- http://thomas-cokelaer.info/blog/2011/09/382/

> The problem resides in the mechanism of reloading modules.
> Reloading a module often changes the internal object in memory which
> makes the isinstance test of super return False.

In [2]:
%load_ext line_profiler
%load_ext autoreload

## Utilites

In [3]:
%autoreload 2

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

import function.fileio as fileio
import function.text as text

---
# Data Types


In [4]:
from common.constant import (
    TYPE_INT,
    TYPE_FLOAT,
    TYPE_LABEL,
    TYPE_TENSOR,
)

# Constants

In [15]:
USE_PTB = False
USE_TEXT8 = True

CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb_train"
CORPUS_URL = "https://data.deepai.org/text8.zip" \
    if USE_TEXT8 else f'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'

TARGET_SIZE = 1   # Size of the target event (word)
CONTEXT_SIZE = 10  # Size of the context.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = 5   # Size of the negative samples
VECTOR_SIZE = 100  # Number of features in the event vector.

---

# Data
## Corpus

In [16]:
path_to_corpus = f"~/.keras/datasets/{CORPUS_FILE}"
if fileio.Function.is_file(path_to_corpus):
    pass
else:
    # text8, run "cat text8 | xargs -n 512 > text8_512" after download
    path_to_corpus = tf.keras.utils.get_file(
        fname=CORPUS_FILE,
        origin=CORPUS_URL,
        extract=True
    )
corpus = fileio.Function.read_file(path_to_corpus)
print(path_to_corpus)

/home/oonisim/.keras/datasets/text8_512


In [17]:
examples = corpus.split('\n')[:1]
for line in examples:
    print(line)

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic institutio

---
# Event (word) indexing
Index the events that have occurred in the event sequence.

In [18]:
%autoreload 2
from layer.preprocessing import (
    EventIndexing, 
)

In [19]:
word_indexing = EventIndexing(
    name="word_indexing_on_ptb",
    corpus=corpus
)
del corpus

---
# Word Embedding

Embedding is to train the model to group similar events in a close proximity in the event vector space. If two events e.g. 'pencil' and 'pen' are similar concepts, then their event vectors resides in a close distance in the event space. 

* [Thought Vectors](https://wiki.pathmind.com/thought-vectors)



In [20]:
%autoreload 2
from layer import (
    Embedding
)

In [21]:
embedding: Embedding = Embedding(
    name="embedding",
    num_nodes=WINDOW_SIZE,
    target_size=TARGET_SIZE,
    context_size=CONTEXT_SIZE,
    negative_sample_size=SAMPLE_SIZE,
    event_vector_size=VECTOR_SIZE,
    dictionary=word_indexing
)

In [22]:
STATE_FILE_100 = "../models/word2vec_text8_512_E1_C10_Wnormal_std_0.01_V100_LR100.0_S5_N1.pkl"

---
# Evaluate the vector space

Verify if the trained model, or the vector space W, has encoded the words in a way that **similar** words are close in the vector space.

* [How to measure the similarity among vectors](https://math.stackexchange.com/questions/4132458)

In [23]:
n = 20
context = "insurance".split()
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

print(f"Words {context}")
print(f"Word indices {word_indices}")

Words ['insurance']
Word indices [5735]


## Vector size 100

In [24]:
state = embedding.load(STATE_FILE_100)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
print(fmt % (
    state["target_size"], 
    state["context_size"], 
    state["event_vector_size"]
))

print(word_indexing.list_events([embedding.predict(word_indices, n)]))

Model loaded.
event_size 1
context_size: 10
event_vector_size: 100

[['kovacevic' 'logged' 'uele' 'longhand' 'implanted' 'obe' 'sudamerikano' 'sandelson' 'atwa' 'principium' 'alldeutscher' 'eventers' 'electrics' 'arlima' 'ahonen' 'lvsborg' 'hamburgians' 'bianchinetta' 'vfax' 'millard']
 ['uele' 'atwa' 'principium' 'kovacevic' 'renewal' 'didymium' 'bojanowo' 'sudamerikano' 'obe' 'qoholet' 'militarize' 'lyxose' 'electrode' 'encumbrance' 'spenard' 'szczytno' 'chasey' 'ggj' 'shipbuilders' 'maceration']]


  return self._vocabulary[list(iter(indices))]


---
# Compare with [gensim word2vec](https://radimrehurek.com/gensim/models/word2vec.html)

In [373]:
from gensim.models import (
    Word2Vec
)
from gensim.models.word2vec import (
    LineSentence    
)

In [374]:
sentences = LineSentence(source="~/.keras/datasets/ptb.train.txt")

In [375]:
w2v = Word2Vec(
    sentences=sentences, 
    sg=0,
    window=5, 
    negative=5,
    vector_size=100, 
    min_count=1, 
    workers=4
)
del sentences

In [400]:
w2v.wv.most_similar(context, topn=n)

[('development', 0.8435715436935425),
 ('services', 0.8369989395141602),
 ('service', 0.8368431329727173),
 ('capital', 0.8317663669586182),
 ('savings', 0.8290608525276184),
 ('semiconductor', 0.828930675983429),
 ('commercial', 0.8268676400184631),
 ('resources', 0.8260855078697205),
 ('equipment', 0.8258163332939148),
 ('electronic', 0.822940468788147),
 ('machines', 0.8209294080734253),
 ('software', 0.8201037645339966),
 ('industries', 0.8191213607788086),
 ('international', 0.8188644647598267),
 ('electric', 0.81886225938797),
 ('chemical', 0.8178213238716125),
 ('aerospace', 0.8141826391220093),
 ('technology', 0.813803493976593),
 ('systems', 0.8132439851760864),
 ('american', 0.8110503554344177)]