# word2vec evaluation


---
# Setups

In [1]:
import cProfile
import sys
import os
import re
from itertools import islice
from typing import Dict, List
import numpy as np
import tensorflow as tf

# Print arrays in full (no "..." summarisation) and allow wide rows.
np.set_printoptions(threshold=sys.maxsize, linewidth=400)

# Jupyter notebook setups

Autoreload can cause the following error in Jupyter notebooks. If it occurs, restart the Jupyter kernel:
```TypeError: super(type, obj): obj must be an instance or subtype of type```
See
- https://stackoverflow.com/a/52927102/4281353
- http://thomas-cokelaer.info/blog/2011/09/382/

> The problem resides in the mechanism of reloading modules.
> Reloading a module often changes the internal object in memory which
> makes the isinstance test of super return False.

In [2]:
# Load the IPython extensions used by this notebook:
# line_profiler for line-by-line profiling, autoreload for re-importing edited modules.
%load_ext line_profiler
%load_ext autoreload

## Utilities

In [3]:
%autoreload 2

def take(n, iterable):
    """Collect at most the first ``n`` elements of ``iterable`` into a list.

    Args:
        n: Maximum number of elements to consume.
        iterable: Any iterable; an iterator is advanced by the elements taken.

    Returns:
        A list holding the leading elements (shorter than ``n`` if the
        iterable runs out first).
    """
    head = islice(iterable, n)
    return [*head]

import function.fileio as fileio
import function.text as text

---
# Data Types


In [4]:
from common.constant import (
    TYPE_INT,
    TYPE_FLOAT,
    TYPE_LABEL,
    TYPE_TENSOR,
)

# Constants

In [5]:
# --- Run switches ---
USE_PTB = True       # True: use the PTB training text as the corpus instead of the toy sentence
DEBUG = False
VALIDATION = True

# --- Hyperparameters ---
TARGET_SIZE = 1      # Size of the target event (word)
CONTEXT_SIZE = 10    # Size of the context.
SAMPLE_SIZE = 5      # Size of the negative samples
VECTOR_SIZE = 100    # Number of features in the event vector.

# A window is the context plus the target.
WINDOW_SIZE = CONTEXT_SIZE + TARGET_SIZE

---

# Data
## Corpus

In [6]:
# Toy corpus used when USE_PTB is False.
corpus = "To be, or not to be, that is the question that matters"
_file = "ptb.train.txt"
if USE_PTB:
    # Default to the Keras cache location so that path_to_ptb is always bound,
    # including when the file has already been downloaded. "~" is expanded
    # explicitly so the existence check inspects the real path.
    path_to_ptb = os.path.expanduser(f"~/.keras/datasets/{_file}")
    if not fileio.Function.is_file(path_to_ptb):
        # Download on first use; get_file returns the local path of the file.
        path_to_ptb = tf.keras.utils.get_file(
            _file,
            f'https://raw.githubusercontent.com/tomsercu/lstm/master/data/{_file}'
        )
    corpus = fileio.Function.read_file(path_to_ptb)

In [7]:
# Show the first five lines of the corpus as a sanity check.
examples = corpus.split('\n')[:5]
print(*examples, sep='\n')

 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 
 pierre <unk> N years old will join the board as a nonexecutive director nov. N 
 mr. <unk> is chairman of <unk> n.v. the dutch publishing group 
 rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate 
 a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported 


---
# Event (word) indexing
Index the events that have occurred in the event sequence.

In [8]:
%autoreload 2
from layer.preprocessing import (
    EventIndexing, 
)

In [9]:
# Build the event (word) index from the corpus text.
word_indexing = EventIndexing(corpus=corpus, name="word_indexing_on_ptb")

# Drop the notebook's reference to the raw corpus text now that the index is built.
del corpus

---
# Word Embedding

Embedding trains the model to place similar events close together in the event vector space. If two events, e.g. 'pencil' and 'pen', are similar concepts, then their event vectors reside close to each other in the event space.

* [Thought Vectors](https://wiki.pathmind.com/thought-vectors)



In [19]:
%autoreload 2
from layer import (
    Embedding
)
from optimizer import (
    SGD
)

In [20]:
# Embedding layer configured from the notebook constants; it looks words up
# through the word_indexing dictionary and is optimised with plain SGD.
sgd = SGD(lr=TYPE_FLOAT(0.3))
embedding: Embedding = Embedding(
    name="embedding",
    dictionary=word_indexing,
    optimizer=sgd,
    num_nodes=WINDOW_SIZE,
    target_size=TARGET_SIZE,
    context_size=CONTEXT_SIZE,
    negative_sample_size=SAMPLE_SIZE,
    event_vector_size=VECTOR_SIZE,
)

In [21]:
# Saved model state files, one per event vector size, passed to embedding.load()
# in the evaluation cells. The size-20 file is currently disabled.
#STATE_FILE_20 = "../models/word2vec_vecsize_20.pkl"
STATE_FILE_50 = "../models/word2vec_vecsize_50.pkl"
STATE_FILE_100 = "../models/word2vec_vecsize_100.pkl"

---
# Evaluate the vector space

Verify if the trained model, or the vector space W, has encoded the words in a way that **similar** words are close in the vector space.

* [How to measure the similarity among vectors](https://math.stackexchange.com/questions/4132458)

In [93]:
# Query word(s) for the similarity check, converted to their dictionary indices.
n = 10  # passed to embedding.predict(); presumably the number of candidates to return (confirm)
context = "laundering".split()
indices = word_indexing.list_indices(context)
word_indices = np.array(indices, dtype=TYPE_INT)

print(f"Words {context}", f"Word indices {word_indices}", sep="\n")

Words ['laundering']
Word indices [8239]


## Vector size 50

In [94]:
# Restore the model state saved for vector size 50 and report its settings.
state = embedding.load(STATE_FILE_50)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
summary_keys = ("target_size", "context_size", "event_vector_size")
print(fmt % tuple(state[key] for key in summary_keys))

# Translate the predicted word indices for the query word(s) back into words.
predicted = embedding.predict(word_indices, n)
print(word_indexing.list_events([predicted]))

Model loaded.
event_size 1
context_size: 10
event_vector_size: 50

[['opinions' 'drama' 'knowledge' 'gates' 'tenants' 'excess' 'shipments' 'irresponsible' 'minivans' 'details']
 ['opinions' 'railway' 'indexation' 'tips' 'teagan' 'jayark' 'supermarkets' 'wilfred' 'contrasts' 'blank']]


## Vector size 100

In [95]:
# Restore the model state saved for vector size 100 and report its settings.
state = embedding.load(STATE_FILE_100)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
summary_keys = ("target_size", "context_size", "event_vector_size")
print(fmt % tuple(state[key] for key in summary_keys))

# Translate the predicted word indices for the query word(s) back into words.
predicted = embedding.predict(word_indices, n)
print(word_indexing.list_events([predicted]))

Model loaded.
event_size 1
context_size: 10
event_vector_size: 100

[['biscuits' 'nomura' 'glazer' 'faded' 'should' 'tumbling' 'apart' 'credited' 'unique' 'guterman']
 ['unique' 'investigations' 'nomura' 'church' 'epo' 'apart' 'necessary' 'biscuits' 'incorrect' 'bleak']]


---
# Compare with [gensim word2vec](https://radimrehurek.com/gensim/models/word2vec.html)

In [96]:
from gensim.models import (
    Word2Vec
)
from gensim.models.word2vec import (
    LineSentence    
)

In [80]:
# Stream the PTB training text for gensim (LineSentence treats one line as one sentence).
sentences = LineSentence("~/.keras/datasets/ptb.train.txt")

In [86]:
# Train a gensim baseline with the same hyperparameters as the notebook's own
# embedding so the comparison stays like-for-like when the constants change.
w2v = Word2Vec(
    sentences=sentences,
    sg=0,                        # 0 selects CBOW, 1 would select skip-gram
    # gensim's window is the max distance on EACH side of the target word;
    # assumes CONTEXT_SIZE counts both sides symmetrically (TODO confirm).
    window=CONTEXT_SIZE // 2,
    negative=SAMPLE_SIZE,        # number of negative samples
    vector_size=VECTOR_SIZE,     # dimensionality of the word vectors
    min_count=1,                 # keep every word, however rare
    workers=4
)
# Drop the notebook's reference to the sentence stream once training is done.
del sentences

In [97]:
# Ten nearest words to the query word(s) according to the gensim model.
w2v.wv.most_similar(positive=context, topn=10)

[('procedures', 0.9436922073364258),
 ('execute', 0.941146969795227),
 ('tasks', 0.9382684826850891),
 ('opportunities', 0.93809974193573),
 ('tougher', 0.9370934367179871),
 ('jurors', 0.9356532692909241),
 ('harm', 0.9328603148460388),
 ('overly', 0.9327043890953064),
 ('ideas', 0.9326834082603455),
 ('safer', 0.9326456785202026)]