# word2vec evaluation


---
# Setups

In [1]:
import cProfile
import sys
import os
import re
from itertools import islice
from typing import Dict, List
import numpy as np
import tensorflow as tf

np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(linewidth=400) 

# Jupyter notebook setups

Auto reolaod causes an error in Jupyter notebooks. Restart the Jupyter kernel for the error:
```TypeError: super(type, obj): obj must be an instance or subtype of type```
See
- https://stackoverflow.com/a/52927102/4281353
- http://thomas-cokelaer.info/blog/2011/09/382/

> The problem resides in the mechanism of reloading modules.
> Reloading a module often changes the internal object in memory which
> makes the isinstance test of super return False.

In [2]:
%load_ext line_profiler
%load_ext autoreload

## Utilites

In [3]:
%autoreload 2

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

import function.fileio as fileio
import function.text as text

---
# Data Types


In [4]:
from common.constant import (
    TYPE_INT,
    TYPE_FLOAT,
    TYPE_LABEL,
    TYPE_TENSOR,
)

# Constants

In [5]:
USE_TEXT8 = False
USE_PTB = not USE_TEXT8
USE_CBOW = False
USE_SGRAM = not USE_CBOW

CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb_train"
CORPUS_URL = "https://data.deepai.org/text8.zip" \
    if USE_TEXT8 else f'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'

TARGET_SIZE = 1   # Size of the target event (word)
CONTEXT_SIZE = 10  # Size of the context.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = 5   # Size of the negative samples
VECTOR_SIZE = 100  # Number of features in the event vector.

---

# Data
## Corpus

In [6]:
path_to_corpus = f"~/.keras/datasets/{CORPUS_FILE}"
if fileio.Function.is_file(path_to_corpus):
    pass
else:
    # text8, run "cat text8 | xargs -n 512 > text8_512" after download
    path_to_corpus = tf.keras.utils.get_file(
        fname=CORPUS_FILE,
        origin=CORPUS_URL,
        extract=True
    )
corpus = fileio.Function.read_file(path_to_corpus)
print(path_to_corpus)

/home/oonisim/.keras/datasets/ptb_train


In [7]:
examples = corpus.split('\n')[:1]
for line in examples:
    print(line)

 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 


---
# Event (word) indexing
Index the events that have occurred in the event sequence.

In [8]:
%autoreload 2
from layer.preprocessing import (
    EventIndexing, 
)



In [9]:
word_indexing = EventIndexing(
    name="word_indexing_on_ptb",
    corpus=corpus
)
del corpus

---
# Word Embedding

Embedding is to train the model to group similar events in a close proximity in the event vector space. If two events e.g. 'pencil' and 'pen' are similar concepts, then their event vectors resides in a close distance in the event space. 

* [Thought Vectors](https://wiki.pathmind.com/thought-vectors)



In [10]:
%autoreload 2
if USE_CBOW:
    from layer.embedding_cbow_dual_vector_spaces.py import (
        Embedding
    )
else:
    from layer.embedding_sgram import (
        Embedding
    )

from optimizer import (
    SGD
)

In [11]:
embedding: Embedding = Embedding(
    name="embedding",
    num_nodes=WINDOW_SIZE,
    target_size=TARGET_SIZE,
    context_size=CONTEXT_SIZE,
    negative_sample_size=SAMPLE_SIZE,
    event_vector_size=VECTOR_SIZE,
    dictionary=word_indexing
)

In [27]:
STATE_FILE_C10_S5_WND_V100_LR5 = "../models/word2vec_sgram_ptb_train_E1_C10_S5_Wnormal_std_0.01_V100_LR5.0_N10.pkl"
STATE_FILE_C10_S5_WND_V100_LR20 = "../models/word2vec_sgram_ptb_train_E1_C10_S5_Wnormal_std_0.01_V100_LR20.0_N10.pkl"

STATE_FILE = STATE_FILE_C10_S5_WND_V100_LR20

---
# Evaluate the vector space

Verify if the trained model, or the vector space W, has encoded the words in a way that **similar** words are close in the vector space.

* [How to measure the similarity among vectors](https://math.stackexchange.com/questions/4132458)

In [70]:
n = 10
context = "computer".split()

## C10_S5_WND_V100_LR5

```
TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)       # Size of the negative samples

VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.
WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}

LR = TYPE_FLOAT(5)
NUM_SENTENCES = 10
```

In [71]:
state = embedding.load(STATE_FILE_C10_S5_WND_V100_LR5)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
print(fmt % (
    state["target_size"], 
    state["context_size"], 
    state["event_vector_size"]
))

Model loaded.
event_size 1
context_size: 10
event_vector_size: 100



In [72]:
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

print(f"Words {context}")
print(f"Word indices {word_indices}")
print(word_indexing.list_events([embedding.predict(word_indices, n)]))

Words ['computer']
Word indices [1069]
[['computers' 'data' 'software' 'equipment' 'technology' 'food' 'digital' 'electronic' 'huge' 'oil']
 ['computers' 'data' 'software' 'technology' 'equipment' 'food' 'digital' 'electronic' 'huge' 'cable']]


## C10_S5_WND_V100_LR20

```
TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)       # Size of the negative samples

VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.
WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}

LR = TYPE_FLOAT(20)
NUM_SENTENCES = 10
```

In [73]:
state = embedding.load(STATE_FILE_C10_S5_WND_V100_LR20)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
print(fmt % (
    state["target_size"], 
    state["context_size"], 
    state["event_vector_size"]
))

Model loaded.
event_size 1
context_size: 10
event_vector_size: 100



In [74]:
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

print(f"Words {context}")
print(f"Word indices {word_indices}")
print(word_indexing.list_events([embedding.predict(word_indices, n)]))

Words ['computer']
Word indices [1069]
[['computers' 'software' 'digital' 'graphics' 'chip' 'equipment' 'store' 'portable' 'ibm' 'systems']
 ['computers' 'software' 'digital' 'chip' 'graphics' 'store' 'ibm' 'mainframe' 'processing' 'disk']]


---
# Compare with [gensim word2vec](https://radimrehurek.com/gensim/models/word2vec.html)

In [51]:
from gensim.models import (
    Word2Vec
)
from gensim.models.word2vec import (
    LineSentence    
)

In [52]:
sentences = LineSentence(source=path_to_corpus)

In [53]:
w2v = Word2Vec(
    sentences=sentences, 
    sg=0,
    window=5, 
    negative=5,
    vector_size=100, 
    min_count=1, 
    workers=4
)
del sentences

In [75]:
w2v.wv.most_similar(context, topn=n)

[('software', 0.8628386855125427),
 ('systems', 0.8502405285835266),
 ('machines', 0.8363229036331177),
 ('chemical', 0.8266561627388),
 ('electronics', 0.8167977333068848),
 ('data', 0.8129450082778931),
 ('digital', 0.8107041716575623),
 ('giant', 0.8097655773162842),
 ('steel', 0.809612512588501),
 ('maker', 0.8053660988807678)]