# word2vec evaluation


---
# Setups

In [None]:
import cProfile
import sys
import os
import re
from itertools import islice
from typing import Dict, List
import numpy as np
import tensorflow as tf

# Print arrays in full (never summarised with "...") and allow wide rows.
np.set_printoptions(threshold=sys.maxsize, linewidth=400)

# Jupyter notebook setups

Auto reload can cause an error in Jupyter notebooks. Restart the Jupyter kernel if the following error occurs:
```TypeError: super(type, obj): obj must be an instance or subtype of type```
See
- https://stackoverflow.com/a/52927102/4281353
- http://thomas-cokelaer.info/blog/2011/09/382/

> The problem resides in the mechanism of reloading modules.
> Reloading a module often changes the internal object in memory which
> makes the isinstance test of super return False.

In [None]:
%load_ext line_profiler
%load_ext autoreload

## Utilities

In [None]:
%autoreload 2

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list.

    Fewer than ``n`` items are returned when the iterable is exhausted early.
    """
    head = islice(iterable, n)
    return [*head]

import function.fileio as fileio
import function.text as text

---
# Data Types


In [None]:
from common.constant import (
    TYPE_INT,
    TYPE_FLOAT,
    TYPE_LABEL,
    TYPE_TENSOR,
)

# Constants

In [None]:
# Corpus selection: the two flags are mutually exclusive by construction.
USE_TEXT8 = True
USE_PTB = not USE_TEXT8

# Model selection: CBOW or skip-gram, again mutually exclusive.
USE_CBOW = False
USE_SGRAM = not USE_CBOW

# Local file name of the corpus and the URL it is downloaded from.
CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb_train"
CORPUS_URL = (
    "https://data.deepai.org/text8.zip"
    if USE_TEXT8
    # Plain string: the original f-string prefix had no placeholders.
    else "https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt"
)

TARGET_SIZE = 1    # Size of the target event (word)
CONTEXT_SIZE = 10  # Size of the context.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE  # Target plus its context.
SAMPLE_SIZE = 5    # Size of the negative samples
VECTOR_SIZE = 100  # Number of features in the event vector.

In [None]:
# Saved model state files, given relative to the notebook's working directory.
# The constant names mirror the parameter tags embedded in the file names
# (presumably C = context size, S = negative samples, V = vector size,
# LR = learning rate — confirm against the training notebook).
STATE_FILE_PTB_C10_S5_WND_V100_LR5 = "../models/word2vec_sgram_ptb_train_E1_C10_S5_Wnormal_std_0.01_V100_LR5.0_N10.pkl"
STATE_FILE_PTB_C10_S5_WND_V100_LR20 = "../models/word2vec_sgram_ptb_train_E1_C10_S5_Wnormal_std_0.01_V100_LR20.0_N10.pkl"
# NOTE(review): this name uses a lowercase "v100" unlike the two PTB constants.
STATE_FILE_TEXT8_C10_S5_WND_v100_LR20="../models/word2vec_sgram_text8_512_E1_C10_S5_Wnormal_std_0.01_V100_LR20.0_N1.pkl"
# Default state file: the text8-trained model.
STATE_FILE = STATE_FILE_TEXT8_C10_S5_WND_v100_LR20

---

# Data
## Corpus

In [None]:
# Expand "~" explicitly: Python's file APIs treat a leading "~" as a literal
# directory name, and this same path is reused later to read the corpus.
# (Whether fileio.Function.is_file expands "~" itself is not visible here.)
path_to_corpus = os.path.expanduser(f"~/.keras/datasets/{CORPUS_FILE}")
if not fileio.Function.is_file(path_to_corpus):
    # Not available locally: download it and use the path Keras reports.
    # text8, run "cat text8 | xargs -n 512 > text8_512" after download
    path_to_corpus = tf.keras.utils.get_file(
        fname=CORPUS_FILE,
        origin=CORPUS_URL,
        extract=True
    )
corpus = fileio.Function.read_file(path_to_corpus)
print(path_to_corpus)

In [None]:
# Show only the first line of the corpus as a sanity check.
examples = corpus.split('\n')[:1]
print(*examples, sep='\n')

---
# Event (word) indexing
Index the events that have occurred in the event sequence.

In [None]:
%autoreload 2
from layer.preprocessing import (
    EventIndexing, 
)

In [None]:
# Build the event (word) index from the loaded corpus text.
# NOTE(review): the name says "ptb" even when the text8 corpus is selected
# (USE_TEXT8 = True) — confirm whether the name matters downstream.
word_indexing = EventIndexing(
    name="word_indexing_on_ptb",
    corpus=corpus
)
# Drop the notebook's reference to the raw corpus text; cells that read
# `corpus` cannot be re-run after this without reloading it.
del corpus

---
# Word Embedding

Embedding trains the model to place similar events in close proximity in the event vector space. If two events, e.g. 'pencil' and 'pen', are similar concepts, then their event vectors reside close to each other in the event space.

* [Thought Vectors](https://wiki.pathmind.com/thought-vectors)



In [None]:
%autoreload 2
# Pick the Embedding implementation that matches the configured model type.
if USE_CBOW:
    # Module path has no ".py" suffix: with the suffix, Python would look for
    # a submodule named "py" inside the package and the import would fail.
    from layer.embedding_cbow_dual_vector_spaces import (
        Embedding
    )
else:
    from layer.embedding_sgram import (
        Embedding
    )

from optimizer import (
    SGD
)

In [None]:
# Instantiate the embedding layer with the sizes from the constants cell and
# the word index as its dictionary. Trained weights are loaded later via
# embedding.load(...).
embedding: Embedding = Embedding(
    name="embedding",
    num_nodes=WINDOW_SIZE,
    target_size=TARGET_SIZE,
    context_size=CONTEXT_SIZE,
    negative_sample_size=SAMPLE_SIZE,
    event_vector_size=VECTOR_SIZE,
    dictionary=word_indexing
)

---
# Evaluate the vector space

Verify if the trained model, or the vector space W, has encoded the words in a way that **similar** words are close in the vector space.

* [How to measure the similarity among vectors](https://math.stackexchange.com/questions/4132458)

# Benchmark 

Use [gensim word2vec](https://radimrehurek.com/gensim/models/word2vec.html) as the benchmark.

In [None]:
from gensim.models import (
    Word2Vec
)
from gensim.models.word2vec import (
    LineSentence    
)

In [None]:
# Feed gensim the same corpus file that was loaded above.
sentences = LineSentence(source=path_to_corpus)

In [None]:
# Train the gensim benchmark model on the corpus.
# NOTE(review): sg=0 selects CBOW in gensim, while the constants cell selects
# skip-gram (USE_CBOW = False); window=5 also differs from CONTEXT_SIZE = 10.
# Confirm the benchmark is intentionally configured differently.
# NOTE(review): no seed is passed, so results may vary between runs.
w2v = Word2Vec(
    sentences=sentences, 
    sg=0,
    window=5, 
    negative=5,
    vector_size=100, 
    min_count=1, 
    workers=4
)
# The sentence iterator is not needed once training has finished.
del sentences

### Input word

In [32]:
# Number of nearest words to list for the query.
n = 10
# Query word(s) as a list; put several whitespace-separated words in the
# string to query more than one.
context = "king".split()

## Text8 based trained model

Split text8 into lines where each line has N words (e.g. 512)
```
N=512
cat text8 | xargs -n $N > text8_$N
```

### TEXT8_C10_S5_WND_v100_LR20

```
USE_TEXT8 = True

CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb_train"
CORPUS_URL = "https://data.deepai.org/text8.zip" \
    if USE_TEXT8 else 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'

TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)      # Size of the negative samples
VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.

WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}
LR = TYPE_FLOAT(20.0)

NUM_SENTENCES = 1
```

In [None]:
# Restore the text8-trained state into the embedding and report the sizes
# stored in the returned state.
state = embedding.load(STATE_FILE_TEXT8_C10_S5_WND_v100_LR20)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
# Values are substituted in the order of the keys listed here.
print(fmt % tuple(
    state[key]
    for key in ("target_size", "context_size", "event_vector_size")
))

### Comparison between text8-based model and gensim

In [49]:
# Resolve the query words to indices in this cell so it runs on a fresh
# kernel; word_indices is otherwise only assigned in later cells.
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

# predict() returns two rankings of word indices: one by cosine, one by distance.
cosines, distances = embedding.predict(word_indices, n)

print("Text8 model predictions (cosine evaluation) for %s:" % context)
for word_index in cosines:
    print(f"{word_index}:{word_indexing.list_events([word_index])}")

print("\nText8 model predictions (distance evaluation) for %s:" % context)
for word_index in distances:
    print(f"{word_index}:{word_indexing.list_events([word_index])}")

# gensim benchmark: (word, similarity) pairs for the same query.
print("\nGensim predictions for %s:" % context)
for word in w2v.wv.most_similar(context, topn=n):
    print(word)

Text8 model predictions (cosine evaluation) for ['king']:
2971:son
4962:henry
1229:mary
5569:queen
8165:emperor
4906:england
6156:prince
5657:charles
4998:james
4567:iii

Text8 model predictions (distance evaluation) for ['king']:
2971:son
4962:henry
4906:england
8165:emperor
4998:james
5657:charles
3371:president
1229:mary
2768:minister
173:william

Gensim predictions for ['king']:
('prince', 0.7555170059204102)
('queen', 0.7447685599327087)
('throne', 0.7199389338493347)
('emperor', 0.7015615105628967)
('kings', 0.6873074173927307)
('vii', 0.6840494871139526)
('regent', 0.6827466487884521)
('pope', 0.6799136400222778)
('elector', 0.6754884123802185)
('sultan', 0.6695173978805542)


---
## PTB based training

### C10_S5_WND_V100_LR5

```
TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)       # Size of the negative samples

VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.
WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}

LR = TYPE_FLOAT(5)
NUM_SENTENCES = 10
```

In [None]:
# Restore the PTB-trained (LR=5) state into the embedding and report the
# sizes stored in the returned state. The constant name matches the one
# defined in the state-file cell (STATE_FILE_PTB_..., not STATE_PTB_FILE_...).
state = embedding.load(STATE_FILE_PTB_C10_S5_WND_V100_LR5)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
print(fmt % (
    state["target_size"],
    state["context_size"],
    state["event_vector_size"]
))

In [None]:
# Map the query words to their indices in the word index.
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

print(f"Words {context}")
print(f"Word indices {word_indices}")
# NOTE(review): an earlier cell unpacks embedding.predict(...) into
# (cosines, distances); here the whole return value is wrapped in a list and
# passed to list_events — confirm this matches the current predict() API.
# NOTE(review): presumably the dictionary should be built from the PTB corpus
# when PTB-trained weights are loaded — verify, since text8 is selected above.
print(word_indexing.list_events([embedding.predict(word_indices, n)]))

### C10_S5_WND_V100_LR20

```
TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)       # Size of the negative samples

VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.
WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}

LR = TYPE_FLOAT(20)
NUM_SENTENCES = 10
```

In [None]:
# Restore the PTB-trained (LR=20) state into the embedding and report the
# sizes stored in the returned state.
state = embedding.load(STATE_FILE_PTB_C10_S5_WND_V100_LR20)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
# Values are substituted in the order of the keys listed here.
print(fmt % tuple(
    state[key]
    for key in ("target_size", "context_size", "event_vector_size")
))

In [None]:
# Map the query words to their indices in the word index.
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

print(f"Words {context}")
print(f"Word indices {word_indices}")
# NOTE(review): an earlier cell unpacks embedding.predict(...) into
# (cosines, distances); here the whole return value is wrapped in a list and
# passed to list_events — confirm this matches the current predict() API.
# NOTE(review): presumably the dictionary should be built from the PTB corpus
# when PTB-trained weights are loaded — verify, since text8 is selected above.
print(word_indexing.list_events([embedding.predict(word_indices, n)]))