# word2vec evaluation


---
# Setups

In [22]:
import cProfile
import sys
import os
import re
from itertools import islice
from typing import Dict, List
import numpy as np
import tensorflow as tf

# Show numpy arrays in full (summarisation threshold raised to sys.maxsize)
# and allow rows up to 400 characters wide.
np.set_printoptions(threshold=sys.maxsize, linewidth=400)

# Jupyter notebook setups

Autoreload can cause an error in Jupyter notebooks. If the following error occurs, restart the Jupyter kernel:
```TypeError: super(type, obj): obj must be an instance or subtype of type```
See
- https://stackoverflow.com/a/52927102/4281353
- http://thomas-cokelaer.info/blog/2011/09/382/

> The problem resides in the mechanism of reloading modules.
> Reloading a module often changes the internal object in memory which
> makes the isinstance test of super return False.

In [23]:
# Load the IPython extensions used by this notebook:
# - line_profiler: line-by-line profiling
# - autoreload: enables the "%autoreload 2" magics used in later cells
%load_ext line_profiler
%load_ext autoreload

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Utilities

In [24]:
%autoreload 2

def take(n, iterable):
    """Collect the leading ``n`` elements of ``iterable`` into a list.

    Args:
        n: maximum number of items to collect.
        iterable: any iterable; if it yields fewer than ``n`` items,
            all of them are returned.

    Returns:
        A list with at most ``n`` items, in iteration order.
    """
    return [item for item in islice(iterable, n)]

import function.fileio as fileio
import function.text as text

---
# Data Types


In [25]:
from common.constant import (
    TYPE_INT,
    TYPE_FLOAT,
    TYPE_LABEL,
    TYPE_TENSOR,
)

# Constants

In [26]:
# Corpus selection: exactly one of text8 / PTB is active.
USE_TEXT8 = True
USE_PTB = not USE_TEXT8
# Model selection: exactly one of CBOW / skip-gram is active.
USE_CBOW = False
USE_SGRAM = not USE_CBOW

# Local corpus file name and the URL it is downloaded from when missing.
CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb_train"
CORPUS_URL = (
    "https://data.deepai.org/text8.zip"
    if USE_TEXT8
    else "https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt"
)

TARGET_SIZE = 1   # Size of the target event (word)
CONTEXT_SIZE = 10  # Size of the context.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE  # Target plus its context
SAMPLE_SIZE = 5   # Size of the negative samples
VECTOR_SIZE = 100  # Number of features in the event vector.

In [27]:
# Saved model states to evaluate. The file names mirror the training settings
# listed in the sections below: C = context size, S = negative sample size,
# V = vector size, LR = learning rate, N = number of sentences.
STATE_FILE_PTB_C10_S5_WND_V100_LR5 = (
    "../models/"
    "word2vec_sgram_ptb_train_E1_C10_S5_Wnormal_std_0.01_V100_LR5.0_N10.pkl"
)
STATE_FILE_PTB_C10_S5_WND_V100_LR20 = (
    "../models/"
    "word2vec_sgram_ptb_train_E1_C10_S5_Wnormal_std_0.01_V100_LR20.0_N10.pkl"
)
STATE_FILE_TEXT8_C10_S5_WND_v100_LR20 = (
    "../models/"
    "word2vec_sgram_text8_512_E1_C10_S5_Wnormal_std_0.01_V100_LR20.0_N1.pkl"
)
STATE_FILE_TEXT8_C6_S6_WND_V100_LR20 = (
    "../models/"
    "word2vec_sgram_text8_512_E1_C6_S6_Wnormal_std_0.01_V100_LR20.0_N1.pkl"
)

---

# Data
## Corpus

In [28]:
# Resolve the corpus location inside the Keras dataset cache.
# "~" is not expanded automatically in a plain path string, so expand it
# explicitly; otherwise the local-file check below cannot match the cached file.
path_to_corpus = os.path.expanduser(f"~/.keras/datasets/{CORPUS_FILE}")
if not fileio.Function.is_file(path_to_corpus):
    # Not available locally: fetch it into the Keras cache.
    # text8, run "cat text8 | xargs -n 512 > text8_512" after download
    path_to_corpus = tf.keras.utils.get_file(
        fname=CORPUS_FILE,
        origin=CORPUS_URL,
        extract=True
    )
# Read the whole corpus into memory and show which file was used.
corpus = fileio.Function.read_file(path_to_corpus)
print(path_to_corpus)

/home/oonisim/.keras/datasets/text8_512


In [29]:
# Preview the corpus: print only its first line.
examples = corpus.split('\n')[:1]
print(*examples, sep='\n')

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic institutio

---
# Event (word) indexing
Index the events that have occurred in the event sequence.

In [30]:
%autoreload 2
from layer.preprocessing import (
    EventIndexing, 
)

In [31]:
# Build the event (word) index from the full corpus text.
# NOTE(review): the name says "ptb" even when CORPUS_FILE is text8 — confirm
# whether the name is only a label before renaming it.
word_indexing = EventIndexing(
    name="word_indexing_on_ptb",
    corpus=corpus
)
# The raw corpus string is not referenced again below; presumably deleted to free memory.
del corpus

---
# Word Embedding

Embedding trains the model to place similar events in close proximity in the event vector space. If two events, e.g. 'pencil' and 'pen', are similar concepts, then their event vectors reside close to each other in the event space.

* [Thought Vectors](https://wiki.pathmind.com/thought-vectors)



In [32]:
%autoreload 2
# Pick the Embedding implementation that matches the model selected by USE_CBOW.
if USE_CBOW:
    # A dotted module path must not carry the ".py" file suffix: Python would
    # look for a submodule named "py" and the import would fail.
    from layer.embedding_cbow_dual_vector_spaces import (
        Embedding
    )
else:
    from layer.embedding_sgram import (
        Embedding
    )

from optimizer import (
    SGD
)

In [33]:
# Instantiate the embedding layer with the sizes from the constants cell.
# The trained weights are loaded later via embedding.load(<state file>).
embedding: Embedding = Embedding(
    name="embedding",
    num_nodes=WINDOW_SIZE,  # TARGET_SIZE + CONTEXT_SIZE
    target_size=TARGET_SIZE,
    context_size=CONTEXT_SIZE,
    negative_sample_size=SAMPLE_SIZE,
    event_vector_size=VECTOR_SIZE,
    dictionary=word_indexing  # the event index built from the corpus
)

---
# Evaluate the vector space

Verify if the trained model, or the vector space W, has encoded the words in a way that **similar** words are close in the vector space.

* [How to measure the similarity among vectors](https://math.stackexchange.com/questions/4132458)

# Benchmark 

Use [gensim word2vec](https://radimrehurek.com/gensim/models/word2vec.html) as the benchmark.

In [34]:
from gensim.models import (
    Word2Vec
)
from gensim.models.word2vec import (
    LineSentence    
)

In [42]:
# Feed gensim from the same corpus file that was read above.
sentences = LineSentence(source=path_to_corpus)

In [43]:
# Train the gensim benchmark model on the corpus.
# NOTE(review): per the gensim documentation sg=0 selects CBOW, while the
# model evaluated in this notebook is skip-gram (USE_SGRAM) trained with
# context size 10 or 6 and 5 or 6 negative samples. window=3 / negative=3 therefore
# do not mirror those settings — confirm this is the intended benchmark.
w2v = Word2Vec(
    sentences=sentences, 
    sg=0,
    window=3, 
    negative=3,
    vector_size=100, 
    min_count=1, 
    workers=4
)
# The sentence source is only needed for training.
del sentences

### Input word

In [37]:
# Number of nearest words to request from each model.
n = 10
# Query word(s); split() turns a whitespace-separated string into a list of words.
context = "king".split()
# Indices of the query words in the event index.
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

## Text8 based trained model

Split text8 into lines where each line has N words (e.g. 512)
```
N=512
cat text8 | xargs -n $N > text8_$N
```

### TEXT8_C10_S5_WND_v100_LR20

```
USE_TEXT8 = True

CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb_train"
CORPUS_URL = "https://data.deepai.org/text8.zip" \
    if USE_TEXT8 else 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt'

TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)      # Size of the negative samples
VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.

WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}
LR = TYPE_FLOAT(20.0)

NUM_SENTENCES = 1
```

In [38]:
# Restore the text8 model saved with context size 10 and 5 negative samples,
# then report the sizes recorded in the loaded state.
state = embedding.load(STATE_FILE_TEXT8_C10_S5_WND_v100_LR20)

fmt = """Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
summary_keys = ("target_size", "context_size", "event_vector_size")
print(fmt % tuple(state[key] for key in summary_keys))

Model loaded.
event_size 1
context_size: 10
event_vector_size: 100



### TEXT8_C6_S6_WND_V100_LR20

In [39]:
# Restore the text8 model saved with context size 6 and 6 negative samples,
# then report the sizes recorded in the loaded state.
state = embedding.load(STATE_FILE_TEXT8_C6_S6_WND_V100_LR20)

fmt = """Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
summary_keys = ("target_size", "context_size", "event_vector_size")
print(fmt % tuple(state[key] for key in summary_keys))

Model loaded.
event_size 1
context_size: 6
event_vector_size: 100



### Comparison between text8-based model and gensim

In [44]:
# Ask the trained embedding for the n nearest words to the query, ranked two
# ways (cosine and distance), and print each ranking as "index:event".
cosines, distances = embedding.predict(word_indices, n)
rankings = (
    ("Text8 model predictions (cosine evaluation) for %s:", cosines),
    ("\nText8 model predictions (distance evaluation) for %s:", distances),
)
for header, indices in rankings:
    print(header % context)
    for word_index in indices:
        print(f"{word_index}:{word_indexing.list_events([word_index])}")

# Same query against the gensim benchmark; each item is a (word, similarity) pair.
print("\nGensim predictions for %s:" % context)
for word in w2v.wv.most_similar(context, topn=n):
    print(word)

Text8 model predictions (cosine evaluation) for ['king']:
8165:emperor
3371:president
5569:queen
2971:son
4567:iii
4148:battle
1351:john
5657:charles
6156:prince
1015:italian

Text8 model predictions (distance evaluation) for ['king']:
3371:president
8165:emperor
2971:son
1351:john
4148:battle
5657:charles
287:book
1286:david
1670:george
4906:england

Gensim predictions for ['king']:
('prince', 0.7660735249519348)
('queen', 0.7235230207443237)
('emperor', 0.7160524129867554)
('pope', 0.7091274857521057)
('throne', 0.7053828239440918)
('tsar', 0.6905214190483093)
('kings', 0.6828960180282593)
('sultan', 0.6712819337844849)
('lord', 0.6602793335914612)
('crown', 0.6600614190101624)


---
## PTB based training

### C10_S5_WND_V100_LR5

```
TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)       # Size of the negative samples

VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.
WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}

LR = TYPE_FLOAT(5)
NUM_SENTENCES = 10
```

In [None]:
# Restore the PTB model trained with learning rate 5 and report the sizes
# recorded in the loaded state.
# The constant defined in the constants cell is STATE_FILE_PTB_..., not
# STATE_PTB_FILE_...; the old spelling raised NameError.
state = embedding.load(STATE_FILE_PTB_C10_S5_WND_V100_LR5)

fmt="""Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
print(fmt % (
    state["target_size"], 
    state["context_size"], 
    state["event_vector_size"]
))

In [None]:
# Indices of the query words in the event index.
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

print(f"Words {context}")
print(f"Word indices {word_indices}")

# predict() returns a (cosines, distances) pair of index rankings, so unpack
# it and translate each index separately instead of passing the whole pair
# to list_events().
cosines, distances = embedding.predict(word_indices, n)
print("PTB model predictions (cosine evaluation) for %s:" % context)
for word_index in cosines:
    print(f"{word_index}:{word_indexing.list_events([word_index])}")

print("\nPTB model predictions (distance evaluation) for %s:" % context)
for word_index in distances:
    print(f"{word_index}:{word_indexing.list_events([word_index])}")

### C10_S5_WND_V100_LR20

```
TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
CONTEXT_SIZE = TYPE_INT(10)     # Size of the context in which the target event occurs.
WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
SAMPLE_SIZE = TYPE_INT(5)       # Size of the negative samples

VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.
WEIGHT_SCHEME = "normal"
WEIGHT_PARAMS = {
    "std": 0.01
}

LR = TYPE_FLOAT(20)
NUM_SENTENCES = 10
```

In [None]:
# Restore the PTB model trained with learning rate 20 and report the sizes
# recorded in the loaded state.
state = embedding.load(STATE_FILE_PTB_C10_S5_WND_V100_LR20)

fmt = """Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
summary_keys = ("target_size", "context_size", "event_vector_size")
print(fmt % tuple(state[key] for key in summary_keys))

In [None]:
# Indices of the query words in the event index.
word_indices = np.array(word_indexing.list_indices(context), dtype=TYPE_INT)

print(f"Words {context}")
print(f"Word indices {word_indices}")

# predict() returns a (cosines, distances) pair of index rankings, so unpack
# it and translate each index separately instead of passing the whole pair
# to list_events().
cosines, distances = embedding.predict(word_indices, n)
print("PTB model predictions (cosine evaluation) for %s:" % context)
for word_index in cosines:
    print(f"{word_index}:{word_indexing.list_events([word_index])}")

print("\nPTB model predictions (distance evaluation) for %s:" % context)
for word_index in distances:
    print(f"{word_index}:{word_indexing.list_events([word_index])}")