# word2vec implementation

In [1]:
import sys
import os
import re
from typing import Dict, List
import numpy as np

# Print arrays in full (never summarised with "...") and allow wide rows
# so that matrices are not wrapped in the cell output.
np.set_printoptions(threshold=sys.maxsize, linewidth=400)

%load_ext line_profiler

# Overview
<img src="image/word2vec_cbow_mechanism.png" align="left"/>

# One Hot Encoding (OHE) as row selector
A one-hot vector, i.e. a row of the identity matrix I, acts as a row selector: multiplying it with a matrix extracts the row at the position of the ```1```, e.g. ```[0, 1, 0, 0, 0, 0, 0]``` extracts the 2nd row.

word2vec first converts a word into a one-hot-encoding vector, then uses it to extract a ```word vector``` from the ```word vector space (word embedding matrix)```. Suppose the word vector space is a ```(7, 4)``` matrix:

```
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]
 [24 25 26 27]]
 ```
 
 The OHE ```[0, 1, 0, 0, 0, 0, 0]``` extracts the 2nd row ```[ 4  5  6  7]``` via the dot (matrix product) operator ```@```.

In [2]:
# One-hot vector with the 1 at index 1, i.e. it selects the 2nd row.
c = np.array([0, 1, 0, 0, 0, 0, 0])
# Width of each row of the demo matrix.
n = 4
# Demo matrix with one row per one-hot position, filled with 0..(rows*n - 1).
W = np.arange(c.size * n).reshape(c.size, n)
print(c, W, sep="\n")

[0 1 0 0 0 0 0]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]
 [24 25 26 27]]


In [3]:
# Matrix product of the one-hot vector with W picks out the row of W at the 1 position.
c @ W

array([4, 5, 6, 7])

# Constants

In [4]:
from preprocessing import (
    DELIMITER,
    SPACE,
    NIL
)

# Number of words taken on each side of the target word.
STRIDE = 2
# Full window width: STRIDE words before, the target word, STRIDE words after.
CONTEXT_SIZE = 2 * STRIDE + 1

# When True the corpus is loaded via load_text('train') instead of the inline sample sentence.
USE_PTB = False

In [5]:
# NOTE(review): neither flag is referenced in the cells shown here; presumably
# they are read by later cells or imported helpers. Confirm before changing.
DEBUG: bool = False
VALIDATION: bool = True

# Text utilities

In [6]:
from preprocessing.text import (
    PAD_MODE_SQUEEZE,
    pad_text
)

# Corpus

In [7]:
# Default to a small inline sentence so the notebook runs without external data.
corpus = "To be, or not to be, that is the question that matters"
if USE_PTB:
    # NOTE(review): `load_text` is not imported in any cell above, so enabling
    # USE_PTB will raise NameError unless it is brought into scope. Confirm
    # which module provides it and add the import.
    corpus=load_text('train')

# Word indexing

In [8]:
from preprocessing.text import (
    text_to_sequence
)

In [9]:
# Lower-case the text, then replace every run of '.', ',', ':' or ';' with SPACE.
corpus = re.sub(pattern=r'[.,:;]+', repl=SPACE, string=corpus.lower())

In [10]:
# Index the corpus: word-id sequence, both lookup tables, and the vocabulary size.
sequence, word_to_id, id_to_word, vocabulary_size = text_to_sequence(corpus)

### Validation

In [11]:
# Sanity-check the word indexing. Only runs for the inline sample corpus
# (USE_PTB is False), printing the lookup tables, the corpus and its id sequence.
if not USE_PTB:
    print("word to id \n{}".format(word_to_id))
    print("id to word \n{}".format(id_to_word))
    print()
    print("corpus is \n[{}]".format(corpus))
    print("sequence is \n{}".format(sequence))
    print("corpus size is {} sequence size is {} expected sum is {}".format(
        # Raw string: '\s' inside a plain literal is an invalid escape sequence
        # and triggers a Deprecation/SyntaxWarning. `\s` already matches tabs,
        # so r'\s+' splits on exactly the same text as the old '[\t\s]+'.
        len(re.split(r'\s+', corpus)),
        len(sequence),
        # (number of words that have a full window) * (context words per window).
        (len(sequence) - (2*STRIDE)) * (2*STRIDE)  # Exclude NIL from the sequence
    ))
    # Map the ids back to words to confirm the sequence round-trips to the corpus.
    print(np.array([id_to_word[index] for index in sequence]))

word to id 
{'to': 0, 'be': 1, 'or': 2, 'not': 3, 'that': 4, 'is': 5, 'the': 6, 'question': 7, 'matters': 8}
id to word 
{0: 'to', 1: 'be', 2: 'or', 3: 'not', 4: 'that', 5: 'is', 6: 'the', 7: 'question', 8: 'matters'}

corpus is 
[to be  or not to be  that is the question that matters]
sequence is 
[0 1 2 3 0 1 4 5 6 7 4 8]
corpus size is 12 sequence size is 12 expected sum is 32
['to' 'be' 'or' 'not' 'to' 'be' 'that' 'is' 'the' 'question' 'that' 'matters']


# Create training data

In [12]:
from np.analytics import (
    create_context_set
)

In [13]:
# Build the training pairs from the word-id sequence using a CONTEXT_SIZE window.
# Per the output below, each row of `contexts` holds the ids of the STRIDE words
# on either side of a target word; `labels` holds the matching target ids.
contexts, labels = create_context_set(sequence, CONTEXT_SIZE)
contexts

array([[0, 1, 3, 0],
       [1, 2, 0, 1],
       [2, 3, 1, 4],
       [3, 0, 4, 5],
       [0, 1, 5, 6],
       [1, 4, 6, 7],
       [4, 5, 7, 4],
       [5, 6, 4, 8]])

### Validation

In [14]:
# Show every (context, label) pair as words so the windows can be checked
# against the corpus by eye. Only runs for the inline sample corpus.
if not USE_PTB:
    print(corpus)
    for context, label in zip(contexts, labels):
        # Render the context ids as a list of words, e.g. ['to', 'be', 'not', 'to'].
        context_words = str([id_to_word[index] for index in context])
        print("context {:40} label {:10}".format(context_words, id_to_word[label]))

to be  or not to be  that is the question that matters
context ['to', 'be', 'not', 'to']                label or        
context ['be', 'or', 'to', 'be']                 label not       
context ['or', 'not', 'be', 'that']              label to        
context ['not', 'to', 'that', 'is']              label be        
context ['to', 'be', 'is', 'the']                label that      
context ['be', 'that', 'the', 'question']        label is        
context ['that', 'is', 'question', 'that']       label the       
context ['is', 'the', 'that', 'matters']         label question  


# Convert to One Hot Encoding (OHE)

In [16]:
from preprocessing.encoding import (
    convert_one_hot
)

In [21]:
# Largest word id + 1 (ids start at 0), passed to convert_one_hot as the encoding length.
length = 1 + max(word_to_id.values())

# Encode the context ids and the label ids with the same length.
ohe_contexts = convert_one_hot(contexts, length)
ohe_labels = convert_one_hot(labels, length)