# word2vec implementation

## Overview
<img src="image/word2vec_cbow_mechanism.png" align="left"/>

---
# Setups

In [1]:
import sys
import os
import re
from itertools import islice
from typing import Dict, List
import numpy as np
import tensorflow as tf

np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(linewidth=400) 

## Setup for Google Colab environment

In [2]:
try:
    import google.colab
    IN_GOOGLE_COLAB = True
except:
    IN_GOOGLE_COLAB = False
    
if IN_GOOGLE_COLAB:
    !pip install line_profiler
    !google.colab.drive.mount('/content/gdrive')
    !rm -rf /content/github
    !mkdir -p /content/github
    !git clone https://github.com/oonisim/python-programs.git /content/github
        
    import sys
    sys.path.append('/content/github/nlp/src')

## Jupyter notebook setups

In [3]:
%load_ext line_profiler
%load_ext autoreload

## Utilites

In [4]:
%autoreload 2

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

import function.fileio as fileio

# Constants

In [5]:
USE_PTB = True
DEBUG = False
VALIDATION = True

---

# Data
## Corpus

In [6]:
corpus = "To be, or not to be, that is the question that matters"
_file = "ptb.train.txt"
if USE_PTB:
    if not fileio.Function.is_file(f"~/.keras/datasets/{_file}"):
        path_to_ptb = tf.keras.utils.get_file(
            _file, 
            f'https://raw.githubusercontent.com/tomsercu/lstm/master/data/{_file}'
        )
    corpus = fileio.Function.read_file(path_to_ptb)

In [7]:
examples = corpus.split('\n')[:5]
for line in examples:
    print(line)

 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 
 pierre <unk> N years old will join the board as a nonexecutive director nov. N 
 mr. <unk> is chairman of <unk> n.v. the dutch publishing group 
 rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate 
 a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported 


---
# Event (word) indexing
Index the events that have occurred in the event sequence.

In [8]:
%autoreload 2
from layer.preprocessing import (
    EventIndexing, 
    EventContext
)

In [9]:
word_indexing = EventIndexing(
    name="word_indexing_on_ptb",
    corpus=corpus
)

## EventIndexing  for the corpus

Adapt to the ```corpus``` and provides:
* event_to_index dictionary
* vocaburary of the corpus
* word occurrence probabilites

In [10]:
words = word_indexing.list_events(range(10))
print(f"EventIndexing.vocabulary[10]:\n{words}\n")

indices = word_indexing.list_indices(words)
print(f"EventIndexing.event_to_index[10]:")
for item in zip(words, indices):
    print(item)

probabilities = word_indexing.list_probabilities(words)
print(f"\nEventIndexing.probabilities[10]:")
for word, p in zip(words, probabilities):
    print(f"{word:20s} : {p:.5e}")

EventIndexing.vocabulary[10]:
['<nil>' '<unk>' 'aer' 'banknote' 'berlitz' 'calloway' 'centrust' 'cluett' 'fromstein' 'gitano']

EventIndexing.event_to_index[10]:
('<nil>', 0)
('<unk>', 1)
('aer', 2)
('banknote', 3)
('berlitz', 4)
('calloway', 5)
('centrust', 6)
('cluett', 7)
('fromstein', 8)
('gitano', 9)

EventIndexing.probabilities[10]:
<nil>                : 0.00000e+00
<unk>                : 1.65308e-02
aer                  : 5.34860e-06
banknote             : 5.34860e-06
berlitz              : 5.34860e-06
calloway             : 5.34860e-06
centrust             : 5.34860e-06
cluett               : 5.34860e-06
fromstein            : 5.34860e-06
gitano               : 5.34860e-06


## Sampling using the probability

Sample events according to their probabilities.

In [11]:
sample = word_indexing.sample(size=5)
print(sample)

['the', 'united', 'boom', 'in', 'always']


## Negative Sampling
Sample events not including those events already sampled.

In [12]:
negative_indices = word_indexing.negative_sample_indices(
    size=5, excludes=word_indexing.list_indices(sample)
)
print(f"negative_indices={negative_indices} \nevents={word_indexing.list_events(negative_indices)}")


negative_indices=[1088, 257, 738, 231, 1608] 
events=['financing' 'such' 'plans' 'by' 'offering']


## Sentence to Sequence

In [13]:
# sentences = "\n".join(corpus.split('\n')[5:6])
sentences = """
the asbestos fiber <unk> is unusually <unk> once it enters the <unk> 
with even brief exposures to it causing symptoms that show up decades later researchers said
"""
sequences = word_indexing.function(sentences)
for pair in zip(sentences.strip().split(" "), sequences[0]):
    print(f"{pair[0]:15} : {pair[1]:5}")

Sentence is empty. Skipping...
Sentence is empty. Skipping...


the             :    34
asbestos        :    63
fiber           :    86
<unk>           :     1
is              :    42
unusually       :    87
<unk>           :     1
once            :    64
it              :    80
enters          :    88
the             :    34
<unk>           :     1

with           :     0
even            :     0
brief           :     0


---
# Context of a word in a sentence

In the sentence ```"a form of asbestos once used to make kent cigarette filters"```, one of the context windows ```a form of asbestos once``` of size 5 and event size 1 has.
* ```of``` as a target word.
* ```(a, form) and (asbestos, once)``` as its context.

### Sequence of the word indices for the sentence

In [14]:
sentences = """
a form of asbestos once used to make kent cigarette filters

N years old and former chairman of consolidated gold fields plc was named a nonexecutive director
"""

sequence = word_indexing.function(sentences)
sequence

Sentence is empty. Skipping...
Sentence is empty. Skipping...
Sentence is empty. Skipping...


array([[37, 62, 44, 63, 64, 65, 66, 67, 68, 69, 70,  0,  0,  0,  0,  0],
       [29, 30, 31, 50, 51, 43, 44, 52, 53, 54, 55, 56, 57, 37, 38, 39]])

## Target, context pairs

For each word in the setence ```(of, asbestos, ... , kent)``` excludnig the ends of the sentence, create ```(target, context)``` as:

```
[
  [of, a, form, asbestos, once],              # target is 'of', context is (a, form, asbestos, once)
  ['asbestos', 'form', 'of', 'once', 'used'],
  ['once', 'of', 'asbestos', 'used', 'to'],
  ...
]
```

In [15]:
event_context = EventContext(
    name="ev",
    window_size=5,
    event_size=1    
)

In [16]:
event_context_pairs = event_context.function(sequence)
event_context_pairs

array([[[44, 37, 62, 63, 64],
        [63, 62, 44, 64, 65],
        [64, 44, 63, 65, 66],
        [65, 63, 64, 66, 67],
        [66, 64, 65, 67, 68],
        [67, 65, 66, 68, 69],
        [68, 66, 67, 69, 70],
        [69, 67, 68, 70,  0],
        [70, 68, 69,  0,  0],
        [ 0, 69, 70,  0,  0],
        [ 0, 70,  0,  0,  0],
        [ 0,  0,  0,  0,  0]],

       [[31, 29, 30, 50, 51],
        [50, 30, 31, 51, 43],
        [51, 31, 50, 43, 44],
        [43, 50, 51, 44, 52],
        [44, 51, 43, 52, 53],
        [52, 43, 44, 53, 54],
        [53, 44, 52, 54, 55],
        [54, 52, 53, 55, 56],
        [55, 53, 54, 56, 57],
        [56, 54, 55, 57, 37],
        [57, 55, 56, 37, 38],
        [37, 56, 57, 38, 39]]], dtype=int32)

### Target, context pairs in textual words

In [17]:
word_indexing.sequence_to_sentence(event_context_pairs)

[[['of', 'a', 'form', 'asbestos', 'once'],
  ['asbestos', 'form', 'of', 'once', 'used'],
  ['once', 'of', 'asbestos', 'used', 'to'],
  ['used', 'asbestos', 'once', 'to', 'make'],
  ['to', 'once', 'used', 'make', 'kent'],
  ['make', 'used', 'to', 'kent', 'cigarette'],
  ['kent', 'to', 'make', 'cigarette', 'filters'],
  ['cigarette', 'make', 'kent', 'filters', '<nil>'],
  ['filters', 'kent', 'cigarette', '<nil>', '<nil>'],
  ['<nil>', 'cigarette', 'filters', '<nil>', '<nil>'],
  ['<nil>', 'filters', '<nil>', '<nil>', '<nil>'],
  ['<nil>', '<nil>', '<nil>', '<nil>', '<nil>']],
 [['old', 'n', 'years', 'and', 'former'],
  ['and', 'years', 'old', 'former', 'chairman'],
  ['former', 'old', 'and', 'chairman', 'of'],
  ['chairman', 'and', 'former', 'of', 'consolidated'],
  ['of', 'former', 'chairman', 'consolidated', 'gold'],
  ['consolidated', 'chairman', 'of', 'gold', 'fields'],
  ['gold', 'of', 'consolidated', 'fields', 'plc'],
  ['fields', 'consolidated', 'gold', 'plc', 'was'],
  ['plc', 'g