# word2vec implementation

## Overview
<img src="image/word2vec_cbow_mechanism.png" align="left"/>

---
# Setups

In [13]:
import sys
import os
import re
from itertools import islice
from typing import Dict, List
import numpy as np
import tensorflow as tf

# Print arrays in full (no "..." summarisation) and allow wide rows before wrapping.
np.set_printoptions(threshold=sys.maxsize, linewidth=400)

## Setup for Google Colab environment

In [2]:
try:
    import google.colab
    IN_GOOGLE_COLAB = True
except:
    IN_GOOGLE_COLAB = False
    
if IN_GOOGLE_COLAB:
    !pip install line_profiler
    !google.colab.drive.mount('/content/gdrive')
    !mkdir -p /content/drive/MyDrive/github
    !git clone https://github.com/oonisim/python-programs.git /content/drive/MyDrive/github
    %cd '/content/drive/MyDrive/github/nlp/src'
        
    import sys
    sys.path.append('/content/drive/MyDrive/github/nlp/src')

## Jupyter notebook setups

In [3]:
# Load the line_profiler extension (provides the %lprun line-by-line profiling magic).
%load_ext line_profiler
# Load the autoreload extension so that the %autoreload magic used in later cells is available.
%load_ext autoreload

## Utilities

In [4]:
# Mode 2: modules are re-imported automatically, so edits to the local .py files are picked up live.
%autoreload 2

def take(n, iterable):
    """Collect at most the first ``n`` elements of ``iterable`` into a list.

    Args:
        n: maximum number of elements to collect.
        iterable: any iterable; it is consumed only as far as needed.

    Returns:
        A list holding the leading elements. It is shorter than ``n`` when
        the iterable runs out first.
    """
    leading = islice(iterable, n)
    return [element for element in leading]

import function.fileio as fileio

# Constants

In [5]:
# Use the PTB training text as the corpus; when False the short built-in sample sentence is used instead.
USE_PTB = True
# Debug switch - not referenced in the visible cells; presumably consumed further down (TODO confirm).
DEBUG = False
# Validation switch - not referenced in the visible cells; presumably consumed further down (TODO confirm).
VALIDATION = True

---

# Data
## Corpus

In [7]:
# Small built-in corpus, used as-is when USE_PTB is False.
corpus = "To be, or not to be, that is the question that matters"
_file = "ptb.train.txt"
if USE_PTB:
    # Bind path_to_ptb unconditionally. Previously it was assigned only when the
    # download branch ran, so a re-run with the file already cached raised
    # NameError at read_file(). "~" is expanded explicitly rather than relying
    # on the helper to do it.
    path_to_ptb = os.path.expanduser(f"~/.keras/datasets/{_file}")
    if not fileio.Function.is_file(path_to_ptb):
        # Not cached yet: download, and use the path Keras actually saved to
        # (its cache directory may differ from the default location).
        path_to_ptb = tf.keras.utils.get_file(
            _file,
            f'https://raw.githubusercontent.com/tomsercu/lstm/master/data/{_file}'
        )
    corpus = fileio.Function.read_file(path_to_ptb)

In [8]:
# Preview the first five lines of the corpus, one per output line.
examples = corpus.split('\n')[:5]
print("\n".join(examples))

 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 
 pierre <unk> N years old will join the board as a nonexecutive director nov. N 
 mr. <unk> is chairman of <unk> n.v. the dutch publishing group 
 rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate 
 a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported 


---
# Word indexing

In [9]:
# Re-issue autoreload so that edits to the local layer.preprocessing module are picked up.
%autoreload 2
# WordIndexing is instantiated on the corpus below; EventContext is not used in the
# visible cells (presumably used further down - TODO confirm).
from layer.preprocessing import (
    WordIndexing, 
    EventContext
)

## WordIndexing instance for the corpus

Adapts to the `corpus` and provides:
* the word_to_index dictionary
* the vocabulary of the corpus
* the word occurrence probabilities

In [10]:
# Build the word index over the whole corpus loaded above.
word_indexing = WordIndexing(corpus=corpus, name="word_indexing_on_ptb")

In [11]:
# First ten vocabulary entries.
print(f"WordIndexing.vocabulary[10]:\n{word_indexing.vocabulary[:10]}\n")

# First ten (word, index) pairs of the word_to_index mapping.
print("WordIndexing.word_to_index[10]:")
leading_pairs = take(10, word_indexing.word_to_index.items())
for pair in leading_pairs:
    print(pair)

# Occurrence probabilities for the same words, leaving out the first entry
# ('<nil>' in the recorded output), so nine rows are printed.
print("\nWordIndexing.probabilities[10]:")
for word, index in take(10, word_indexing.word_to_index.items())[1:]:
    print(f"{word:15}: {word_indexing.probabilities[word]}")


WordIndexing.vocabulary[10]:
['<nil>' '<unk>' 'carol' 'substantial' 'deposit' 'ratios' 'shoppers' 'lotus' 'della' 'real']

WordIndexing.word_to_index[10]:
('<nil>', 0)
('<unk>', 1)
('carol', 2)
('substantial', 3)
('deposit', 4)
('ratios', 5)
('shoppers', 6)
('lotus', 7)
('della', 8)
('real', 9)

WordIndexing.probabilities[10]:
<unk>          : 0.05056069360915978
carol          : 1.3476861912703628e-05
substantial    : 8.872267425863223e-05
deposit        : 6.064587860716633e-05
ratios         : 8.984574608469087e-06
shoppers       : 1.0107646434527721e-05
lotus          : 2.358450834723135e-05
della          : 1.909222104299681e-05
real           : 0.0004548440895537475


## Sentence to Sequence

In [12]:
# Demonstration: an input containing no valid sentence is rejected with an
# AssertionError (see the recorded output). The error is caught and displayed
# so that this cell does not abort a "Restart & Run All".
try:
    word_indexing.function("\n")
except AssertionError as error:
    print(f"AssertionError: {error}")

Sentence is empty. Skipping...
Sentence is empty. Skipping...


AssertionError: No valid sentences in the input 
[
]


In [None]:
# Alternative input: take a line straight from the corpus instead of the literal below.
# sentences = "\n".join(corpus.split('\n')[5:6])
sentences = """
the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said
"""
# Convert the sentence(s) into sequences of word indices.
sequences = word_indexing.function(sentences)

# Show each word of the first sentence next to the index it was mapped to.
words = sentences.strip().split(" ")
for word, index in zip(words, sequences[0]):
    print(f"{word:15} : {index:5}")