# word2vec implementation

In [1]:
import sys
import os
import re
from typing import Dict, List
import numpy as np
import tensorflow as tf

# Show arrays in full (never summarised with "...") and allow rows of up to
# 400 characters before numpy wraps them.
np.set_printoptions(threshold=sys.maxsize, linewidth=400)

%load_ext line_profiler
%load_ext autoreload

In [32]:
from itertools import islice

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list.

    Args:
        n: maximum number of items to consume from the iterable.
        iterable: any iterable; only its leading ``n`` items are consumed.

    Returns:
        A list holding the first ``n`` items (fewer if the iterable is shorter).
    """
    head = islice(iterable, n)
    return [*head]

# Overview
<img src="image/word2vec_cbow_mechanism.png" align="left"/>

# Constants

In [4]:
# True: the Corpus cell loads the PTB training text; False: the one-sentence sample corpus is kept.
USE_PTB = True
# NOTE(review): not referenced in the cells visible here — presumably a debug switch; confirm.
DEBUG = False
# NOTE(review): not referenced in the cells visible here — confirm what it enables.
VALIDATION = True

# Text utilities

In [5]:
%autoreload 2
import function.fileio as fileio

# Corpus

In [6]:
# Start from a one-sentence sample corpus; it is replaced by the PTB training
# text when USE_PTB is set.
corpus = "To be, or not to be, that is the question that matters"
_file = "ptb.train.txt"
if USE_PTB:
    # Always resolve the path through get_file(). Previously path_to_ptb was
    # only bound when the "file already exists" check failed, so a fresh kernel
    # with the file already on disk hit a NameError on the read below.
    # get_file() returns the path of the cached copy (by default under
    # ~/.keras/datasets) and only downloads when the file is not there yet.
    path_to_ptb = tf.keras.utils.get_file(
        _file,
        f'https://raw.githubusercontent.com/tomsercu/lstm/master/data/{_file}'
    )
    corpus = fileio.Function.read_file(path_to_ptb)

# Word indexing

In [7]:
%autoreload 2
from layer.preprocessing import (
    WordIndexing, 
    EventContext
)

## WordIndexing instance for the corpus

Adapts to the `corpus` and provides:
* word_to_index dictionary
* vocabulary of the corpus
* word occurrence probabilities

In [8]:
# Build the word index from the corpus loaded in the Corpus cell. The following
# cells read its vocabulary, word_to_index and probabilities attributes.
word_indexing = WordIndexing(
    name="word_indexing_on_ptb",
    corpus=corpus
)

In [47]:
# Preview the first entries of the structures held by word_indexing.
print(f"WordIndexing.vocabulary[10]:\n{word_indexing.vocabulary[:10]}\n")

print("WordIndexing.word_to_index[10]:")
head_items = take(10, word_indexing.word_to_index.items())
for item in head_items:
    print(item)

# The first (word, index) pair is skipped below, so at most 9 rows are shown.
print("\nWordIndexing.probabilities[10]:")
head_items = take(10, word_indexing.word_to_index.items())
for item in head_items[1:]:
    token = item[0]
    print(f"{token:15}: {word_indexing.probabilities[token]}")


WordIndexing.vocabulary[10]:
['<nil>' '<unk>' 'searched' 'studies' 'thief' 'casino' 'types' 'maturities' 'breeding' 'greece']

WordIndexing.word_to_index[10]:
('<nil>', 0)
('<unk>', 1)
('searched', 2)
('studies', 3)
('thief', 4)
('casino', 5)
('types', 6)
('maturities', 7)
('breeding', 8)
('greece', 9)

WordIndexing.probabilities[10]:
<unk>          : 0.05056069360915978
searched       : 6.738430956351814e-06
studies        : 5.6153591302931785e-05
thief          : 6.738430956351814e-06
casino         : 3.4815226607817707e-05
types          : 4.8292088520521335e-05
maturities     : 2.0215292869055442e-05
breeding       : 1.0107646434527721e-05
greece         : 1.2353790086644994e-05


## Sentence to Sequence

In [70]:
# Run a single corpus line (line index 4) through word_indexing.function and
# print each space-separated word next to the matching entry of sequences[0].
sentences = "\n".join(corpus.split('\n')[4:5])
sequences = word_indexing.function(sentences)

words = sentences.strip().split(" ")
for pair in zip(words, sequences[0]):
    word, index = pair
    print(f"{word:15} : {index:5}")


a               :  8247
form            :  7193
of              :  3321
asbestos        :  4493
once            :  1135
used            :  8523
to              :  8915
make            :  7610
kent            :  4155
cigarette       :  5657
filters         :  8219
has             :  2619
caused          :  4180
a               :  8247
high            :  2959
percentage      :   489
of              :  3321
cancer          :  4578
deaths          :  1111
among           :  7426
a               :  8247
group           :  8534
of              :  3321
workers         :  1104
exposed         :  7914
to              :  8915
it              :  5477
more            :  3399
than            :  4280
N               :  3576
years           :  7598
ago             :  7851
researchers     :   393
reported        :  9189
