In [1]:
import tensorflow as tf
import numpy as np

In [2]:
import logging
from os.path import join
from glob import glob
from datetime import datetime
from tools.words_vectors import find_closest
from tools.corpus_manipulator import get_curpos_sentences, get_curpos_words

# Configure the root logger once for the whole notebook: INFO level and a
# "<timestamp> <level> <message>" record layout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
# Logger named after the running module, for use by the cells below.
logger = logging.getLogger(__name__)

In [3]:
# Number of neighbouring words taken on each side of the centre word when
# get_training_skip_gram builds its (word, context word) pairs.
WINDOW_SIZE = 2

In [4]:
# Candidate corpus file lists. glob() returns an empty list when nothing
# matches, so a missing corpus directory produces no paths rather than an error.

# Every *.txt file in the shuffled, tokenized monolingual corpus directory.
TRAINING_SET_FULL_PATHS = glob(join('corpus', 'training-monolingual.tokenized.shuffled', '*.txt'))
# Only the files in that directory whose name starts with 'news.en-00001'.
TRAINING_SET_MIN_PATHS = glob(join('corpus', 'training-monolingual.tokenized.shuffled',
                                   'news.en-00001*'))
# A single file, corpus/training_sample.txt — presumably a small sample for
# quick sanity checks (inferred from the name; confirm).
TRAINING_SET_SANITY_TEST_PATHS = glob(join('corpus', 'training_sample.txt'))
# The list actually consumed by the rest of the notebook; point it at one of
# the lists above to change which files are read.
TRAINING_SET_PATHS = TRAINING_SET_MIN_PATHS

In [5]:
# Words of the selected corpus files, used as the model vocabulary.
# NOTE(review): get_curpos_words is a project helper not shown here; later
# cells call len() and enumerate() on the result, so it is presumably a sized
# collection of distinct words — confirm against tools.corpus_manipulator.
curpos_words = get_curpos_words(TRAINING_SET_PATHS)

In [6]:
# Sentences of the selected corpus files.
# NOTE(review): get_training_skip_gram enumerates and slices each sentence, so
# each one is presumably a sequence of word tokens — confirm against
# tools.corpus_manipulator.
sentences = get_curpos_sentences(TRAINING_SET_PATHS)

In [7]:
# Number of vocabulary entries; sets the width of the one-hot input and output
# layers of the network defined below.
vocab_size = len(curpos_words)

In [8]:
def get_training_skip_gram(sentences, window_size=None):
    """Build skip-gram ``[centre word, context word]`` training pairs.

    For every word of every sentence, each word lying at most ``window_size``
    positions to its left or right is paired with it.

    Args:
        sentences: iterable of sentences, each an indexable sequence of words.
        window_size: number of context positions taken on each side of the
            centre word. ``None`` (the default) uses the module-level
            ``WINDOW_SIZE``, which is what the function always used before
            this parameter existed.

    Returns:
        A list of two-element lists ``[word, context_word]`` in corpus order.

    Note:
        Context words are filtered by value, not by position: any word in the
        window that equals the centre word is dropped, so a repeated word
        never yields a ``[word, word]`` pair.
    """
    if window_size is None:
        window_size = WINDOW_SIZE

    data = []
    for sentence in sentences:
        last_index = len(sentence) - 1
        for word_index, word in enumerate(sentence):
            window_start_index = max(word_index - window_size, 0)
            # Slice ends are exclusive, hence the +1 to keep the last context
            # word; the end is clamped so it never runs past the sentence.
            window_end_index = min(word_index + window_size, last_index) + 1
            data.extend(
                [word, window_word]
                for window_word in sentence[window_start_index:window_end_index]
                if window_word != word
            )
    return data

In [9]:
# Every [word, context word] pair produced from the corpus sentences; this is
# the raw (still textual) training set.
train_pairs = get_training_skip_gram(sentences)

In [10]:
def train_pairs_to_one_hot_vectors(train_pairs, word_to_one_hot_vector):
    """Encode word pairs as two parallel lists of vectors.

    Args:
        train_pairs: sequence of pairs whose element 0 is the input word and
            element 1 the expected output word.
        word_to_one_hot_vector: mapping from a word to its encoded vector.

    Returns:
        ``(inputs, outputs)``: two lists of the same length as ``train_pairs``
        holding the encoding of each pair's first and second word.

    Raises:
        KeyError: if a word of a pair is missing from the mapping.
    """
    encoded_inputs = []
    for pair in train_pairs:
        encoded_inputs.append(word_to_one_hot_vector[pair[0]])

    encoded_outputs = []
    for pair in train_pairs:
        encoded_outputs.append(word_to_one_hot_vector[pair[1]])

    return encoded_inputs, encoded_outputs


def to_one_hot(data_point_index, vector_size):
    """Return a one-hot NumPy vector.

    Args:
        data_point_index: position that receives the value 1.
        vector_size: length of the vector.

    Returns:
        A float array of zeros of length ``vector_size`` with a single 1 at
        ``data_point_index``.
    """
    one_hot = np.zeros(vector_size)
    one_hot[data_point_index] = 1
    return one_hot


def map_words_to_one_hot_vectors(words):
    """Assign each word the one-hot vector of its position in ``words``.

    Args:
        words: sized iterable of words; a word's enumeration index becomes the
            position of the 1 in its vector.

    Returns:
        A dict mapping every word to a vector of length ``len(words)`` built
        by ``to_one_hot``. If a word occurs more than once, its last position
        wins.
    """
    encoding_by_word = {}
    for position, token in enumerate(words):
        encoding_by_word[token] = to_one_hot(position, len(words))
    return encoding_by_word

In [11]:
# Lookup table word -> dense one-hot vector. It holds one vector of length
# len(curpos_words) per word, so memory grows with the square of the
# vocabulary size.
word_to_one_hot_vector = map_words_to_one_hot_vectors(curpos_words)

In [12]:
# Encode the textual training pairs: inputs[k] is the one-hot vector of the
# k-th pair's first word, outputs[k] that of its second (context) word.
inputs, outputs = train_pairs_to_one_hot_vectors(train_pairs, word_to_one_hot_vector)

In [13]:
# Graph inputs (TensorFlow 1.x placeholders). Each has vocab_size columns; the
# leading None leaves the number of rows (the batch size) open.
# x receives the one-hot encoded input words.
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
# y_label receives the one-hot encoded expected context words.
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [14]:
# Width of the hidden layer, i.e. the dimensionality of the learned word vectors.
EMBEDDING_DIM = 5
# Input-to-hidden weights, drawn from a normal distribution. For a one-hot
# input row, x @ W1 picks out the W1 row of the active word.
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))  # bias
# Purely linear hidden layer (no activation function): x @ W1 + b1.
hidden_representation = tf.add(tf.matmul(x,W1), b1)

In [15]:
# Hidden-to-output weights and bias, drawn from a normal distribution.
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
# Output layer: softmax over the vocabulary of hidden_representation @ W2 + b2,
# giving one probability per vocabulary word for each input row.
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2), b2))

In [16]:
# Open a TF1 session and run the initializer for the variables that exist in
# the graph at this point (W1, b1, W2, b2).
# NOTE(review): the initializer is built and run before the optimizer is
# created further down. An optimizer that creates variables of its own would
# need this initialisation to happen after it is built — confirm if the
# optimizer is ever changed.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [17]:
# Mean categorical cross-entropy between the one-hot labels and the softmax
# output: -sum_j y_j * log(p_j) per row, averaged over the batch.
# The probabilities are clipped away from 0 before the log. Softmax can
# underflow to exactly 0 in float32; log(0) is -inf, and -inf multiplied by a
# zero label is NaN, which would turn the whole loss (and every later gradient
# step) into NaN.
cross_entropy_loss = tf.reduce_mean(
    -tf.reduce_sum(y_label * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)), axis=1))

In [18]:
# One plain gradient-descent update (learning rate 0.1) that minimises
# cross_entropy_loss with respect to the trainable variables.
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

In [19]:
# Number of gradient steps run by the training loop; every step is fed the
# complete training set.
n_iters = 10000

In [None]:
# Convert the Python lists of one-hot vectors to float32 arrays once. Feeding
# the raw lists would make TensorFlow redo this conversion on every sess.run
# call, i.e. twice per iteration.
train_feed = {
    x: np.asarray(inputs, dtype=np.float32),
    y_label: np.asarray(outputs, dtype=np.float32),
}

# Full-batch training: each iteration applies one gradient step on the whole
# training set, then reports the loss measured after that step.
for i in range(n_iters):
    sess.run(train_step, feed_dict=train_feed)
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict=train_feed))

In [None]:
# Read the learned word vectors out of the graph: row i is W1[i] + b1 (b1 is
# broadcast over the rows), which equals the hidden representation of the
# one-hot input whose 1 is at position i.
vectors = sess.run(W1 + b1)


def euclidean_dist(vec1, vec2):
    """Return the Euclidean (L2) distance between two NumPy vectors.

    Args:
        vec1: first vector.
        vec2: second vector, subtracted element-wise from ``vec1``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    difference = vec1 - vec2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)


# NOTE(review): this definition shadows the find_closest imported from
# tools.words_vectors at the top of the notebook; once this cell has run, the
# name refers to the function below.
def find_closest(word_index, vectors):
    """Return the index of the vector nearest to ``vectors[word_index]``.

    Distances are Euclidean. The query row itself is excluded by position, so
    another word with exactly the same vector is a valid nearest neighbour
    (at distance 0).

    Args:
        word_index: row index of the query vector.
        vectors: 2-D array-like with one vector per row.

    Returns:
        The row index (a Python ``int``) of the closest other vector; ties go
        to the lowest index. Returns -1 when there is no other vector at a
        finite distance, e.g. when ``vectors`` has a single row.

    Raises:
        IndexError: if ``word_index`` is out of range.
    """
    vectors = np.asarray(vectors, dtype=float)
    # One row of differences per candidate; anything past the first axis is
    # flattened so each row is summed as a single vector.
    offsets = (vectors - vectors[word_index]).reshape(len(vectors), -1)
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))

    # A true infinity replaces the old "10000" sentinel, so arbitrarily
    # distant neighbours are still found. The query must never match itself,
    # and NaN distances cannot be ranked.
    distances[word_index] = np.inf
    distances[np.isnan(distances)] = np.inf

    closest_index = int(np.argmin(distances))
    if np.isinf(distances[closest_index]):
        return -1
    return closest_index


from sklearn import preprocessing
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt

# Project the learned embeddings to 2-D for plotting; the fixed random_state
# keeps the layout reproducible between runs.
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

# Scale every 2-D point to unit L2 norm. The norm is a constructor argument;
# the second positional argument of fit_transform is the (ignored) target `y`,
# so passing 'l2' there never selected the norm.
normalizer = preprocessing.Normalizer(norm='l2')
vectors = normalizer.fit_transform(vectors)

print(vectors)

# Row i of `vectors` belongs to the i-th word of curpos_words, because the
# one-hot encoding was built by enumerating curpos_words. Build the word list
# and the word -> row lookup here so this cell runs on a fresh kernel.
words = list(curpos_words)
word2int = {word: index for index, word in enumerate(words)}

fig, ax = plt.subplots()
print(words)
for word in words:
    x_coord, y_coord = vectors[word2int[word]]
    print(word, y_coord)
    ax.annotate(word, (x_coord, y_coord))

# Annotations are not taken into account by axis autoscaling, so without
# explicit limits the view stays at the default [0, 1] range and most labels
# are drawn outside it.
margin = 0.1
ax.set_xlim(vectors[:, 0].min() - margin, vectors[:, 0].max() + margin)
ax.set_ylim(vectors[:, 1].min() - margin, vectors[:, 1].max() + margin)
plt.show()