## Tensorflow Word Embeddings From Scratch
- Using CBOW model
- References:
    - https://gist.github.com/yxtay/a94d971955d901c4690129580a4eafb9
    - https://github.com/huseinzol05/Text-Classification-Comparison/blob/master/preparation/word-vector.ipynb
    - The difference with husein model is that he trains Skip-gram and I train a CBOW model. 

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder
import re
import collections
import random
import time
import os
import helpers.pickle_helpers as ph
import math
os.environ['CUDA_VISIBLE_DEVICES'] = ''

### Generating raw vocabulary

In [2]:
train_data = ph.load_from_pickle(directory="data/husein_emotion/emotion-english/merged_training.pkl")

In [3]:
def clearstring(string):
    string = re.sub('[^\'\"A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = [y for y in string if len(y) > 3 and y.find('nbsp') < 0]
    return ' '.join(string)

def read_data():
    e_dir = 'data/husein_emotion/emotion-english/data/'
    list_folder = os.listdir(e_dir)
    label = list_folder
    label.sort()
    outer_string, outer_label = [], []
    for i in range(len(list_folder)):
        list_file = os.listdir(e_dir + list_folder[i])
        strings = []
        for x in range(len(list_file)):
            with open(e_dir + list_folder[i] + '/' + list_file[x], 'r') as fopen:
                strings += fopen.read().split('\n')
        strings = list(filter(None, strings))
        for k in range(len(strings)):
            strings[k] = clearstring(strings[k])
        labels = [i] * len(strings)
        outer_string += strings
        outer_label += labels
    
    dataset = np.array([outer_string, outer_label])
    dataset = dataset.T
    np.random.shuffle(dataset)
    
    string = []
    for i in range(dataset.shape[0]):
        string += dataset[i][0].split()
    
    return string, dataset, label

def read_data_with_pandas():
    """ I already converted the data into pandas to we can avoid the function above"""
    vocab = []
    text = train_data.text.values.tolist()
    for t in text:
        strings = clearstring(t)
        vocab+=strings.split()
    return vocab

In [4]:
#vocabulary, dataset, label = read_data()
vocabulary = read_data_with_pandas()

In [5]:
print("example 10 words:", vocabulary[:10])
print('size corpus:',len(vocabulary))
vocabulary_size = len(list(set(vocabulary)))
print('size of unique words:',vocabulary_size)

example 10 words: ['feel', 'awful', 'about', 'because', 'position', 'succeed', 'just', 'didn', 'happen', 'here']
size corpus: 4433712
size of unique words: 71554


### Build Dataset

In [6]:
def build_dataset(words, n_words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) # increase index as words added
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

In [7]:
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)

In [8]:
del vocabulary # reduces memory

In [9]:
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:15], [reverse_dictionary[i] for i in data[:15]])

Most common words (+UNK) [['UNK', 1], ('feel', 289939), ('feeling', 134185), ('that', 130733), ('like', 73972)]
Sample data [1, 370, 8, 11, 1016, 2713, 9, 176, 434, 72, 140, 1, 370, 228, 1193] ['feel', 'awful', 'about', 'because', 'position', 'succeed', 'just', 'didn', 'happen', 'here', 'alone', 'feel', 'awful', 'probably', 'mentioned']


In [10]:
len(data)

4433712

### Generating Batches for the CBOW model

In [11]:
data_index = 0

def generate_batch(batch_size, context_window):
    # all context tokens should be used, hence no associated num_skips argument
    global data_index
    context_size = 2 * context_window
    batch = np.ndarray(shape=(batch_size, context_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * context_window + 1  # [ context_window target context_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size):
        # context tokens are just all the tokens in buffer except the target
        batch[i, :] = [token for idx, token in enumerate(buffer) if idx != context_window]
        labels[i, 0] = buffer[context_window]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    data_index-=1
    return batch, labels

In [12]:
batch, labels = generate_batch(batch_size=8, context_window=1)

In [13]:
for i in range(8):
    print(batch[i, 0], reverse_dictionary[batch[i, 0]],
          batch[i, 1], reverse_dictionary[batch[i, 1]],
          '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

1 feel 8 about -> 370 awful
370 awful 11 because -> 8 about
8 about 1016 position -> 11 because
11 because 2713 succeed -> 1016 position
1016 position 9 just -> 2713 succeed
2713 succeed 176 didn -> 9 just
9 just 434 happen -> 176 didn
176 didn 72 here -> 434 happen


In [14]:
list(data[:30])

[1,
 370,
 8,
 11,
 1016,
 2713,
 9,
 176,
 434,
 72,
 140,
 1,
 370,
 228,
 1193,
 6,
 99,
 13,
 1,
 297,
 17,
 105,
 967,
 7,
 144,
 4320,
 4800,
 3190,
 1340,
 2]

In [15]:
batch, labels, print(data_index)

10


(array([[   1,    8],
        [ 370,   11],
        [   8, 1016],
        [  11, 2713],
        [1016,    9],
        [2713,  176],
        [   9,  434],
        [ 176,   72]], dtype=int32), array([[ 370],
        [   8],
        [  11],
        [1016],
        [2713],
        [   9],
        [ 176],
        [ 434]], dtype=int32), None)

### Model

In [16]:
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
context_window = 1  # How many words to consider left and right.
context_size = 2 * context_window

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64 

data_index = 0 # reset index

In [17]:
valid_examples

array([33, 42, 61, 31, 11, 27, 98, 50, 48, 83, 76, 26,  0,  2,  7, 38])

In [18]:
graph = tf.Graph()

with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size, context_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # take mean of embeddings of context words for context embedding
        embed_context = tf.reduce_mean(embed, 1)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / np.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    loss = tf.reduce_mean(
        tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed_context,
                       num_sampled, vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()

### Training

In [19]:
num_steps = 100001
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, context_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

Initialized
Average loss at step  0 :  299.08343505859375
Nearest to much: chel, minion, rian, niceee, harmonizes, thou, gillan, interrupts,
Nearest to could: ceviche, reseach, lifestyles, references, rasputins, latimers, befuddles, sealer,
Nearest to work: munich, intake, sweetie, gwens, tolong, synch, agitated, into,
Nearest to some: choice, hashtag, shropshire, atoshealthcare, refferred, unimportance, succeeded, sabar,
Nearest to because: noche, afte, wwii, boarder, timeisaidsomething, metronomy, maniacofmagic, miiiiight,
Nearest to want: hoie, prostration, carseat, nagalene, vinage, harrumping, vistas, frightened,
Nearest to person: possible, quetions, christasthostoraoraret, underprivledged, memories, splayed, ashers, dreamcatcher,
Nearest to their: beind, thumped, commodore, daven, libers, jiggle, yowling, meuan,
Nearest to always: lovechild, yoma, qualitative, battlefield, cheesy, palet, seris, emitting,
Nearest to through: raynaud, baskets, grasps, subcontracted, ablowing, lill

In [26]:
final_embeddings

array([[ 0.03698784,  0.08871643, -0.0046615 , ..., -0.14359537,
        -0.09679344,  0.03453588],
       [ 0.00217088, -0.00746337, -0.02105754, ...,  0.04860028,
        -0.05745085, -0.11272517],
       [-0.01719761, -0.01997992, -0.04020466, ..., -0.11894673,
        -0.08256418, -0.04724454],
       ...,
       [-0.10864564,  0.14505424, -0.04585417, ...,  0.09261709,
         0.11069414,  0.10313071],
       [ 0.07042108,  0.13556236, -0.0210914 , ...,  0.08755381,
         0.08562793,  0.12323809],
       [ 0.15020965,  0.06738573, -0.09012128, ...,  0.1060191 ,
        -0.02045754, -0.13858506]], dtype=float32)

In [23]:
ph.convert_to_pickle(directory="data/husein_emotion/tf_embeddings/tf_cbow_embeddings.p", item=final_embeddings)
ph.convert_to_pickle(directory="data/husein_emotion/tf_embeddings/tf_cbow_dictionary.p", item=dictionary)