In [106]:
import os
import numpy as np
import tensorflow as tf
import nltk
import collections
import math

### Reading data

In [107]:
topics = ['business','entertainment','politics','sport','tech']

def read_data(topics):
    
    num_of_docs_to_read = 100
    doc_words = []
    
    for topic in topics:
        for doc in range(1,num_of_docs_to_read + 1):

            with open(os.path.join('bbc', topic, format(doc, '03d') + '.txt')) as f:
                document = f.read()
                file = document.lower()
                file = nltk.word_tokenize(file)
                doc_words.extend(file)

    return doc_words

def read_test_data(topics):
    
    num_of_docs_to_read = 100
    test_doc_words_dict = {}
    
    for topic in topics:
        for doc in np.random.randint(1,num_of_docs_to_read,(10)).tolist():

            with open(os.path.join('bbc', topic, format(doc, '03d') + '.txt')) as f:
                document = f.read()
                file = document.lower()
                file = nltk.word_tokenize(file)
                test_doc_words_dict[topic + '-' + str(doc)] = file
                
    return test_doc_words_dict

doc_words = read_data(topics)
test_doc_words_dict = read_test_data(topics)

print('no. of words:', len(doc_words))
print('doc words:', doc_words[:10])


no. of words: 202528
doc words: ['ad', 'sales', 'boost', 'time', 'warner', 'profit', 'quarterly', 'profits', 'at', 'us']


### Create dictionary

In [108]:
vocab_size = 15000

def create_dict(doc_words):
    global vocab_size
    count = [['UNK', -1]]
    count.extend(collections.Counter(doc_words).most_common(vocab_size - 1))

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return dictionary, rev_dictionary, count

dictionary, rev_dictionary, count = create_dict(doc_words)

print('dictionary', list(dictionary)[:10])
print('reverse dictionary', list(rev_dictionary)[:10])
print('most common words:', count[0:5])

dictionary ['deadwood', 'barley', 'gorgeous', 'judge-led', 'send', 'finland', '3.25', 'nor', 'tracker', 'dedicated']
reverse dictionary [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
most common words: [['UNK', -1], ('the', 11245), ('.', 8249), (',', 6896), ('to', 5203)]


### Converting string to int

In [109]:
def str_to_int(doc_words, dictionary):
    doc_int = []

    for word in doc_words:
        if word in dictionary:
            doc_int.append(dictionary[word])
        else:
            doc_int.append(dictionary['UNK'])

    return doc_int



doc_int = str_to_int(doc_words, dictionary)

print('no. of words:', len(doc_int))
print('doc int:', doc_int[:10])

test_doc_int = {}

for doc, words in test_doc_words_dict.items():
    test_doc_int[doc] = str_to_int(test_doc_words_dict[doc], dictionary)



no. of words: 202528
doc int: [4762, 155, 748, 86, 1559, 818, 3133, 580, 26, 58]


### Generating data and labels

In [110]:
data = []
labels = []
window_size = 2

def gen_data_labels(doc_int, window_size):
    global data
    global labels
    
    for i in range(len(doc_int)):
    
        if i < len(doc_int) - window_size:

            [data.append(doc_int[i : i + window_size] + doc_int[i + window_size + 1 : i + 2*window_size + 1])]
            labels.append(doc_int[i + window_size])

    return data, labels

data, labels = gen_data_labels(doc_int, window_size)

print('data:', data[:10])
print('\nlabels:', labels[:10])

data: [[4762, 155, 86, 1559], [155, 748, 1559, 818], [748, 86, 818, 3133], [86, 1559, 3133, 580], [1559, 818, 580, 26], [818, 3133, 26, 58], [3133, 580, 58, 438], [580, 26, 438, 1071], [26, 58, 1071, 2966], [58, 438, 2966, 2828]]

labels: [748, 86, 1559, 818, 3133, 580, 26, 58, 438, 1071]


In [128]:
len(labels)

202526

### Generating batches for CBOW

In [111]:
batch_size = 64

def next_batch(batch_no, batch_size):
    
    batch_data = data[batch_no*batch_size : (batch_no * batch_size) + batch_size]
    batch_data = np.array(batch_data)
    
    batch_labels = labels[batch_no*batch_size : (batch_no * batch_size) + batch_size]
    batch_labels = np.array(batch_labels)
    batch_labels = np.reshape(batch_labels, (batch_size, 1))
    
    return batch_data, batch_labels

batch_data, batch_labels = next_batch(0, batch_size)

print('batch data: \n\n{}\n \nbatch labels: \n\n{}'.format(batch_data, batch_labels))


batch data: 

[[ 4762   155    86  1559]
 [  155   748  1559   818]
 [  748    86   818  3133]
 [   86  1559  3133   580]
 [ 1559   818   580    26]
 [  818  3133    26    58]
 [ 3133   580    58   438]
 [  580    26   438  1071]
 [   26    58  1071  2966]
 [   58   438  2966  2828]
 [  438  1071  2828 10144]
 [ 1071  2966 10144    49]
 [ 2966  2828    49     4]
 [ 2828 10144     4    84]
 [10144    49    84  7648]
 [   49     4  7648    51]
 [    4    84    51 13735]
 [   84  7648 13735    50]
 [ 7648    51    50    11]
 [   51 13735    11     1]
 [13735    50     1   122]
 [   50    11   122   196]
 [   11     1   196     4]
 [    1   122     4   337]
 [  122   196   337     3]
 [  196     4     3    31]
 [    4   337    31    84]
 [  337     3    84 13344]
 [    3    31 13344  6522]
 [   31    84  6522     2]
 [   84 13344     2     1]
 [13344  6522     1   137]
 [ 6522     2   137     3]
 [    2     1     3    34]
 [    1   137    34    13]
 [  137     3    13    82]
 [    3    34 

In [112]:
test_data_index = 0

def generate_test_batch_words(data, batch_size, num_test_steps):
    global test_data_index

    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    
    for bi in range(batch_size):
        batch[bi] = data[test_data_index + num_test_steps]
        test_data_index = (test_data_index + 1) % batch_size
    
    return batch

## CBOW 

### Hyperparameters

In [113]:
batch_size = 64
embedding_size = 128 
window_size = 2
num_sampled = 32

### I/p and O/p

In [114]:
tf.reset_default_graph()

train_data = tf.placeholder(tf.int32, shape = [batch_size, 2 * window_size])
train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

test_labels = tf.placeholder(tf.int32, shape = [batch_size], name = 'test_labels')

### Model parameters and other variables

In [115]:
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0, dtype = tf.float32))

softmax_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev = 1.0 / math.sqrt(embedding_size)))
softmax_biases = tf.Variable(tf.zeros([vocab_size]), dtype = tf.float32)

In [116]:
mean_batch_embeddings = tf.reduce_mean(tf.nn.embedding_lookup(embeddings, test_labels), axis = 0)

stacked_embeddings = None

for i in range(2*window_size):
    embedding_i = tf.nn.embedding_lookup(embeddings, train_data[:, i])
    x_size, y_size = embedding_i.get_shape().as_list()
    
    if stacked_embeddings is None:
        stacked_embeddings = tf.reshape(embedding_i, [x_size, y_size, 1])
    else:
        stacked_embeddings = tf.concat(axis = 2, values = [stacked_embeddings, tf.reshape(embedding_i, [x_size, y_size, 1])])

        
print("Stacked embedding size: %s"%stacked_embeddings.get_shape().as_list())

mean_embeddings = tf.reduce_mean(stacked_embeddings, axis = 2, keepdims = False)

print("Reduced mean embedding size: %s"%mean_embeddings.get_shape().as_list())


Stacked embedding size: [64, 128, 4]
Reduced mean embedding size: [64, 128]


### Computing loss

In [117]:
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weights,
                                                 biases = softmax_biases,
                                                 inputs = mean_embeddings,
                                                 labels = train_labels,
                                                 num_sampled = num_sampled,
                                                 num_classes = vocab_size))


### Optimizer

In [118]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

### Run CBOW on Document

In [136]:
num_steps = 3000
cbow_loss = []

with tf.Session() as sess:
    
    tf.global_variables_initializer().run()
    
    avg_loss = 0
    
    for step in range(num_steps):
        
        batch_data, batch_labels = next_batch(step, batch_size)
    
        feed_dict = {train_data: batch_data, train_labels: batch_labels}
        
        _, l = sess.run([optimizer, loss], feed_dict = feed_dict)
        avg_loss += l
        
        if (step) % 200 == 0:
            if step > 0:
                avg_loss = avg_loss / 200
            print('Average loss at step %d: %f' % (step, avg_loss))
            
            cbow_loss.append(avg_loss)
    
    # Computing document embeddings by averaging word embeddings
    document_embeddings = {}
    num_test_steps = 5
    
    for doc, words in test_doc_int.items():
        
        test_data_index = 0
        topic_mean_batch_embeddings = np.empty((num_test_steps,embedding_size),dtype=np.float32)
        
        for step in range(num_test_steps):
            test_batch_labels = generate_test_batch_words(test_doc_int[doc],batch_size, step)
            batch_mean = sess.run(mean_batch_embeddings,feed_dict={test_labels:test_batch_labels})
            
            topic_mean_batch_embeddings[step, :] = batch_mean
        document_embeddings[doc] = np.mean(topic_mean_batch_embeddings)
        

Average loss at step 0: 6.366329
Average loss at step 200: 4.315928
Average loss at step 400: 3.507949
Average loss at step 600: 3.388779
Average loss at step 800: 3.375359
Average loss at step 1000: 3.106099
Average loss at step 1200: 3.224767
Average loss at step 1400: 3.210920
Average loss at step 1600: 3.079953
Average loss at step 1800: 2.954291
Average loss at step 2000: 3.159761
Average loss at step 2200: 2.822422
Average loss at step 2400: 2.868108
Average loss at step 2600: 3.189792
Average loss at step 2800: 3.067338
