In [1]:
import os
import numpy as np
import tensorflow as tf
import nltk
import collections
import math
from sklearn.cluster import KMeans

  from ._conv import register_converters as _register_converters


### Reading data

In [2]:
topics = ['business','entertainment','politics','sport','tech']

def read_data(topics):
    
    num_of_docs_to_read = 100
    doc_words = []
    
    for topic in topics:
        for doc in range(1,num_of_docs_to_read + 1):

            with open(os.path.join('bbc', topic, format(doc, '03d') + '.txt')) as f:
                document = f.read()
                file = document.lower()
                file = nltk.word_tokenize(file)
                doc_words.extend(file)

    return doc_words

def read_test_data(topics):
    
    num_of_docs_to_read = 100
    test_doc_words_dict = {}
    
    for topic in topics:
        for doc in np.random.randint(1,num_of_docs_to_read,(10)).tolist():

            with open(os.path.join('bbc', topic, format(doc, '03d') + '.txt')) as f:
                document = f.read()
                file = document.lower()
                file = nltk.word_tokenize(file)
                test_doc_words_dict[topic + '-' + str(doc)] = file
                
    return test_doc_words_dict

doc_words = read_data(topics)
test_doc_words_dict = read_test_data(topics)

print('no. of words:', len(doc_words))
print('doc words:', doc_words[:10])


no. of words: 202528
doc words: ['ad', 'sales', 'boost', 'time', 'warner', 'profit', 'quarterly', 'profits', 'at', 'us']


### Create dictionary

In [3]:
vocab_size = 15000

def create_dict(doc_words):
    global vocab_size
    count = [['UNK', -1]]
    count.extend(collections.Counter(doc_words).most_common(vocab_size - 1))

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return dictionary, rev_dictionary, count

dictionary, rev_dictionary, count = create_dict(doc_words)

print('dictionary', list(dictionary)[:10])
print('reverse dictionary', list(rev_dictionary)[:10])
print('most common words:', count[0:5])

dictionary ['unparalleled', 'aimed', 'italians', 'drops', '25m', 'contesting', 'urban-based', '7.35.', 'breaches', 'sanguine']
reverse dictionary [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
most common words: [['UNK', -1], ('the', 11245), ('.', 8249), (',', 6896), ('to', 5203)]


### Converting string to int

In [4]:
def str_to_int(doc_words, dictionary):
    doc_int = []

    for word in doc_words:
        if word in dictionary:
            doc_int.append(dictionary[word])
        else:
            doc_int.append(dictionary['UNK'])

    return doc_int



doc_int = str_to_int(doc_words, dictionary)

print('no. of words:', len(doc_int))
print('doc int:', doc_int[:10])

test_doc_int = {}

for doc, words in test_doc_words_dict.items():
    test_doc_int[doc] = str_to_int(test_doc_words_dict[doc], dictionary)



no. of words: 202528
doc int: [4724, 155, 746, 86, 1571, 837, 3164, 580, 26, 58]


### Generating data and labels

In [5]:
data = []
labels = []
window_size = 2

def gen_data_labels(doc_int, window_size):
    global data
    global labels
    
    for i in range(len(doc_int)):
    
        if i < len(doc_int) - window_size:

            [data.append(doc_int[i : i + window_size] + doc_int[i + window_size + 1 : i + 2*window_size + 1])]
            labels.append(doc_int[i + window_size])

    return data, labels

data, labels = gen_data_labels(doc_int, window_size)

print('data:', data[:10])
print('\nlabels:', labels[:10])

data: [[4724, 155, 86, 1571], [155, 746, 1571, 837], [746, 86, 837, 3164], [86, 1571, 3164, 580], [1571, 837, 580, 26], [837, 3164, 26, 58], [3164, 580, 58, 438], [580, 26, 438, 1082], [26, 58, 1082, 3065], [58, 438, 3065, 2651]]

labels: [746, 86, 1571, 837, 3164, 580, 26, 58, 438, 1082]


In [6]:
len(labels)

202526

### Generating batches for CBOW

In [7]:
batch_size = 64

def next_batch(batch_no, batch_size):
    
    batch_data = data[batch_no*batch_size : (batch_no * batch_size) + batch_size]
    batch_data = np.array(batch_data)
    
    batch_labels = labels[batch_no*batch_size : (batch_no * batch_size) + batch_size]
    batch_labels = np.array(batch_labels)
    batch_labels = np.reshape(batch_labels, (batch_size, 1))
    
    return batch_data, batch_labels

batch_data, batch_labels = next_batch(0, batch_size)

print('batch data: \n\n{}\n \nbatch labels: \n\n{}'.format(batch_data, batch_labels))


batch data: 

[[ 4724   155    86  1571]
 [  155   746  1571   837]
 [  746    86   837  3164]
 [   86  1571  3164   580]
 [ 1571   837   580    26]
 [  837  3164    26    58]
 [ 3164   580    58   438]
 [  580    26   438  1082]
 [   26    58  1082  3065]
 [   58   438  3065  2651]
 [  438  1082  2651 11177]
 [ 1082  3065 11177    49]
 [ 3065  2651    49     4]
 [ 2651 11177     4    84]
 [11177    49    84  7018]
 [   49     4  7018    51]
 [    4    84    51  9434]
 [   84  7018  9434    50]
 [ 7018    51    50    11]
 [   51  9434    11     1]
 [ 9434    50     1   120]
 [   50    11   120   197]
 [   11     1   197     4]
 [    1   120     4   337]
 [  120   197   337     3]
 [  197     4     3    31]
 [    4   337    31    84]
 [  337     3    84 11308]
 [    3    31 11308  6626]
 [   31    84  6626     2]
 [   84 11308     2     1]
 [11308  6626     1   136]
 [ 6626     2   136     3]
 [    2     1     3    34]
 [    1   136    34    13]
 [  136     3    13    82]
 [    3    34 

In [8]:
test_data_index = 0

def generate_test_batch_words(data, batch_size, num_test_steps):
    global test_data_index

    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    
    for bi in range(batch_size):
        batch[bi] = data[test_data_index + num_test_steps]
        test_data_index = (test_data_index + 1) % batch_size
    
    return batch

## CBOW 

### Hyperparameters

In [9]:
batch_size = 64
embedding_size = 128 
window_size = 2
num_sampled = 32

### I/p and O/p

In [10]:
tf.reset_default_graph()

train_data = tf.placeholder(tf.int32, shape = [batch_size, 2 * window_size])
train_labels = tf.placeholder(tf.int32, shape = [batch_size, 1])

test_labels = tf.placeholder(tf.int32, shape = [batch_size], name = 'test_labels')

### Model parameters and other variables

In [11]:
embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0, dtype = tf.float32))

softmax_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev = 1.0 / math.sqrt(embedding_size)))
softmax_biases = tf.Variable(tf.zeros([vocab_size]), dtype = tf.float32)

In [12]:
mean_batch_embeddings = tf.reduce_mean(tf.nn.embedding_lookup(embeddings, test_labels), axis = 0)

stacked_embeddings = None

for i in range(2*window_size):
    embedding_i = tf.nn.embedding_lookup(embeddings, train_data[:, i])
    x_size, y_size = embedding_i.get_shape().as_list()
    
    if stacked_embeddings is None:
        stacked_embeddings = tf.reshape(embedding_i, [x_size, y_size, 1])
    else:
        stacked_embeddings = tf.concat(axis = 2, values = [stacked_embeddings, tf.reshape(embedding_i, [x_size, y_size, 1])])

        
print("Stacked embedding size: %s"%stacked_embeddings.get_shape().as_list())

mean_embeddings = tf.reduce_mean(stacked_embeddings, axis = 2, keepdims = False)

print("Reduced mean embedding size: %s"%mean_embeddings.get_shape().as_list())


Stacked embedding size: [64, 128, 4]
Reduced mean embedding size: [64, 128]


### Computing loss

In [13]:
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weights,
                                                 biases = softmax_biases,
                                                 inputs = mean_embeddings,
                                                 labels = train_labels,
                                                 num_sampled = num_sampled,
                                                 num_classes = vocab_size))


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



### Optimizer

In [14]:
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

### Run CBOW on Document

In [15]:
num_steps = 3000
cbow_loss = []

with tf.Session() as sess:
    
    tf.global_variables_initializer().run()
    
    avg_loss = 0
    
    for step in range(num_steps):
        
        batch_data, batch_labels = next_batch(step, batch_size)
    
        feed_dict = {train_data: batch_data, train_labels: batch_labels}
        
        _, l = sess.run([optimizer, loss], feed_dict = feed_dict)
        avg_loss += l
        
        if (step) % 200 == 0:
            if step > 0:
                avg_loss = avg_loss / 200
            print('Average loss at step %d: %f' % (step, avg_loss))
            
            cbow_loss.append(avg_loss)
    
    # Computing document embeddings by averaging word embeddings
    document_embeddings = {}
    num_test_steps = 5
    
    for doc, words in test_doc_int.items():
        
        test_data_index = 0
        topic_mean_batch_embeddings = np.empty((num_test_steps,embedding_size),dtype=np.float32)
        
        for step in range(num_test_steps):
            test_batch_labels = generate_test_batch_words(test_doc_int[doc],batch_size, step)
            batch_mean = sess.run(mean_batch_embeddings,feed_dict={test_labels:test_batch_labels})
            
            topic_mean_batch_embeddings[step, :] = batch_mean
        document_embeddings[doc] = np.mean(topic_mean_batch_embeddings, axis = 0)
        

Average loss at step 0: 5.838185
Average loss at step 200: 4.332036
Average loss at step 400: 3.547339
Average loss at step 600: 3.401832
Average loss at step 800: 3.379141
Average loss at step 1000: 3.123720
Average loss at step 1200: 3.221186
Average loss at step 1400: 3.218886
Average loss at step 1600: 3.086449
Average loss at step 1800: 2.972738
Average loss at step 2000: 3.150804
Average loss at step 2200: 2.821532
Average loss at step 2400: 2.895416
Average loss at step 2600: 3.206407
Average loss at step 2800: 3.090301


### Classifying documents

In [16]:
kmeans = kmeans = KMeans(n_clusters=5, random_state=0, max_iter = 10000)
kmeans.fit(np.array(list(document_embeddings.values())))

document_classes = {}

for inp, lbl in zip(list(document_embeddings.keys()), kmeans.labels_):
    if lbl not in document_classes:
        document_classes[lbl] = [inp]
    else:
        document_classes[lbl].append(inp)

for k,v in document_classes.items():    
    print('\nDocuments in Cluster ',k)
    print('\t',v)



Documents in Cluster  0
	 ['sport-23', 'sport-93', 'politics-60', 'sport-2', 'sport-75', 'politics-6']

Documents in Cluster  1
	 ['tech-17', 'tech-44', 'entertainment-74', 'sport-89', 'sport-47', 'tech-52', 'politics-19', 'politics-22', 'tech-79', 'tech-72', 'tech-9', 'sport-24', 'entertainment-55', 'business-96', 'politics-39', 'tech-27', 'business-41', 'politics-67']

Documents in Cluster  2
	 ['sport-66', 'business-78', 'entertainment-72', 'politics-43', 'business-77', 'business-28', 'tech-84', 'entertainment-26', 'entertainment-48', 'entertainment-54']

Documents in Cluster  3
	 ['business-94', 'entertainment-85', 'entertainment-33', 'tech-56', 'entertainment-81', 'entertainment-2', 'sport-64', 'business-11']

Documents in Cluster  4
	 ['business-74', 'business-49', 'politics-71', 'business-98', 'politics-32']
