In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import string
import io
import collections
import urllib2
from nltk.corpus import stopwords

In [2]:
sess=tf.Session()

In [60]:
batch_size=50
embedding_size=200
vocabulary_size=10000
generations=50000
print_loss_every=500
num_sampled=int(batch_size/2)
window_size=2
stops=stopwords.words('english')
print_valid_every=2000
valid_words=['cliche','love','hate','silly','sad']

In [86]:
pos=open('temp/rt-polaritydata/rt-polarity.pos','r')
neg=open('temp/rt-polaritydata/rt-polarity.neg','r')

In [87]:
pos_data=[]
neg_data=[]

In [88]:
for line in pos:
    pos_data.append(line.decode('ISO-8859-1').encode('utf-8',errors='ignore').decode('utf-8'))

In [89]:
for line in neg:
    neg_data.append(line.decode('ISO-8859-1').encode('utf-8',errors='ignore').decode('utf-8'))

In [90]:
texts=pos_data+neg_data

In [91]:
target=[1]*len(pos_data)+[0]*len(neg_data)

In [92]:
def normalize_text(texts,stops):
    # This is the function that we use to normalize the texts
    # Lower all the cases
    texts=[x.lower() for x in texts]
    # Remove all the punctuation
    texts=[''.join(c for c in x if c not in string.punctuation) for x in texts]
    # Remove all the stops words
    texts=[' '.join([word for word in x.split() if word not in stops]) for x in texts]
    # Remove numbers
    texts=[''.join(c for c in x if c not in '0123456789') for x in texts]
    # Remove extra white space
    texts=[' '.join(x.split()) for x in texts]
    return texts
texts=normalize_text(texts,stops)

In [93]:
# To make sure all the movies are informative, here we only kept the longer sentences
target = [target[ix] for ix,x in enumerate(texts) if len(x.split())>2]
texts = [x for x in texts if len(x.split())>2]

In [101]:
split_sentences = [s.split() for s in texts]
words = [x for sublist in split_sentences for x in sublist]

In [102]:
count=[['RARE',-1]]
count.extend(collections.Counter(words).most_common(vocabulary_size-1))

In [105]:
word_dict={}
for word,word_count in count:
    word_dict[word]=len(word_dict)

In [108]:
def build_dictionary(sentences,vocabulary_size):
    # create sentences into list of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    count=[['RARE',-1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    word_dict={}
    for word,word_count in count:
        word_dict=len(word_dict)
    return word_dict

In [107]:
# Here we create sentence into numbers so we can use it for later training
def text_to_numbers(sentences,word_dict):
    data=[]
    for sentence in sentences:
        temp=[]
        for word in sentence:
            if word in word_dict:
                word_ix=word_dict[word]
            else:
                word_ix=0
            temp.append(word_ix)
        data.append(temp)
    return(data)

In [109]:
word_dictionary=word_dict

In [111]:
word_dictionary_rev=dict(zip(word_dictionary.values(),word_dictionary.keys()))

In [114]:
text_data=text_to_numbers(texts,word_dictionary)

In [116]:
valid_examples=[word_dictionary[x] for x in valid_words]

In [118]:
# Now we need to create skip-gram batches
def generate_batch_data(sentences, batch_size,window_size,method='skip-gram'):
    # Fill out the batch
    batch_data=[]
    label_data=[]
    while len(batch_data)<batch_size:
        rand_sentence=np.random.choice(sentences)
        #generate consecutive window to look at
        window_sequences=[rand_sentence[max(ix-window_size,0):(ix+window_size+1)] for ix,x in enumerate(rand_sentence)]
        label_indices=[ix if ix<window_size else window_size for ix,x in enumerate(window_sequences)]
        if method=='skip-gram':
            batch_and_labels=[(x[y],x[:y]+x[(y+1):]) for x,y in zip(window_sequences,label_indices)]
            tuple_data=[(x,y_) for x,y in batch_and_labels for y_ in y]
        #extract batch and labels
        batch,labels = [list(x) for x in zip(*tuple_data)]
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    batch_data=batch_data[:batch_size]
    label_data=label_data[:batch_size]
    
    batch_data=np.array(batch_data)
    label_data=np.array(np.array([label_data]))
    
    return (batch_data,label_data)

In [133]:
embeddings=tf.Variable(tf.random_uniform([vocabulary_size,embedding_size],-1.0,1.0))
# create data/target place holder
x_inputs=tf.placeholder(tf.int32,shape=[batch_size])
y_target=tf.placeholder(tf.int32,shape=[batch_size,1])
valid_dataset=tf.constant(valid_examples,dtype=tf.int32)

embed=tf.nn.embedding_lookup(embeddings,x_inputs)

In [137]:
nce_weights=tf.Variable(tf.truncated_normal([vocabulary_size,embedding_size],stddev=1.0/np.sqrt(embedding_size)))
nce_biases= tf.Variable(tf.zeros([vocabulary_size]))

In [143]:
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=y_target,
                                     inputs=embed,
                                     num_sampled=num_sampled,
                                     num_classes=vocabulary_size))

In [145]:
norm=tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keep_dims=True))
normalized_embeddings=embeddings/norm
valid_embeddings=tf.nn.embedding_lookup(normalized_embeddings,valid_dataset)
similarity = tf.matmul(valid_embeddings,normalized_embeddings,transpose_b=True)

In [154]:
optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(loss)
init=tf.global_variables_initializer()
sess.run(init)
loss_vec = []
loss_x_vec = []
for i in range(generations):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
    feed_dict = {x_inputs : batch_inputs, y_target : np.transpose(batch_labels)}

    # Run the train step
    sess.run(optimizer, feed_dict=feed_dict)

    # Return the loss
    if (i+1) % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(i+1)
        print("Loss at step {} : {}".format(i+1, loss_val))
      
    # Validation: Print some random words and top 5 related words
    if (i+1) % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        for j in range(len(valid_words)):
            valid_word = word_dictionary_rev[valid_examples[j]]
            top_k = 5 # number of nearest neighbors
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            log_str = "Nearest to {}:".format(valid_word)
            for k in range(top_k):
                close_word = word_dictionary_rev[nearest[k]]
                log_str = "%s %s," % (log_str, close_word)
            print(log_str)

Loss at step 500 : 32.9895439148
Loss at step 1000 : 2.98992228508
Loss at step 1500 : 15.9521713257
Loss at step 2000 : 10.6765232086
Nearest to cliche: nair, piccoli, mistaken, messenger, strike,
Nearest to love: canadians, cannot, eat, greengrass, zany,
Nearest to hate: particular, funeral, paperthin, waves, unexpectedly,
Nearest to silly: flashback, hammy, workmanlike, band, crack,
Nearest to sad: grinning, river, tremendously, twenty, watstein,
Loss at step 2500 : 8.72713279724
Loss at step 3000 : 9.23748397827
Loss at step 3500 : 10.3326950073
Loss at step 4000 : 10.055606842
Nearest to cliche: nair, piccoli, mistaken, messenger, strike,
Nearest to love: canadians, cannot, eat, greengrass, zany,
Nearest to hate: particular, funeral, paperthin, waves, unexpectedly,
Nearest to silly: flashback, hammy, workmanlike, band, crack,
Nearest to sad: grinning, river, tremendously, twenty, watstein,
Loss at step 4500 : 16.3775501251
Loss at step 5000 : 4.11282682419
Loss at step 5500 : 0.69