In [28]:
from IPython.display import clear_output, HTML
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import zipfile
from bs4 import BeautifulSoup
import re
import nltk
import gensim
from gensim.models import word2vec
import itertools, collections, random

import tensorflow as tf
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings; warnings.simplefilter('ignore')

### Preprocessing

In [5]:
#nltk.download()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
# Read all data from 3 files 
train = pd.read_csv( "labeledTrainData.tsv/labeledTrainData.tsv", header=0, 
                        delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv/unlabeledTrainData.tsv", header=0, 
                        delimiter="\t", quoting=3 )

In [7]:
# Word2Vec expects single sentences, each one as a list of words. 

def review_to_wordlist(raw_review, remove_stopwords=False):
    '''Returns a list of words'''
    review_text = BeautifulSoup(raw_review,'lxml').get_text() 
    letters_only=re.sub('[^a-zA-Z]'," ",review_text)
    words = letters_only.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english")) 
        words =  [w for w in words if w not in stops]
    return words

def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append( review_to_wordlist( raw_sentence,remove_stopwords ))
    return sentences

In [8]:
%%time
sentences = []

for review in train.review:
    sentences += review_to_sentences(review,tokenizer)
    
for review in unlabeled_train.review:
    sentences += review_to_sentences(review, tokenizer)

CPU times: user 4min 25s, sys: 3.44 s, total: 4min 28s
Wall time: 4min 28s


In [9]:
# numbe of reviews
print("Number of reviews :",len(train)+len(unlabeled_train))
print("Total number of sentences :",len(sentences))

Number of reviews : 75000
Total number of sentences : 795538


### Vectorazing words 1: Gensim

In [10]:
%%time
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 3       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

model = word2vec.Word2Vec(sentences=sentences,workers=num_workers,size=num_features,min_count=min_word_count,
                          window=context,sample=downsampling)

CPU times: user 5min 3s, sys: 624 ms, total: 5min 4s
Wall time: 1min 49s


In [11]:
model.init_sims(replace=True)
#Saving the model
model_name = "300features_40minwords_10context"
model.save(model_name)

In [14]:
# exploring model results
# businessman - man + woman = ?
model.most_similar(positive=['businessman','woman'],negative=['man'])

[('socialite', 0.7162072062492371),
 ('prostitute', 0.7108803391456604),
 ('waitress', 0.6776148080825806),
 ('housewife', 0.6685196161270142),
 ('wealthy', 0.6460838913917542),
 ('nurse', 0.6441401243209839),
 ('widow', 0.6410201787948608),
 ('gigolo', 0.6342802047729492),
 ('heiress', 0.631268322467804),
 ('lawyer', 0.6177793145179749)]

In [15]:
# There are 16490 words, every word is 300 dim vector
model.wv.syn0.shape

(16490, 300)

### Vectorizing words 2: Tensorflow

In [24]:
# list of words
words = list(itertools.chain(*sentences))
print("There are",len(words),"words")

There are 17798082 words


In [25]:
# data               ->  [index, ...]
# count              ->  [[word,count],[...]]
# dictionary         ->  {word:index}
# erverse_dictionary ->  {index:word}

#vocabulary_size = 50000
min_word_count = 40   # Minimum word count                        

def build_dataset(words):
    count = [['UNK', -1]]
    #count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    count.extend([tup for tup in collections.Counter(words).most_common() if tup[1]>=min_word_count])
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # to reduce memory.

Most common words (+UNK) [['UNK', 559142], ('the', 1014971), ('and', 494087), ('a', 491473), ('of', 439756)]
Sample data [15, 30, 10, 520, 165, 183, 31, 1, 558, 15]


In [27]:
def generate_batch(batch_size, skip_window):
    global data_index
    
    num_skips = 2 * skip_window
    assert batch_size % num_skips == 0
    batch  = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

# print('data:', [reverse_dictionary[di] for di in data[:40]])


# data_index = 0
# batch, labels = generate_batch(batch_size=40,  skip_window=10)
# print('\nwith  skip_window = %d:' % (skip_window))
# print('    batch:', [reverse_dictionary[bi] for bi in batch])
# print('    labels:', [reverse_dictionary[li] for li in labels.reshape(40)])

In [32]:
data_index     = 0
batch_size     = 200
embedding_size = 300     # Dimension of the embedding vector (number of neurons).
skip_window    = 10      # How many words to consider left and right.
num_skips      = 20      # How many times to reuse an input to generate a label.
vocabulary_size = len(dictionary)
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 

valid_size     = 4  # Random set of words to evaluate similarity on.
valid_window   = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled    = 64 # Number of negative examples to sample.

graph = tf.Graph()
with graph.as_default():

    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels  = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
    # hidden weights
    embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / np.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                               labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities 
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

In [34]:
num_steps = 100001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(batch_size, skip_window)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 10000 == 0:
      if step > 0:
        average_loss = average_loss / 10000
      # The average loss is an estimate of the loss over the last 10000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
      # note that this is expensive (~20% slowdown if computed every 500 steps)
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0: 6.755709
Nearest to too: reeks, merit, romanticized, screwed, tourneur, wizard, lurks, kiss,
Nearest to are: highlight, crossword, becomes, fisher, fable, neurotic, conflicted, art,
Nearest to with: appropriate, universal, intervals, rumor, vanishing, survivors, occasions, warsaw,
Nearest to also: vulnerability, implication, jury, learn, judy, gestures, compensation, sharif,
Average loss at step 10000: 4.480114
Nearest to too: merit, reeks, jafar, bogarde, wizard, rockets, screwed, romanticized,
Nearest to are: kinda, keys, turned, waiter, art, bendix, of, highlight,
Nearest to with: and, UNK, in, as, a, to, archie, of,
Nearest to also: vulnerability, implication, jury, compensated, option, initially, blend, flushed,
Average loss at step 20000: 4.239134
Nearest to too: merit, screwed, bogarde, hundred, romanticized, controls, reeks, intelligence,
Nearest to are: they, were, some, all, and, with, romy, desi,
Nearest to with: on, of, some, as, from, to

In [35]:
def eval_context(positive=[],negative=[]):
    vec = np.zeros(embedding_size)
    for word in positive:
        vec = vec + final_embeddings[dictionary[word],:]
        vec=vec/ np.sqrt(np.sum(vec**2))
    for word in negative:
        vec = vec - final_embeddings[dictionary[word],:]
        vec=vec/ np.sqrt(np.sum(vec**2))
    sim = np.dot(vec, final_embeddings.T)
    nearest = (-sim).argsort()[1:9]
    words = []
    for k in range(8):
          words.append(reverse_dictionary[nearest[k]])
    return list(zip(words,sim[nearest]))

In [42]:
eval_context(positive=['businessman','woman'],negative=['man'])

[('woman', 0.4365889754428649),
 ('buddhist', 0.22472288890400319),
 ('solving', 0.21947928440538284),
 ('biography', 0.20472893888160873),
 ('kathryn', 0.20261553766788787),
 ('backseat', 0.2010753473947694),
 ('heist', 0.19771546896371606),
 ('conan', 0.19381510351280007)]

## Pytorch

In [46]:
class Vec2Word(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(Vec2Word, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs, batch):
        embeds = self.embeddings(inputs).view((batch, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs

In [52]:
losses = []
loss_function = nn.NLLLoss()
model = Vec2Word(len(dictionary), num_features, context)
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [53]:
batch_size = 200
num_steps=len(dictionary)//batch_size


In [56]:
data_index = 0
optimizer = optim.Adagrad(model.parameters(), lr=0.1)
for epoch in range(20):
    total_loss = torch.Tensor([0])
    for _ in range(num_steps):

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in variables)
        batch, labels = generate_batch(batch_size,context)
        
        batch_var  = Variable(torch.from_numpy(batch).type(torch.LongTensor))
        labels_var = Variable(torch.from_numpy(labels).type(torch.LongTensor))
        
        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(batch_var,batch_size)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a variable)
        loss = loss_function(log_probs, labels_var.view(-1))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        total_loss += loss.data
        #print(loss.data)
    print(total_loss.numpy()/num_steps)
    losses.append(total_loss)
#print(losses)  # The loss decreased every iteration over the training data!

[ 9.49053001]
[ 7.77449656]
[ 7.3751483]
[ 7.08530188]
[ 6.99869204]
[ 7.07109833]
[ 6.9630127]
[ 6.74427176]
[ 7.1649332]
[ 6.72411203]
[ 6.82953978]
[ 6.88935375]
[ 6.80248642]
[ 7.01510191]
[ 6.78813505]
[ 7.02150917]
[ 6.95554161]
[ 6.95084715]
[ 6.86200857]
[ 6.91851234]


In [57]:
result = model.embeddings(Variable(torch.LongTensor([dictionary['businessman']]))) - \
         model.embeddings(Variable(torch.LongTensor([dictionary['man']]))) + \
         model.embeddings(Variable(torch.LongTensor([dictionary['woman']])))

In [58]:
cosine = []
for i in range(model.embeddings.num_embeddings):
    embed = model.embeddings(Variable(torch.LongTensor([i]))).data.numpy()
    cosine.append(cosine_similarity(embed, result.data.numpy())[0][0])

In [67]:
indx = np.array(cosine).argmax()
reverse_dictionary[indx]

'accuracy'

In [68]:
cosine.pop(indx)

0.1957472