In [1]:
from gensim.models import Word2Vec, KeyedVectors, TfidfModel
from gensim.parsing.preprocessing import STOPWORDS
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cosine
import numpy as np

In [2]:
word2vec_model = KeyedVectors.load_word2vec_format("glove-wiki-gigaword-200.gz")

In [3]:
np.shape(word2vec_model.syn0)

  """Entry point for launching an IPython kernel.


(400000, 200)

For doc2vec and sent2vec we'll train the model with [this dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

IMDB Dataset: 100,000 movie-reviews.

In [4]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
import smart_open

dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
locale.setlocale(locale.LC_ALL, 'C')

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text

import time
start = time.clock()

if not os.path.isfile('aclImdb/alldata-id.txt'):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            print("Downloading IMDB archive...")
            url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            with open(filename, 'wb') as f:
                f.write(r.content)
        tar = tarfile.open(filename, mode='r')
        tar.extractall()
        tar.close()

    # Concatenate and normalize test/train data
    print("Cleaning up dataset...")
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
    alldata = u''
    for fol in folders:
        temp = u''
        output = fol.replace('/', '-') + '.txt'
        # Is there a better pattern to use?
        txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))
        for txt in txt_files:
            with smart_open.smart_open(txt, "rb") as t:
                t_clean = t.read().decode("utf-8")
                for c in control_chars:
                    t_clean = t_clean.replace(c, ' ')
                temp += t_clean
            temp += "\n"
        temp_norm = normalize_text(temp)
        with smart_open.smart_open(os.path.join(dirname, output), "wb") as n:
            n.write(temp_norm.encode("utf-8"))
        alldata += temp_norm

    with smart_open.smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:
        for idx, line in enumerate(alldata.splitlines()):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line.encode("utf-8"))

end = time.clock()
print ("Total running time: ", end-start)

('Total running time: ', 0.001635999999990645)


In [5]:
import os.path
assert os.path.isfile("aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"

In [6]:
# -*- coding: utf-8 -*-
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # Will hold all docs in original order
with open('aclImdb/alldata-id.txt') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]  # 25k train, 25k test, 25k extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

100000 docs: 25000 train-sentiment, 25000 test-sentiment


In [7]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

doc2vec_model = Doc2Vec(dm=1, dm_concat=1, size=200, window=5, negative=5, hs=0, min_count=2, workers=cores)
doc2vec_model.build_vocab(alldocs)
models_by_name = OrderedDict((str(model), model) for model in [doc2vec_model])



In [8]:
import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_regressors = sm.add_constant(test_regressors)
    
    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

  from pandas.core import datetools


In [9]:
from collections import defaultdict
best_error = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [None]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # Shuffling gets best results
    
    for name, train_model in models_by_name.items():
        # Train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list, total_examples=len(doc_list), epochs=1)
            duration = '%.1f' % elapsed()
            
        # Evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*' 
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta
    
print("END %s" % str(datetime.datetime.now()))

In [None]:
# Print best error rates achieved
print("Err rate Model")
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))

In [None]:
doc2vec_model.save('doc2vec_model')

In [10]:
from gensim.models import Doc2Vec
doc2vec_model = Doc2Vec.load('doc2vec_model')

In [11]:
print(doc2vec_model)

Doc2Vec(dm/c,d200,n5,w5,mc2,s0.001,t4)


In [12]:
def simple_average(sent):
    sents_emd = []
    for s in sent:
        sent_emd = []
        for w in s:
            if w in word2vec_model:
                sent_emd.append(word2vec_model[w])
            else:
                continue
        sent_emd = np.array(sent_emd)
        sum_ = sent_emd.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    return sents_emd

In [13]:
def tf_idf(sent):
    word_counter = {}
    total_count = 0
    for s in sent:
        for w in s:
            if w in word_counter:
                word_counter[w] = word_counter[w] + 1
            else:
                word_counter[w] = 1
        total_count = total_count + len(s)
    no_of_sentences = len(sent)
    sents_emd = []
    for s in sent:
        sent_emd = []
        for word in s:
            tf = word_counter[word]/float(len(s))
            idf = np.log(no_of_sentences/float(1+ word_counter[word]))
            try:
                emd = tf*idf*word2vec_model[word]
                sent_emd.append(emd)
            except:
                continue
        sent_emd = np.array(sent_emd)
        sum_ = sent_emd.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    return sents_emd

In [14]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
def tf_idf_v2(sent):
    dct = Dictionary(sent)
    corpus = [dct.doc2bow(line) for line in sent]
    tf_idf_model = TfidfModel(corpus)
    vector = tf_idf_model[corpus]
    d = {dct.get(id): value for doc in vector for id, value in doc}
    sents_emd = []
    sent_emd = []
    for i in range(len(sent)):
        for j in range(len(sent[i])):
            word = sent[i][j]
            if word in word2vec_model:
                emd = d[word]*word2vec_model[word]
                sent_emd.append(emd)
        sent_emd_np = np.array(sent_emd)
        sum_ = sent_emd_np.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    return sents_emd

In [15]:
def smooth_inverse_frequency(sent, a=None):
    word_counter = {}
    sentences = []
    total_count = 0
    for s in sent:
        for w in s:
            if w in word_counter:
                word_counter[w] = word_counter[w] + 1
            else:
                word_counter[w] = 1
        total_count = total_count + len(s)
    no_of_sentences = len(sent)
    sents_emd = []
    for s in sent:
        sent_emd = []
        for word in s:
            if a is None and word in word2vec_model:
                a = 0.001
                emd = (a/(a + (word_counter[word]/total_count)))*word2vec_model[word]
                sent_emd.append(emd)
            elif a is not None and word in word2vec_model:
                emd = (a/(a + (word_counter[word]/total_count)))*word2vec_model[word]
                sent_emd.append(emd)
        sum_ = np.array(sent_emd).sum(axis=0)
        sentence_emd = sum_/float(no_of_sentences)
        sents_emd.append(sentence_emd)
    u  = np.array(svds(sents_emd, k=1))
    u = u[2]
    new_sents_emd = []
    for s in sents_emd:
        s = s - u.dot(u.transpose())*s
        new_sents_emd.append(s)
    return new_sents_emd

In [16]:
s1 = "this is a sample sentence with cat and dog"
s1 = s1.lower().split()
s1 = [w for w in s1 if w not in STOPWORDS]
s2 = "there was a time when computers were very expensive"
s2 = s2.lower().split()
s2 = [w for w in s2 if w not in STOPWORDS]
s3 = "one more day with cute dog"
s3 = s3.lower().split()
s3 = [w for w in s3 if w not in STOPWORDS]
s4 = "eagerly waiting for Avengers Infinity War"
s4 = s4.lower().split()
s4 = [w for w in s4 if w not in STOPWORDS]
s5 = "this is a completely different"
s5 = s5.lower().split()
s5 = [w for w in s5 if w not in STOPWORDS]

In [17]:
sentences = [s1,s2,s3,s4]
sentences_emd1 = smooth_inverse_frequency(sentences)
sentences_emd2 = tf_idf_v2(sentences)
sentences_emd3 = simple_average(sentences)

Benchmarking with cosine distance

In [18]:
d1 = cosine(sentences_emd1[0],sentences_emd1[2])
d2 = cosine(sentences_emd3[0],sentences_emd3[2])
d3 = cosine(sentences_emd2[0],sentences_emd2[2])
print("SIF: {} tfIdf: {} SimAvg: {}".format(d1, d2, d3))
d4 = cosine(sentences_emd1[1],sentences_emd1[3])
d5 = cosine(sentences_emd3[1],sentences_emd3[3])
d6 = cosine(sentences_emd2[1],sentences_emd2[3])
print("SIF: {} tfIdf: {} SimAvg: {}".format(d4, d5, d6))

SIF: 0.316366493702 tfIdf: 0.297450304031 SimAvg: 0.177186429501
SIF: 0.564340353012 tfIdf: 0.569366067648 SimAvg: 0.0811237096786


In [19]:
doc_d1 = doc2vec_model.infer_vector(s1)
doc_d2 = doc2vec_model.infer_vector(s3)
print("doc2vec for s1 and s3: {}".format(cosine(doc_d1,doc_d2)))
doc_d3 = doc2vec_model.infer_vector(s1)
doc_d4 = doc2vec_model.infer_vector(s4)
print("doc2vec for s1 and s4: {}".format(cosine(doc_d3,doc_d4)))

doc2vec for s1 and s3: 0.48870998621
doc2vec for s1 and s4: 0.848112657666


Clearly word2vec with SIF rocks!