In [106]:
from gensim.models import Word2Vec, KeyedVectors, TfidfModel
from gensim.parsing.preprocessing import STOPWORDS
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cosine
import numpy as np

In [2]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [198]:
def simple_average(sent):
    sents_emd = []
    for s in sent:
        sent_emd = []
        for w in s:
            if w in model:
                sent_emd.append(model[w])
            else:
                continue
        sent_emd = np.array(sent_emd)
        sum_ = sent_emd.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    return sents_emd

In [199]:
def tf_idf(sent):
    word_counter = {}
    total_count = 0
    for s in sent:
        for w in s:
            if w in word_counter:
                word_counter[w] = word_counter[w] + 1
            else:
                word_counter[w] = 1
        total_count = total_count + len(s)
    no_of_sentences = len(sent)
    sents_emd = []
    for s in sent:
        sent_emd = []
        for word in s:
            tf = word_counter[word]/float(len(s))
            idf = np.log(no_of_sentences/float(1+ word_counter[word]))
            try:
                emd = tf*idf*model[word]
                sent_emd.append(emd)
            except:
                continue
        sent_emd = np.array(sent_emd)
        sum_ = sent_emd.sum(axis=0)
        result = sum_/np.sqrt((sum_**2).sum())
        sents_emd.append(result)
    return sents_emd

In [210]:
def smooth_inverse_frequency(sent, a=None):
    word_counter = {}
    sentences = []
    total_count = 0
    for s in sent:
        for w in s:
            if w in word_counter:
                word_counter[w] = word_counter[w] + 1
            else:
                word_counter[w] = 1
        total_count = total_count + len(s)
    no_of_sentences = len(sent)
    sents_emd = []
    for s in sent:
        sent_emd = []
        for word in s:
            if a is None and word in model:
                a = 0.001
                emd = (a/(a + (word_counter[word]/total_count)))*model[word]
                sent_emd.append(emd)
            elif a is not None and word in model:
                emd = (a/(a + (word_counter[word]/total_count)))*model[word]
                sent_emd.append(emd)
        sum_ = np.array(sent_emd).sum(axis=0)
        sentence_emd = sum_/float(no_of_sentences)
        sents_emd.append(sentence_emd)
    u  = np.array(svds(sents_emd, k=1))
    u = u[2]
    new_sents_emd = []
    for s in sents_emd:
        s = s - u.dot(u.transpose())*s
        new_sents_emd.append(s)
    return new_sents_emd

In [211]:
s1 = "this is a sample sentence with cat and dog"
s1 = s1.lower().split()
s1 = [w for w in s1 if w not in STOPWORDS]
s2 = "there was a time when computers were very expensive"
s2 = s2.lower().split()
s2 = [w for w in s2 if w not in STOPWORDS]
s3 = "one more day with cute dog"
s3 = s3.lower().split()
s3 = [w for w in s3 if w not in STOPWORDS]
s4 = "I'm eagerly waiting for Avengers Infinity War"
s4 = s4.lower().split()
s4 = [w for w in s4 if w not in STOPWORDS]

In [217]:
sentences = [s1,s2,s3,s4]
sentences_emd1 = smooth_inverse_frequency(sentences)
sentences_emd2 = tf_idf(sentences)
sentences_emd3 = simple_average(sentences)
# print sentences_emd1

In [218]:
# tf_idf_model = TfidfModel(sentences)

Benchmarking with cosine distance

In [219]:
d1 = cosine(sentences_emd1[0],sentences_emd1[2])
d2 = cosine(sentences_emd3[0],sentences_emd3[2])
d3 = cosine(sentences_emd2[0],sentences_emd2[2])
print("SIF: {} tfIdf: {} SimAvg: {}".format(d1, d2, d3))
d4 = cosine(sentences_emd1[1],sentences_emd1[3])
d5 = cosine(sentences_emd3[1],sentences_emd3[3])
d6 = cosine(sentences_emd2[1],sentences_emd2[3])
print("SIF: {} tfIdf: {} SimAvg: {}".format(d4, d5, d6))

SIF: 0.369763314724 tfIdf: 0.371883571148 SimAvg: 0.418903470039
SIF: 0.803362444043 tfIdf: 0.79425162077 SimAvg: 0.79425162077


### For visualisation

In [185]:
# for simple average
a = ""
for i in sentences_emd3:
    for s in i:
        a = a + str(s) + '\t'
    a = a + '\n'
with open("record3.tsv", "w") as record_file:
    record_file.write(a)

Simple Average: 
<img src="AVG.png">

In [186]:
# for TF-IDF
a = ""
for i in sentences_emd2:
    for s in i:
        a = a + str(s) + '\t'
    a = a + '\n'
with open("record2.tsv", "w") as record_file:
    record_file.write(a)

TF-IDF: 
<img src="TFIDF.png">

In [187]:
# for SIF
a = ""
for i in sentences_emd1:
    for s in i:
        a = a + str(s) + '\t'
    a = a + '\n'
with open("record1.tsv", "w") as record_file:
    record_file.write(a)

SIF: 
<img src="SIF.png">

### Clearly in all three, s1 and s3 are more closer to each other.