In [205]:
from collections import Counter, namedtuple
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer 

import time

In [137]:
class Document():
    def __init__(self, words):
        self.words = words
        self.vecs = []
        self.weights = []
        self.weights_sum = 0
    
    def setvecs(self, model):
        for i, w in enumerate(self.words):
            self.vecs.append(model[w])
    
    def setweights(self):
        total, counts = self._getcounts(self.words)
        for w in self.words:
            self.weights.append(counts[w] / total)
        self.weights_sum = sum(self.weights)
    
    def getweight(self, word):
        if word in self.words:
            return self.weights[self.words.index(word)]
        else:
            return None
        
    def getvec(self, word):
        if word in self.words:
            return self.vecs[self.words.index(word)]
        else:
            return None
    
    def _getcounts(self, words):
        counts = Counter(words)
        total = sum(counts.values())
        return total, counts        

In [172]:
class DocPair():
    def __init__(self, doc1, doc2):
        self.doc1 = doc1
        self.doc2 = doc2
        self.vecs = []
    
    def _getvocab(self):
        vocab = list(set(self.doc1.words + self.doc2.words))
        print(vocab)
        return len(vocab), vocab
    
    def getsignature(self):
        n, vocab = self._getvocab()
        self.sig1 = [self.doc1.getweight(w) if w in self.doc1.words else 0.0 for w in vocab]
        self.sig2 = [self.doc2.getweight(w) if w in self.doc2.words  else 0.0 for w in vocab]
        
    def getvecs(self):
        

In [199]:
doc1.weights[1]

0.25

In [200]:
doc1.weights[doc1.words.index('speaks')]

0.25

In [189]:
[w for w in vocab if w in doc1.words]

['illinois', 'speaks', 'obama', 'media']

In [201]:
[doc1.getweight(w) if w in doc1.words else 0.0 for w in vocab]

[0.25, 0.25, 0.0, 0.0, 0.25, 0.0, 0.25, 0.0]

In [203]:
docpair = DocPair(doc1, doc2)
docpair.getsignature()

['illinois', 'speaks', 'president', 'greets', 'obama', 'press', 'media', 'chicago']


In [204]:
docpair.sig1

[0.25, 0.25, 0.0, 0.0, 0.25, 0.0, 0.25, 0.0]

In [120]:
test = list([1,2,3])
test.extend([4,5])
set(test)

{1, 2, 3, 4, 5}

In [48]:
def read_1w_corpus(name, sep="\t"):
    for line in open(name):
        yield line.split(sep)

In [6]:
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
vocabulary = set(model.vocab)
relevant_words = [word for (word, count) in read_1w_corpus('count_1w.txt') if word in vocabulary]
model_reduced = model[[w for w in relevant_words]]

Loading GoogleNews Vectors


In [7]:
%time model.init_sims(replace=True)

CPU times: user 3.25 s, sys: 4.25 s, total: 7.5 s
Wall time: 8.49 s


In [186]:
# Obama speaks to the media in Illinois 
sentence1_words = ['obama', 'speaks', 'media', 'illinois']

# The President greets the press in Chicago. 
sentence2_words = ['president', 'greets', 'press', 'chicago']

In [202]:
doc1 = Document(sentence1_words)
doc1.setvecs(model)
doc1.setweights()

doc2 = Document(sentence2_words)
doc2.setvecs(model)
doc2.setweights()

In [71]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikaelbrunila/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

import numpy as np

sw = set(stopwords.words('english'))

doc1 = " ".join(sentence1_words)
doc2 = " ".join(sentence2_words)
nbow = CountVectorizer(stop_words = sw)
nbow.fit([doc for doc in [doc1, doc2]])

vocabulary = set(model.index2word)
names = nbow.get_feature_names()

In [93]:
v1, v2 = nbow.transform([doc1, doc2])   
index = np.union1d(v1.indices, v2.indices)

v1  = v1.toarray().ravel()
v2  = v2.toarray().ravel()

n = len(index) 
index_map = [(index[i], i) for i in range(n) if names[index[i]] in vocabulary]
source    = np.zeros(n)
sink      = np.zeros(n)
vecs      = np.zeros(shape = (n, 300))
    
for i, j in index_map:
    source[j] = v1[i]
    sink[j]   = v2[i]
    vecs[j]   = model[names[i]]

sum_source = sum(source)
sum_sink = sum(sink)
if sum_source == 0:
    sum_source = 1
if sum_sink == 0:
    sum_sink = 1


In [98]:
source / sum_source

array([0.  , 0.  , 0.25, 0.25, 0.25, 0.  , 0.  , 0.25])

In [100]:
vecs.shape

(8, 300)

In [87]:
index

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32)