In [3]:
import sklearn
import gensim
import logging

import numpy as np
import pandas as pd

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [122]:
newsgroup = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=True, random_state=42, 
                                                remove=('header', 'footer', 'quotes'))

In [193]:
processed_docs = [gensim.utils.simple_preprocess(x) for x in newsgroup['data']]

X_train, X_test, _, y_test = sklearn.model_selection.train_test_split(processed_docs, newsgroup['target'], 
                                                                      random_state=42, test_size=0.33)

In [None]:
doc2vecc_gensim = gensim.models.word2vec.Word2Vec(X_train, size=100, alpha=0.025, window=5, min_count=3, 
                                                 sample=1e-3, seed=42, workers=4, min_alpha=0.0001, 
                                                 sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                                 iter=30, doc2vecc=0.2)

# 2018-11-27 13:57:23,705 : INFO : training on a 80221560 raw words (64035464 effective words) took 4313.5s, 14845 effective words/s

In [None]:
doc2vecc_C = gensim.models.KeyedVectors.load_word2vec_format("wordvectors.txt", binary=False)

# time ./doc2vecc -train ../../X_train.txt -word wordvectors.txt -cbow 0 -size 100 -window 5 -negative 5 -hs 0 -sample 0.001 -threads 2 -binary 0 -iter 30 -min-count 3 \
# -sentence-sample 0.2 -alpha 0.025 -save-vocab tmp.vocab -test alldata.txt -output docvectors.txt^C

In [141]:
def infer_vectors(model, documents):
    document_vectors = []
    for doc in documents:
        doc_vector = np.zeros(model.vector_size)
        size = 0
        for word in doc:
            if word in model.wv.vocab:
                doc_vector += model.wv[word]
                size += 1
        if size > 0:
            doc_vector /= size    
        document_vectors.append(doc_vector)
    return document_vectors

In [197]:
vectors_doc2vecc_gensim = infer_vectors(doc2vecc_gensim, X_test)
vectors_doc2vecc_C      = infer_vectors(doc2vecc_C, X_test)
del X_test

  import sys
  


In [17]:
# Add the randomly added documents
df_negative = df.sample(len(df_positive), replace=True)
df_negative.columns = ['target_z', 'index_z']

df_positive['target_z'] = df_negative['target_z'].values
df_positive['index_z']  = df_negative['index_z'].values

# remove Documents C with the same category as Document A 
df_positive_negative = df_positive[df_positive['target'] != df_positive['target_z']].drop(['target', 'target_z'], 
                                                                                          axis=1)

In [18]:
df_positive_negative.head(5)

Unnamed: 0,index_x,index_y,index_z
1,5927,3837,2973
2,5927,4293,1758
3,5927,3469,2281
4,5927,65,1678
6,5927,1069,2011


In [133]:
def cosine_similarity(a, b):
    """identical vectors should yield similarity of 1.0"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compare_vectors(docvecs_list, doc_x, doc_y, doc_z):
    """Returns True if doc_y is closer to doc_x than doc_z in the vector space."""  
    return cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_y]) \
           > cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_z])


def compare_distances(docvecs_list, doc_x, doc_y, doc_z):
    """ returns 1.0 if x and y are very close in the vector space, while x and z are very distant."""
    return 1.0 - sigmoid(
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_y]) -
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_z]))

In [198]:
gensim_true_positives   = df_positive_negative.apply(lambda x: compare_vectors(vectors_doc2vecc_gensim, *x), axis=1).sum()
C_true_positives        = df_positive_negative.apply(lambda x: compare_vectors(vectors_doc2vecc_C, *x), axis=1).sum()

gensim_avg_distance = df_positive_negative.apply(lambda x: compare_distances(vectors_doc2vecc_gensim, *x), axis=1).mean()
C_avg_distance = df_positive_negative.apply(lambda x: compare_distances(vectors_doc2vecc_C, *x), axis=1).mean()

In [202]:
print("Gensim classified:\t %i correctly,\t %f%%" % (gensim_true_positives, gensim_true_positives/len(df_positive_negative) * 100))
print("C code classified:\t %i correctly,\t %f%%" % (C_true_positives, C_true_positives/len(df_positive_negative) * 100))
print('-'*75)
print("Gensim average distance: \t %f" % gensim_avg_distance)
print("C code average distance:\t %f" % C_avg_distance)

Gensim classified:	 48222 correctly,	 64.711882%
C code classified:	 49147 correctly,	 65.953193%
---------------------------------------------------------------------------
Gensim average distance: 	 0.522506
C code average distance:	 0.627540
