In [216]:
import sklearn
import gensim
import logging

import numpy as np
import pandas as pd

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [245]:
newsgroup = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=True, random_state=42, 
                                                remove=('header', 'footer', 'quotes'))

processed_docs = [gensim.utils.simple_preprocess(x) for x in newsgroup['data']]

In [247]:
X_train, X_test, _, y_test = sklearn.model_selection.train_test_split(processed_docs, newsgroup['target'], 
                                                                      random_state=42, test_size=0.33)

In [None]:
doc2vecc_gensim_020 = gensim.models.word2vec.Word2Vec(X_train, size=100, alpha=0.025, window=5, min_count=3, 
                                                 sample=1e-3, seed=42, workers=4, min_alpha=0.0001, 
                                                 sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                                 iter=30, doc2vecc=0.2)

# 2018-11-27 13:57:23,705 : INFO : training on a 80221560 raw words (64035464 effective words) took 4313.5s, 14845 effective words/s

doc2vecc_gensim_060 = gensim.models.word2vec.Word2Vec(X_train, size=100, alpha=0.025, window=5, min_count=3, 
                                                 sample=1e-3, seed=42, workers=8, min_alpha=0.0001, 
                                                 sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                                 iter=30, doc2vecc=0.6)

In [249]:
doc2vecc_C_02 = gensim.models.KeyedVectors.load_word2vec_format("wordvectors_0.2.txt", binary=False)
# ./doc2vecc -train ../../X_train.txt -word wordvectors.txt -cbow 0 -size 100 -window 5 -negative 5 -hs 0 -sample 0.001 -threads 4 -binary 0 -iter 30 -min-count 3 \
# -sentence-sample 0.2 -alpha 0.025 -save-vocab tmp.vocab -test alldata.txt -output docvectors.txt^C

doc2vecc_C_06 = gensim.models.KeyedVectors.load_word2vec_format("wordvectors_0.6.txt", binary=False)
# ./doc2vecc -train ../../X_train.txt -word wordvectors.txt -cbow 0 -size 100 -window 5 -negative 5 -hs 0 -sample 0.001 -threads 8 -binary 0 -iter 30 -min-count 3 \
# -sentence-sample 0.6 -alpha 0.025 -save-vocab tmp.vocab -test alldata.txt -output docvectors.txt^C


2018-11-28 15:31:10,412 : INFO : loading projection weights from ~/Downloads/wordvectors_0.2.txt


2018-11-28 15:31:14,963 : INFO : loaded (37065, 100) matrix from ~/Downloads/wordvectors_0.2.txt


2018-11-28 15:31:14,965 : INFO : loading projection weights from ~/Downloads/wordvectors_0.6.txt


2018-11-28 15:31:18,638 : INFO : loaded (37065, 100) matrix from ~/Downloads/wordvectors_0.6.txt


In [None]:
# doc2vecc_gensim_020     = gensim.models.keyedvectors.KeyedVectors.load('doc2vecc_wv.wv')
# w2v_model               = gensim.models.keyedvectors.KeyedVectors.load('w2v_model.wv')
# doc2vecc_gensim_060     = gensim.models.keyedvectors.KeyedVectors.load('doc2vecc_gensim_060.wv')

In [222]:
w2v_model = gensim.models.word2vec.Word2Vec(X_train, size=100, alpha=0.025, window=5, min_count=3, 
                                            sample=1e-3, seed=42, workers=8, min_alpha=0.0001, 
                                            sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                            iter=30)

In [251]:
models = {
        "w2v_model" : w2v_model,                        # word2vec model
        "doc2vecc_C_02" : doc2vecc_C_02,                # C code with 0.2 corruption value
        "doc2vecc_gensim_020" : doc2vecc_gensim_020,    # gensim code with 0.2 corruption value
        "doc2vecc_C_06" : doc2vecc_C_06,                # C code with 0.6 corruption value
        "doc2vecc_gensim_060" : doc2vecc_gensim_060}    # gensim code with 0.6 corruption value

In [252]:
def average_vector(model, doc):
    doc_vector = np.zeros(model.vector_size)
    size = 0
    for word in doc:
        if word in model.wv.vocab:
            doc_vector += model.wv[word]
            size += 1
    if size > 0:
        doc_vector /= size
    return doc_vector


def average_vector_batch(model, documents):
    return [average_vector(model, d) for d in documents]

In [253]:
# generate vectors for every model

models_average_vector = {k: average_vector_batch(models[k], X_test) for k in models.keys()}
# del X_test

  """
  


In [254]:
# Create the dataset of triplets: 
# - Document A, 
# - Document B sampled from the same category as A's
# - document C randomly sampled from the dataset with category different than A's

df = pd.DataFrame(y_test, columns=['target'])
df['index'] = df.index

np.random.seed(131)

df_sample = df.sample(frac=0.20)

df_positive = pd\
    .merge(df_sample, df_sample, left_on='target', right_on=df_sample['target'])\
    .drop(['target_x', 'target_y'], axis=1)

# remove same documents
df_positive = df_positive[df_positive['index_x'] != df_positive['index_y']]


In [255]:
# Add the randomly added documents
df_negative = df.sample(len(df_positive), replace=True)
df_negative.columns = ['target_z', 'index_z']

df_positive['target_z'] = df_negative['target_z'].values
df_positive['index_z']  = df_negative['index_z'].values

# remove Documents C with the same category as Document A 
df_positive_negative = df_positive[df_positive['target'] != df_positive['target_z']].drop(['target', 'target_z'], 
                                                                                          axis=1)

In [256]:
df_positive_negative.head(5)

Unnamed: 0,index_x,index_y,index_z
1,5927,3837,2973
2,5927,4293,1758
3,5927,3469,2281
4,5927,65,1678
6,5927,1069,2011


In [257]:
def cosine_similarity(a, b):
    """identical vectors should yield similarity of 1.0"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compare_vectors(docvecs_list, doc_x, doc_y, doc_z):
    """Returns True if doc_y is closer to doc_x than doc_z in the vector space."""  
    return cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_y]) \
           > cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_z])


def compare_distances(docvecs_list, doc_x, doc_y, doc_z):
    """ returns 1.0 if x and y are very close in the vector space, while x and z are very distant."""
    return 1.0 - sigmoid(
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_y]) -
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_z]))

In [258]:
# compatible with python2,3
def get_items(d):
    for k in d:
        yield k, d[k]
        
        
for k,vecs in get_items(models_average_vector):
    true_positives = df_positive_negative.apply(lambda x: compare_vectors(vecs, *x), axis=1).sum()
    avg_distance   = df_positive_negative.apply(lambda x: compare_distances(vecs, *x), axis=1).mean()
    
    print('-'*75)
    print("%s classified:\t\t %i correctly,\t %f%%" % (k,true_positives, true_positives/len(df_positive_negative) * 100))
    print("%s average distance: \t %f" % (k, avg_distance))



---------------------------------------------------------------------------
doc2vecc_C_06 classified:		 43088 correctly,	 57.822271%
doc2vecc_C_06 average distance: 	 0.561585


---------------------------------------------------------------------------
w2v_model classified:		 52490 correctly,	 70.439357%
w2v_model average distance: 	 0.527471


---------------------------------------------------------------------------
doc2vecc_C_02 classified:		 49147 correctly,	 65.953193%
doc2vecc_C_02 average distance: 	 0.627540


---------------------------------------------------------------------------
doc2vecc_gensim_020 classified:		 48222 correctly,	 64.711882%
doc2vecc_gensim_020 average distance: 	 0.522506


---------------------------------------------------------------------------
doc2vecc_gensim_060 classified:		 57402 correctly,	 77.031053%
doc2vecc_gensim_060 average distance: 	 0.555006
