In [1]:
from sklearn import datasets, model_selection

import gensim
import logging

import numpy as np
import pandas as pd

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [20]:
def print_model(type, workers, doc2vecc):
    """Prints model name + workers and doc2vecc value"""
    return type + "workers=" + str(workers) +  " doc2vecc=" + str(doc2vecc)


def average_vector(model, doc):
    """Constructs a document vector using the average of its words' vectors"""
    doc_vector = np.zeros(model.vector_size)
    size = 0
    for word in doc:
        if word in model.wv.vocab:
            doc_vector += model.wv[word]
            size += 1
    if size > 0:
        doc_vector /= size
    return doc_vector


def average_vector_batch(model, documents):
    """Batch document vector generation"""
    return [average_vector(model, d) for d in documents]


def cosine_similarity(a, b):
    """identical vectors should yield similarity of 1.0"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compare_vectors(docvecs_list, doc_x, doc_y, doc_z):
    """Returns True if doc_y is closer to doc_x than doc_z in the vector space."""  
    return cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_y]) \
           > cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_z])


def compare_distances(docvecs_list, doc_x, doc_y, doc_z):
    """ returns 1.0 if x and y are very close in the vector space, while x and z are very distant."""
    return 1.0 - sigmoid(
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_y]) -
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_z]))

In [3]:
newsgroup = datasets.fetch_20newsgroups(subset="all", shuffle=True, random_state=42, 
                                        remove=('header', 'footer', 'quotes'))

processed_docs = [gensim.utils.simple_preprocess(x) for x in newsgroup['data']]

X_train, X_test, _, y_test = model_selection.train_test_split(processed_docs, newsgroup['target'], 
                                                              random_state=42, test_size=0.33)

In [5]:
# Create the dataset of triplets: 
# - Document A, 
# - Document B sampled from the same category as A's
# - document C randomly sampled from the dataset with category different than A's

df = pd.DataFrame(y_test, columns=['target'])
df['index'] = df.index

np.random.seed(131)

df_sample = df.sample(frac=0.20)

df_positive = pd\
    .merge(df_sample, df_sample, left_on='target', right_on=df_sample['target'])\
    .drop(['target_x', 'target_y'], axis=1)

# remove same documents
df_positive = df_positive[df_positive['index_x'] != df_positive['index_y']]

# Add the randomly added documents
df_negative = df.sample(len(df_positive), replace=True)
df_negative.columns = ['target_z', 'index_z']

df_positive['target_z'] = df_negative['target_z'].values
df_positive['index_z']  = df_negative['index_z'].values

# remove Documents C with the same category as Document A 
df_positive_negative = df_positive[df_positive['target'] != df_positive['target_z']].drop(['target', 'target_z'], axis=1)

In [268]:
# Models training - gensim
models = []
for i in np.arange(0, 1.2, 0.2):
    model = gensim.models.word2vec.Word2Vec(X_train[1:5], size=100, alpha=0.025, window=5, min_count=3, 
                                            sample=1e-3, seed=42, workers=6, min_alpha=0.0001, 
                                            sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                            iter=30, doc2vecc=i)
    model.wv.save("~/trained_models/doc2vec_%f.wv" % i)
    models.append((print_model("Gensim:", 6, i), model))

In [35]:
# Models training - C code from doc2vecc

# for i in `seq 0.2 0.2 1.0`;
# do
#     echo doc2vecc_$i
#     time ./doc2vecc   -train ../../X_train.txt -word wordvectors_$i.txt -cbow 0 
#                       -size 100 -window 5 -negative 5 -hs 0 -sample 0.001 -threads 6 -binary 0 -iter 30 -min-count 3 \
#                       -sentence-sample $i -alpha 0.025 -save-vocab tmp.vocab -test alldata.txt 
#                       -output docvectors.txt
# done

for i in np.arange(0.2, 1.2, 0.2):
    model = gensim.models.KeyedVectors.load_word2vec_format("wordvectors_%s.txt" % i, binary=False)
    models.append((print_model("C:", 6, i), model))

2018-11-29 15:49:19,637 : INFO : loading projection weights from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.2.txt


2018-11-29 15:49:23,324 : INFO : loaded (37065, 100) matrix from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.2.txt


2018-11-29 15:49:23,325 : INFO : loading projection weights from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.4.txt


2018-11-29 15:49:27,027 : INFO : loaded (37065, 100) matrix from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.4.txt


2018-11-29 15:49:27,028 : INFO : loading projection weights from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.6.txt


2018-11-29 15:49:30,494 : INFO : loaded (37065, 100) matrix from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.6.txt


2018-11-29 15:49:30,494 : INFO : loading projection weights from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.8.txt


2018-11-29 15:49:33,967 : INFO : loaded (37065, 100) matrix from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_0.8.txt


2018-11-29 15:49:33,968 : INFO : loading projection weights from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_1.0.txt


2018-11-29 15:49:37,864 : INFO : loaded (37065, 100) matrix from /Users/ahmed.elsafty/Downloads/trained_models/wordvectors_1.0.txt


In [37]:
# generate document vectors for every model
models_average_vector = [(k, average_vector_batch(model, X_test)) for k, model in models]

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [39]:
# Prediction accuracy and average distance         
for k, vecs in models_average_vector:
    true_positives = df_positive_negative.apply(lambda x: compare_vectors(vecs, *x), axis=1).sum()
    avg_distance   = df_positive_negative.apply(lambda x: compare_distances(vecs, *x), axis=1).mean()
    
    print('-'*75)
    print("%s classified:\t\t %i correctly,\t %f%%" % (k,true_positives, true_positives/len(df_positive_negative) * 100))
    print("%s average distance: \t %f" % (k, avg_distance))


---------------------------------------------------------------------------
Gensim: workers=6 doc2vecc=0.0 classified:		 52498 correctly,	 70.450093%
Gensim: workers=6 doc2vecc=0.0 average distance: 	 0.527342


---------------------------------------------------------------------------
Gensim: workers=6 doc2vecc=0.2 classified:		 57388 correctly,	 77.012265%
Gensim: workers=6 doc2vecc=0.2 average distance: 	 0.553244


---------------------------------------------------------------------------
Gensim: workers=6 doc2vecc=0.4 classified:		 57383 correctly,	 77.005556%
Gensim: workers=6 doc2vecc=0.4 average distance: 	 0.554434


---------------------------------------------------------------------------
Gensim: workers=6 doc2vecc=0.6 classified:		 57354 correctly,	 76.966639%
Gensim: workers=6 doc2vecc=0.6 average distance: 	 0.554916


---------------------------------------------------------------------------
Gensim: workers=6 doc2vecc=0.8 classified:		 57141 correctly,	 76.680802%
Gensim: workers=6 doc2vecc=0.8 average distance: 	 0.554535


---------------------------------------------------------------------------
Gensim: workers=6 doc2vecc=1.0 classified:		 56961 correctly,	 76.439250%
Gensim: workers=6 doc2vecc=1.0 average distance: 	 0.554650


---------------------------------------------------------------------------
C:workers=6 doc2vecc=0.2 classified:		 49652 correctly,	 66.630881%
C:workers=6 doc2vecc=0.2 average distance: 	 0.632914


---------------------------------------------------------------------------
C:workers=6 doc2vecc=0.4 classified:		 44381 correctly,	 59.557422%
C:workers=6 doc2vecc=0.4 average distance: 	 0.572672


---------------------------------------------------------------------------
C:workers=6 doc2vecc=0.6 classified:		 43069 correctly,	 57.796774%
C:workers=6 doc2vecc=0.6 average distance: 	 0.561308


---------------------------------------------------------------------------
C:workers=6 doc2vecc=0.8 classified:		 42606 correctly,	 57.175448%
C:workers=6 doc2vecc=0.8 average distance: 	 0.557564




---------------------------------------------------------------------------
C:workers=6 doc2vecc=1.0 classified:		 41866 correctly,	 56.182399%
C:workers=6 doc2vecc=1.0 average distance: 	 0.557967
