In [261]:
from sklearn import datasets, model_selection

import gensim
import logging

import numpy as np
import pandas as pd

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [263]:
def print_model(m):
    """Prints model name + workers and doc2vecc value"""
    return str(m) + " workers=" + str(m.workers) +  " doc2vecc=" + str(m.doc2vecc)


def average_vector(model, doc):
    """Constructs a document vector using the average of its words' vectors"""
    doc_vector = np.zeros(model.vector_size)
    size = 0
    for word in doc:
        if word in model.wv.vocab:
            doc_vector += model.wv[word]
            size += 1
    if size > 0:
        doc_vector /= size
    return doc_vector


def average_vector_batch(model, documents):
    """Batch document vector generation"""
    return [average_vector(model, d) for d in documents]


def cosine_similarity(a, b):
    """identical vectors should yield similarity of 1.0"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compare_vectors(docvecs_list, doc_x, doc_y, doc_z):
    """Returns True if doc_y is closer to doc_x than doc_z in the vector space."""  
    return cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_y]) \
           > cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_z])


def compare_distances(docvecs_list, doc_x, doc_y, doc_z):
    """ returns 1.0 if x and y are very close in the vector space, while x and z are very distant."""
    return 1.0 - sigmoid(
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_y]) -
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_z]))

In [264]:
newsgroup = datasets.fetch_20newsgroups(subset="all", shuffle=True, random_state=42, 
                                        remove=('header', 'footer', 'quotes'))

processed_docs = [gensim.utils.simple_preprocess(x) for x in newsgroup['data']]

X_train, X_test, _, y_test = model_selection.train_test_split(processed_docs, newsgroup['target'], 
                                                              random_state=42, test_size=0.33)

In [265]:
# Create the dataset of triplets: 
# - Document A, 
# - Document B sampled from the same category as A's
# - document C randomly sampled from the dataset with category different than A's

df = pd.DataFrame(y_test, columns=['target'])
df['index'] = df.index

np.random.seed(131)

df_sample = df.sample(frac=0.20)

df_positive = pd\
    .merge(df_sample, df_sample, left_on='target', right_on=df_sample['target'])\
    .drop(['target_x', 'target_y'], axis=1)

# remove same documents
df_positive = df_positive[df_positive['index_x'] != df_positive['index_y']]

# Add the randomly added documents
df_negative = df.sample(len(df_positive), replace=True)
df_negative.columns = ['target_z', 'index_z']

df_positive['target_z'] = df_negative['target_z'].values
df_positive['index_z']  = df_negative['index_z'].values

# remove Documents C with the same category as Document A 
df_positive_negative = df_positive[df_positive['target'] != df_positive['target_z']].drop(['target', 'target_z'], axis=1)

In [268]:
# Models training 

models = []
for i in np.arange(0, 1.2, 0.2):
    model = gensim.models.word2vec.Word2Vec(X_train[1:5], size=100, alpha=0.025, window=5, min_count=3, 
                                                 sample=1e-3, seed=42, workers=6, min_alpha=0.0001, 
                                                 sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                                 iter=30, doc2vecc=i)
    model.wv.save("~/trained_models/doc2vec_%f.wv" % i)
    models.append((print_model(model), model))

In [None]:
# generate document vectors for every model
models_average_vector = [(k, average_vector_batch(model, X_test)) for k, model in models]

In [None]:
# Prediction accuracy and average distance         
for k, vecs in models_average_vector:
    true_positives = df_positive_negative.apply(lambda x: compare_vectors(vecs, *x), axis=1).sum()
    avg_distance   = df_positive_negative.apply(lambda x: compare_distances(vecs, *x), axis=1).mean()
    
    print('-'*75)
    print("%s classified:\t\t %i correctly,\t %f%%" % (k,true_positives, true_positives/len(df_positive_negative) * 100))
    print("%s average distance: \t %f" % (k, avg_distance))


---------------------------------------------------------------------------

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.0 classified:          52498 correctly,        70.450093%

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.0 average distance:    0.527342

---------------------------------------------------------------------------

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.2 classified:          57388 correctly,        77.012265%

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.2 average distance:    0.553244

---------------------------------------------------------------------------

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.4 classified:          57383 correctly,        77.005556%

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.4 average distance:    0.554434

---------------------------------------------------------------------------

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.6 classified:          57354 correctly,        76.966639%

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.6 average distance:    0.554916

---------------------------------------------------------------------------

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.8 classified:          57141 correctly,        76.680802%

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=0.8 average distance:    0.554535

---------------------------------------------------------------------------

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=1.0 classified:          56961 correctly,        76.439250%

Word2Vec(vocab=37064, size=100, alpha=0.025) workers=6 doc2vecc=1.0 average distance:    0.554650
