In [3]:
import sklearn
import gensim
import logging

import numpy as np
import pandas as pd

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [122]:
newsgroup = sklearn.datasets.fetch_20newsgroups(subset="all", shuffle=True, random_state=42, 
                                                remove=('header', 'footer', 'quotes'))

In [139]:
processed_docs = [gensim.utils.simple_preprocess(x) for x in newsgroup['data']]

X_train, X_test, _, y_test = sklearn.model_selection.train_test_split(processed_docs, newsgroup['target'], 
                                                                      random_state=42, test_size=0.33)

In [None]:
doc2vecc_model = gensim.models.word2vec.Word2Vec(X_train, size=100, alpha=0.025, window=5, min_count=3, 
                                                 sample=1e-3, seed=42, workers=4, min_alpha=0.0001, 
                                                 sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                                 iter=30, doc2vecc=0.2)

# 2018-11-27 13:57:23,705 : INFO : training on a 80221560 raw words (64035464 effective words) took 4313.5s, 14845 effective words/s

In [None]:
word2vec_model = gensim.models.word2vec.Word2Vec(X_train, size=100, alpha=0.025, window=5, min_count=3, 
                                                 sample=1e-3, seed=42, workers=4, min_alpha=0.0001, 
                                                 sg=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=0, 
                                                 iter=30)

# 2018-11-27 14:05:27,024 : INFO : training on a 80221560 raw words (64033783 effective words) took 252.7s, 253358 effective words/s

In [141]:
def infer_vectors(model, documents):
    document_vectors = []
    for doc in documents:
        doc_vector = np.zeros(model.vector_size)
        size = 0
        for word in doc:
            if word in model.wv.vocab:
                doc_vector += model.wv[word]
                size += 1
        if size > 0:
            doc_vector /= size    
        document_vectors.append(doc_vector)
    return document_vectors

In [169]:
X_test_vectors_doc2vecc = infer_vectors(doc2vecc_model, X_test)
X_test_vectors_word2vec = infer_vectors(word2vec_model, X_test)
del X_test

In [16]:
# Create the dataset, triplets, (Document A, Document B from the same category, document C randomly sampled from the dataset).

df = pd.DataFrame(y_test, columns=['target'])
df['index'] = df.index

np.random.seed(42)
df_sample = df.sample(frac=0.20)

df_positive = pd\
    .merge(df_sample, df_sample, left_on='target', right_on=df_sample['target'])\
    .drop(['target_x', 'target_y'], axis=1)

# remove identical documents
df_positive = df_positive[df_positive['index_x'] != df_positive['index_y']] 

In [17]:
# Add the randomly added documents
df_negative = df.sample(len(df_positive), replace=True)
df_negative.columns = ['target_z', 'index_z']

df_positive['target_z'] = df_negative['target_z'].values
df_positive['index_z']  = df_negative['index_z'].values

# remove Documents C with the same category as Document A 
df_positive_negative = df_positive[df_positive['target'] != df_positive['target_z']].drop(['target', 'target_z'], 
                                                                                          axis=1)

In [18]:
df_positive_negative.head(5)

Unnamed: 0,index_x,index_y,index_z
1,5927,3837,2973
2,5927,4293,1758
3,5927,3469,2281
4,5927,65,1678
6,5927,1069,2011


In [133]:
def cosine_similarity(a, b):
    """identical vectors should yield similarity of 1.0"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compare_vectors(docvecs_list, doc_x, doc_y, doc_z):
    """Returns True if doc_y is closer to doc_x than doc_z"""  
    return cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_y]) \
           > cosine_similarity(docvecs_list[doc_x], docvecs_list[doc_z])


def compare_distances(docvecs_list, doc_x, doc_y, doc_z):
    """ returns 1.0 if x and y are identical, while x and z are very not indentical"""
    return 1.0 - sigmoid(
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_y]) -
        np.linalg.norm(docvecs_list[doc_x] - docvecs_list[doc_z]))

In [None]:
doc2vecc_correctly_classified = df_positive_negative.apply(lambda x: compare_vectors(X_test_vectors_doc2vecc, *x), axis=1).sum()
word2vec_correctly_classified = df_positive_negative.apply(lambda x: compare_vectors(X_test_vectors_word2vec, *x), axis=1).sum()

doc2vecc_avg_distance = df_positive_negative.apply(lambda x: compare_distances(X_test_vectors_doc2vecc, *x), axis=1).mean()
word2vec_avg_distance = df_positive_negative.apply(lambda x: compare_distances(X_test_vectors_word2vec, *x), axis=1).mean()

In [167]:
print("Doc2vecC with 0.2 corruption classified:\t %i correctly,\t %f%%" % (doc2vecc_correctly_classified, doc2vecc_correctly_classified/len(df_positive_negative) * 100))
print("Word2Vec classified:\t\t\t %i correctly,\t %f%%" % (word2vec_correctly_classified, word2vec_correctly_classified/len(df_positive_negative) * 100))
print('-'*75)
print("Doc2vecC with 0.2 corruption average distance: \t %f" % doc2vecc_avg_distance)
print("Word2Vec average distance:\t\t\t %f" % word2vec_avg_distance)


Doc2vecC with 0.2 corruption classified:	 48222 correctly,	 64.711882%
Word2Vec classified:			 52490 correctly,	 70.439357%
---------------------------------------------------------------------------
Doc2vecC with 0.2 corruption average distance: 	 0.522506
Word2Vec average distance:			 0.527471
