# Doc2vec Training

Install packages to environment.

TODO:
Run Doc2Vec analysis on sentiment of documents
Experiment with incorporating summarization
Choose the best Doc2Vec model

Testing changes



In [None]:
!pip install -U -q PyDrive
!pip install gensim
# Used by Gensim
!pip install testfixtures
!pip install scikit-learn


Run the following snippet to load article json files from Google Drive. Utility function save_to_drive can be used to save files to Google Drive. Useful for training models in Google Collab.

In [None]:
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials
# from google.colab import auth
# import io
# from googleapiclient.http import MediaIoBaseDownload
# from googleapiclient.http import MediaFileUpload
# 
# auth.authenticate_user()
# from googleapiclient.discovery import build
# drive_service = build('drive', 'v3')
# 
# 
# 
# files_to_load = list()
# 
# files_to_load.append({
#     "file_name": "all_articles.json",
#     "id": "19ErkUdKHwJO46T3u_LnoUhU-Ol4Om90W",
#     "is_binary": 0
# })
# 
# 
# def download_from_drive(file_id):
# 
#     request = drive_service.files().get_media(fileId=file_id)
#     downloaded = io.BytesIO()
#     downloader = MediaIoBaseDownload(downloaded, request)
#     done = False
#     while done is False:
#         # _ is a placeholder for a progress object that we ignore.
#         # (Our file is small, so we skip reporting progress.)
#         _, done = downloader.next_chunk()
# 
#     downloaded.seek(0)
#     read = downloaded.read()
#     return read
# 
# 
# def save_to_drive(filename):
# 
#     file_metadata = {
#         'name': filename,
#         'mimeType': 'text/plain'
#     }
#     media = MediaFileUpload(filename,
#                             mimetype='text/plain',
#                             resumable=True)
#     created = drive_service.files().create(body=file_metadata,
#                                            media_body=media,
#                                            fields='id').execute()
#     print('File ID: {}'.format(created.get('id')))
# 

# # Download all the Google Drive files
# for file in files_to_load:
#     print(file)
# 
#     # load document
#     doc = download_from_drive(file["id"])
# 
#     text_file = str()
#     if file["is_binary"]:
#         text_file = open(file["file_name"], "wb")
# 
#     else:
#         doc = doc.decode("utf-8")
#         text_file = open(file["file_name"], "w")
# 
#     text_file.write(doc)
#     text_file.close()
# 
#     print("loaded: " + file["file_name"])


print("Files loaded")

## Finding optimal Doc2vec model for IMDB article sentiment prediction

Download the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

Load dataset of articles.

In [7]:
from gensim.utils import to_unicode
from nltk.tokenize import word_tokenize
import collections
import tarfile
import re

import random
from random import shuffle
random.seed(10)


number_of_articles = 100000

SentimentDocument = collections.namedtuple('SentimentDocument', 'words tags split sentiment')

def create_sentiment_document(name, text, index):
    
    # Split the name of a movie review file into train/test, and +/- sentiment
    _, split, sentiment_str, _ = name.split('/')
    sentiment = {'pos': 1.0, 'neg': 0.0, 'unsup': None}[sentiment_str]

    if sentiment is None:
        split = 'extra'

    tokens = word_tokenize(to_unicode(text))
    return SentimentDocument(tokens, [index], split, sentiment)

def extract_documents(imdb_file):
    
    index = 0

    with tarfile.open(imdb_file, mode='r:gz') as tar:
        for member in tar.getmembers():
            if re.match(r'aclImdb/(train|test)/(pos|neg|unsup)/\d+_\d+.txt$', member.name):
                member_bytes = tar.extractfile(member).read()
                member_text = member_bytes.decode('utf-8', errors='replace')
                assert member_text.count('\n') == 0
                yield create_sentiment_document(member.name, member_text, index)
                index += 1
                
print("Loading documents ...")             
alldocs = list(extract_documents('aclImdb_v1.tar.gz'))
shuffle(alldocs)
alldocs = alldocs[:number_of_articles]

print(f"Total docs {len(alldocs)}")


Based on the [“Distributed Representations of Sentences and Documents”](http://cs.stanford.edu/~quocle/paragraph_vector.pdf) paper [Radim Hurek's](https://radimrehurek.com/gensim/auto_examples/howtos/run_doc2vec_imdb.html) reproduction of the experiment the `Doc2Vec(dbow,d100,n5,mc2,t8` model produces the lowest error rate in sentiment classification (10.3%).

This model is a concatenation of the Distributed Bag of Words model model and the DM/mean model.

To analyze whether further preprocessing of the text improves the quality of the embeddings we will train a model model for Text that went through a preprocessing pipeline containing:
- Frequency based summarization, lematization, stopword removal, and contraction expansion
- Lematization, stopword removal
- Contraction expansion, stopword removal
- Raw text

To evaluate the quality of the embeddings, we will check the accuracy of the sentiment analysis model trained to predict the sentiment based on the document embedding.

Define all of the preproccesing functions.

In [17]:
from string import punctuation
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
nltk.download('stopwords')


def substitute_contraction(word):
    """
    Substitutes the contraction with expanded form
    Substates non-ascii quotes
    :param word:
    :return:
        The substituted contraction or the original token
    """

    # Replace unicode commas
    punctuation = {0x2018: 0x27, 0x2019: 0x27, 0x201C: 0x22, 0x201D: 0x22}
    w = word.translate(punctuation)

    contractions = get_contraction_dict()

    if w in contractions.keys():
        subbed = contractions[w]
        return subbed
    else:
        return w
    
def get_contraction_dict():
    return {
        "ain't": "is not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he has",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "I'd": "I would",
        "I'd've": "I would have",
        "I'll": "I will",
        "I'll've": "I shall have",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it shall have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that had",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have"
    }


class LanguageProcessor:
    def __init__(self):
        """
        All of the natural processing functionality
        """
        
        self.lemmatizer = WordNetLemmatizer()

        self.punctuation_list = [c for c in punctuation]
        
        # TODO: make this a set
        self.STOPWORDS = stopwords.words()

        # TODO: Get more complete list and read it from file
        MORESTOP = ['will', 'thing', 'n\'t', '\'\'', '\'s', '``', '\'re', '\'', 'mr', 'mr.', '--', '...', '..', '->', '\'.',
                    '\' \'', ' .', '’',
                    '“', '”', "", "\n"]
        self.STOPWORDS.extend(MORESTOP)

    def substitute_contractions(self, words):
        """
        Loop through words and sub contractions
        :param text:
        :return:
        """
        subbed = []
        for word in words:
            subbed.append(substitute_contraction(word))
        return subbed

    def get_non_stopwords(self, words, substitute_contractions=True, stem=True):
        """
        Returns a list of lowercase non-stopwords in the text.
        non-stopwords are anything that is not punctuation or stopwords
        Numerical values are NOT FILTERED OUT
        :param text:
        :param stem:
        :return:
        """

        if substitute_contractions:
            words = self.substitute_contractions(words)

        non_stop_words = []

        # Loop through tokens
        for word in words:
            # Slowing things down
            token = self.remove_punctuation(word.lower())
            if token not in self.STOPWORDS:
                # Check if token contains punctuation
                if token not in self.punctuation_list:
                    if stem:
                        non_stop_words.append(self.get_word_lemma(token))
                    else:
                        non_stop_words.append(token)

        return non_stop_words
    

    def get_word_lemma(self, word):
        """
        Helper to allows customization to stemming process, like checking for trailing e's
        :param word:
        :return:
        """
        lema = self.lemmatizer.lemmatize(word)
        return lema

    def remove_punctuation(self, text):
        """
        Helper function to remove all non-acsii charcters
        :param text:
        :return:
        """
        return ''.join([i if ord(i) < 128 else '' for i in text])

    def is_text_token(self, token):
        """
        Checks if not punc or numerical, or non-acsii
        :param token:
        :return:
        """

        if len(token) == 1:
            if ord(token) < 128 and token not in punctuation and not token.isdigit():
                return True
            else:
                return False

        else:
            if not token.isdigit():
                return True
            else:
                return False
    
# Perform each type of preprocesing
# TODO: Add "sum+lem+stop+con"
corpora = ["lem+stop+con", "stop+con", "stop", "none"]
processed_texts = {cor:[] for cor in corpora}
processor = LanguageProcessor()


# # Reload processed text
# import pickle
# # Loading files from pickle
# processed_texts = pickle.load(open("processed_text.pkl", "rb"))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/milanarezina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Prepare the documents for each type of preprocessing.
- lem = lematization
- stop = stopword removal
- con = expanded contractions

In [None]:
print("Pre-processing Text...")

print("Processing lem+stop+con")
for doc in alldocs:
    words = processor.get_non_stopwords(doc.words, substitute_contractions=True, stem=True)
    doc2 = SentimentDocument(words, doc.tags, doc.split, doc.sentiment)
    processed_texts["lem+stop+con"].append(doc2)
    
print("Processing stop+con")
for doc in alldocs:
    words = processor.get_non_stopwords(doc.words, substitute_contractions=True, stem=False)
    doc2 = SentimentDocument(words, doc.tags, doc.split, doc.sentiment)
    processed_texts["stop+con"].append(doc2)
    
print("Processing stop")
for doc in alldocs:
    words = processor.get_non_stopwords(doc.words, substitute_contractions=False, stem=False)
    doc2 = SentimentDocument(words, doc.tags, doc.split, doc.sentiment)
    processed_texts["stop"].append(doc2)

processed_texts["none"] = alldocs

print("Completed Pre-processing")

Setup for evaluating models.

In [12]:
from sklearn.linear_model import LogisticRegression
import numpy as np

from gensim.models.doc2vec import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
import multiprocessing

# Keep track of the error rates for each model
error_rates = {}

def logistic_regression_predictor(X, y):
    """
    Return the predictor after fitting a model on embeddings and sentiment class
    :param X: 
        Embeddings
    :param y: 
        Sentiment class
    :return: 
    """
    clf = LogisticRegression(random_state=0, verbose=True).fit(X, y)
    return clf


def model_error_rate(doc2vec_model, train, test):
    """
    Test error rate of regression model that uses the doc2vec embeddings to predict sentiment class
    :param doc2vec_model: 
    :param train: 
    :param test: 
    :return: 
    """
    
    train_y = [doc.sentiment for doc in train_docs]
    train_x = [doc2vec_model.docvecs[doc.tags[0]] for doc in train_docs]
    test_x = [doc2vec_model.docvecs[doc.tags[0]] for doc in test_docs]
    test_y = [doc.sentiment for doc in test]

    print("Sample Data", train_x[:1], train_y[:1])
    print(f"""Train / test data breakdown:
           Train positive sentiment samples {train_y.count(1.0)} out of {len(train_y)}
           Test positive sentiment samples {test_y.count(1.0)} out of {len(test_y)}""")
    
    predictor = logistic_regression_predictor(train_x, train_y)
    test_predictions = predictor.predict(test_x)

    corrects = sum(np.rint(test_predictions) == test_y)
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    
    return error_rate, errors, len(test_predictions), predictor
    

    

# Common Doc2vec configuration
common_kwargs = dict(
    vector_size=100, epochs=20, min_count=2,
    sample=0, workers=multiprocessing.cpu_count(), negative=5, hs=0,
)


# Doc2vec models for each type of preproccesing
models_by_corpora = {}

Train and evalute a PV-DBOW (paragraph vector distributed bag of words) model for each of the types of preprocessing.

In [15]:
for model in corpora:
    
    # TODO: Concatenated doc2vec model perform slightly better, however it is missing
    # tags property in it's implementation.
    # 2nd best model used
    models_by_corpora[model] = Doc2Vec(dm=0, **common_kwargs)
    
    
print("Training Doc2Vec models ...")

# Evaluate Doc2vec for each type of preproccesing
for corpus in corpora:
    
    model = models_by_corpora[corpus]
    docs = processed_texts[corpus]
    
    print("-"*20)
    print(f"Training model on: {corpus} documents ...")
    
    # Split into train / test sets
    train_docs = [doc for doc in docs if doc.split == 'train']
    test_docs = [doc for doc in docs if doc.split == 'test']
    
    model.build_vocab(docs)
    model.train(docs, total_examples=len(docs), epochs=model.epochs)

    err_rate, err_count, test_count, predictor = model_error_rate(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    
    print(f"Error rate: {err_rate} for model {str(model)} trained on {corpus}")
    



Training Doc2Vec models ...
--------------------
Model trained on: lem+stop+con
Sample Data [array([ 0.19048014,  0.18574472,  0.19675452,  0.34363356, -0.05429327,
       -0.34967002, -0.9629098 ,  0.5046224 ,  0.3553929 , -0.02000841,
        0.42241308,  0.31711754, -0.1987363 , -0.11315008, -0.06220058,
       -0.0768768 ,  0.22785848,  0.2949146 , -0.39035028,  0.26064464,
       -0.11486331,  0.13627096,  0.38031614,  0.03657763,  0.4365639 ,
       -0.5859031 , -0.7461953 ,  0.551983  , -0.7311374 ,  0.46559998,
       -0.38593325, -0.44615978, -0.30371758, -0.05886084,  0.11023851,
       -0.15133354,  0.37857378, -0.35254526, -0.05427878, -0.0111264 ,
       -0.5489169 ,  0.41485846,  0.50775194,  0.36012176, -0.171678  ,
        0.24442673, -0.64688766, -0.28453165, -0.38347286,  0.4489134 ,
       -0.25286022,  0.00631496,  0.04119195, -0.15534724,  0.68830603,
       -0.23483275,  0.14449677, -0.4099465 ,  0.12440899,  0.09973439,
       -0.0247636 ,  0.02926677, -0.5410340

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


Error rate: 0.10584 for model Doc2Vec(dbow,d100,n5,mc2,t4) trained on lem+stop+con
--------------------
--------------------
Model trained on: stop+con
Sample Data [array([-0.37394282,  0.14541703, -0.1181206 ,  0.32588914,  0.00417809,
       -0.40655154, -0.8618801 , -0.17082322,  0.35712767, -0.34439042,
        0.36993855,  0.5819138 , -0.54412836,  0.24266304,  0.2662939 ,
        0.0190639 ,  0.83017415,  0.12247381, -0.5949772 ,  0.12477303,
        0.09859015,  0.004922  ,  0.3735908 ,  0.01978863, -0.05533889,
       -0.4187592 , -0.5312697 ,  0.42291212, -0.19113207,  0.11957824,
       -0.2275183 , -0.17485602, -0.01526071, -0.14402497,  0.3751672 ,
       -0.10791633,  0.03592143, -0.5977169 , -0.23647137, -0.16427007,
       -0.25056264,  0.31526288,  0.38313723,  0.32543314,  0.15905464,
        0.22170836, -0.15616153, -0.2102895 , -0.4667185 ,  0.3467459 ,
       -0.33510837, -0.29897165,  0.3037702 , -0.7112232 ,  0.32905924,
        0.662519  ,  0.17032981, -0.1178364

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


Error rate: 0.10372 for model Doc2Vec(dbow,d100,n5,mc2,t4) trained on stop+con
--------------------
--------------------
Model trained on: stop
Sample Data [array([ 0.17761402, -0.26420492,  0.361937  ,  0.26289362, -0.4712735 ,
       -0.38532922, -0.51815885, -0.04005979,  0.1153435 , -0.2119986 ,
        0.25628793,  0.35129106, -0.27893662, -0.20348997,  0.6807101 ,
       -0.01895518,  0.8363311 ,  0.15891616, -0.5509589 , -0.30133876,
        0.1803388 ,  0.19554168,  0.2696354 , -0.12448578,  0.32309046,
       -0.32536864, -0.23913145,  0.44343346, -0.2880625 ,  0.37692592,
        0.18396729,  0.04414753, -0.27661565,  0.590954  ,  0.4049222 ,
        0.22829579, -0.06214761, -0.34437808,  0.16237988, -0.03092983,
       -0.25630778,  0.39341688,  0.09050135, -0.02627868, -0.34114236,
        0.27708352, -0.38553795, -0.2610582 , -0.25584152,  0.34392452,
       -0.53384113,  0.51197445,  0.3515095 , -0.73996824,  0.17246945,
        0.68162835,  0.23187435, -0.48883978,  0.21

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


Error rate: 0.10392 for model Doc2Vec(dbow,d100,n5,mc2,t4) trained on stop
--------------------
--------------------
Model trained on: none
Sample Data [array([-3.38244528e-01, -2.18791798e-01, -2.18285546e-01,  4.88449216e-01,
        2.68145382e-01, -2.59598464e-01, -1.01595536e-01, -4.23042983e-01,
       -3.71159554e-01, -2.55106032e-01,  2.21404284e-01, -2.24932402e-01,
       -1.24065077e+00, -5.14121614e-02, -2.08528668e-01, -9.39034671e-02,
        1.84628502e-01, -4.37048882e-01, -1.55302763e-01, -7.63706982e-01,
       -1.42467797e-01, -3.85564417e-01, -9.14833415e-03, -5.12101129e-02,
       -1.68125749e-01, -1.40146643e-01, -5.10644257e-01, -1.15374245e-01,
       -3.22570175e-01,  6.44028544e-01,  3.59059125e-01,  3.49316210e-01,
        3.70865494e-01, -2.23197460e-01,  1.43083604e-02,  1.52120396e-01,
        1.07828908e-01, -2.10189559e-02, -1.34678394e-01, -4.68377799e-01,
       -1.36552170e-01,  9.28948283e-01, -3.62675935e-02,  9.40234840e-01,
        7.50751793e-01

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


Error rate: 0.10152 for model Doc2Vec(dbow,d100,n5,mc2,t4) trained on none
--------------------


For the PV-DBOW model no performing any preproccesing produces the highest accuracy.

Train and evalute a PV-DM (paragraph vector distributed memory) model for each of the types of preprocessing.

In [19]:
for model in corpora:
    
    # TODO: Concatenated doc2vec model perform slightly better, however it is missing
    # tags property in it's implementation.
    # 2nd best model used
    models_by_corpora[model + "+dm"] = Doc2Vec(dm=1, **common_kwargs)
    
    
print("Training Doc2Vec models ...")

# Evaluate Doc2vec for each type of preproccesing
for corpus in corpora:
    
    model = models_by_corpora[corpus + "+dm"]
    docs = processed_texts[corpus]
    
    print("-"*20)
    print(f"Training model on: {corpus} documents ...")
    
    # Split into train / test sets
    train_docs = [doc for doc in docs if doc.split == 'train']
    test_docs = [doc for doc in docs if doc.split == 'test']

    model.build_vocab(docs)
    model.train(docs, total_examples=len(docs), epochs=model.epochs)

    err_rate, err_count, test_count, predictor = model_error_rate(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    
    print(f"Error rate: {err_rate} for model {str(model)} trained on {corpus}")

Training Doc2Vec models ...
--------------------
Training model on: lem+stop+con documents ...
Sample Data [array([ 4.8964587e-01, -5.8797348e-02, -2.6077956e-01,  4.5133162e-01,
       -1.5007243e-01,  3.4983036e-01, -1.4829528e-01,  1.1516626e+00,
       -1.4036475e-01, -4.4391423e-01,  2.6714194e-01,  2.4020796e-01,
        1.8720028e-01, -3.7481549e-01,  1.1214490e+00,  4.3689597e-01,
       -6.3363835e-02, -9.4921246e-02, -2.2934680e-01, -3.0742434e-01,
        1.7611554e-01, -1.1198944e-03, -1.1979154e-01, -1.4307345e-01,
        3.8967717e-01,  7.8051817e-01,  4.0083644e-01,  3.1800258e-01,
        7.9142708e-01, -1.3788615e-01, -6.3872203e-02, -1.8768595e-01,
       -3.5029519e-01, -3.0670562e-01, -4.6106091e-01,  5.6179827e-01,
        2.3946853e-01,  4.8041072e-01, -4.1785136e-01,  2.2026943e-01,
        9.6610361e-01, -4.8661163e-01, -2.3579098e-01, -7.3455429e-01,
        1.0253838e+00,  5.8691138e-01,  6.9736248e-01, -9.4024286e-02,
       -6.7753464e-01,  8.4717132e-02, -

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


Error rate: 0.14544 for model Doc2Vec(dm/m,d100,n5,w5,mc2,t4) trained on lem+stop+con
--------------------
Training model on: stop+con documents ...
Sample Data [array([ 0.27232426, -0.45932883,  0.5505049 , -0.1618307 , -0.65958476,
        0.8766121 , -0.2310929 ,  0.9546105 , -0.1475887 , -0.7298239 ,
        0.9234847 ,  0.24537049, -0.08019847, -1.2477479 ,  0.9693112 ,
        0.31118992, -0.48314756, -0.16000451, -0.56573766,  0.04264857,
        0.5580025 , -0.25024548, -0.9058656 , -0.21374385,  0.30061185,
        0.78815556, -0.35189217,  0.05987649,  0.9073889 ,  0.01899325,
        0.6163013 , -0.38901085,  0.17431754,  0.4312105 ,  0.22678795,
        0.16277197,  0.4662049 ,  0.20152847,  0.0473839 , -0.41274124,
        0.10997185, -0.4706153 ,  0.01948678,  0.7231294 ,  0.67650867,
       -0.12370142,  0.27910763, -0.29406056, -0.18828371,  0.6250889 ,
       -0.20583498, -0.3819209 , -0.13838315, -0.29362363, -0.450509  ,
       -0.81792045,  0.81960404, -0.11443598, 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


Error rate: 0.14572 for model Doc2Vec(dm/m,d100,n5,w5,mc2,t4) trained on stop+con
--------------------
Training model on: stop documents ...
Sample Data [array([ 3.1782696e-01,  1.4468937e-01,  5.5200112e-01,  1.9754238e-01,
       -4.5716739e-01, -2.0876018e-02,  5.3391922e-02,  4.2869037e-01,
        3.5506062e-02, -5.7764381e-01,  2.9031059e-01,  9.8971464e-02,
       -1.8032686e-01, -8.9375979e-01,  6.4667416e-01, -3.0933955e-01,
       -3.0709907e-01, -8.9960583e-02, -2.2077662e-01,  4.5606655e-01,
        1.0671389e+00,  7.4585646e-02, -4.4430754e-01, -1.5670855e-01,
        7.2458327e-01,  5.3552294e-01, -6.2834823e-01,  3.3240920e-01,
        3.6941335e-01,  6.1231934e-02,  7.6540363e-01, -4.9100927e-01,
        3.1980252e-01,  9.9475071e-02, -3.2293942e-02, -4.1922846e-01,
        6.7690104e-01,  5.3153348e-01,  4.3957984e-01, -1.9703147e-01,
        6.0100913e-01, -6.1769575e-01,  1.7554343e-01,  5.2654654e-01,
        3.0081433e-01,  2.2253822e-01,  1.8421561e-01,  4.0907881

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished


Error rate: 0.14364 for model Doc2Vec(dm/m,d100,n5,w5,mc2,t4) trained on stop
--------------------
Training model on: none documents ...
Sample Data [array([-0.4539063 ,  0.41374287, -0.45566967, -0.1684033 ,  0.25140274,
        1.457889  , -0.8647763 , -1.1876287 ,  1.2879443 , -0.7200864 ,
       -0.6822012 ,  0.48079208, -0.19480446, -0.5912165 , -0.13309531,
        0.29199585,  0.33319795, -1.7095803 ,  0.12073743, -0.13520083,
        0.9674745 ,  0.70850074,  0.63916624, -0.29646635,  0.4273372 ,
        0.0195256 , -0.6486902 ,  0.790284  ,  0.37703988,  0.5091603 ,
        0.29243577, -1.3375407 ,  0.3686426 ,  0.25683177, -0.04735296,
       -0.02597161,  0.6014649 ,  0.69863445, -0.3714222 ,  1.5299557 ,
       -1.2560995 , -0.66209966, -0.11763109,  0.9145844 ,  0.74525046,
        0.1489802 , -0.34050682,  0.8036674 ,  0.938801  ,  0.31990236,
       -0.08393154, -0.6542425 ,  0.29231068,  0.9085804 ,  0.2648597 ,
        0.83380234, -0.18929778,  0.559436  , -0.4640625 ,

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Error rate: 0.17256 for model Doc2Vec(dm/m,d100,n5,w5,mc2,t4) trained on none


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished


For the distrubted memory model the worst performance occurs when no preprocessing is applied (Error rate: 0.17256). The best performance is from removing stopwords (Error rate: 0.14364). Additional preprocessing such as, lem+stop+con and stop+con performs similary. (Error rate: 0.14544 and 0.14572)