In [1]:
from gensim.models.doc2vec import Doc2Vec



In [12]:
# Format data for input into Doc2Vec
import nltk
import csv
from collections import namedtuple
from numpy.random import choice

# Record for one CSV row: token list, a one-element tag list holding the row
# index, the split the row was assigned to, and its net score (ups - downs).
ScoreDocument = namedtuple('ScoreDocument', 'words tags split score')

# Split labels and the probability with which each label is drawn per document.
elements = ['train', 'test', 'validate']
weights = [0.7, 0.2, 0.1]

# NOTE(review): the split draw below is not seeded in this cell, so split
# membership presumably differs between runs -- confirm whether a seed is set
# elsewhere before comparing results across runs.
titles = []
# newline='' is what the csv module asks for on files handed to a reader, so
# that line breaks embedded inside quoted fields are parsed correctly.
with open('all_comments.csv', 'r', encoding='utf-8', newline='') as infile:
    reader = csv.DictReader(infile)
    counter = 0  # running row index; doubles as the document tag
    for row in reader:
        # Title and body are lower-cased and tokenized together as one document.
        title = row['title'].lower()
        selftext = row['selftext'].lower()
        words = nltk.word_tokenize(title + " " + selftext)
        score = int(row['ups']) - int(row['downs'])
        titles.append(ScoreDocument(words, tags=[counter], split=choice(elements, p=weights), score=score))
        counter += 1

In [13]:
# Partition the loaded documents by the split label each one carries.
train_docs, test_docs, val_docs = (
    [doc for doc in titles if doc.split == split_name]
    for split_name in ('train', 'test', 'validate')
)

# Report the size of each partition: train, then test, then validate.
for subset in (train_docs, test_docs, val_docs):
    print(len(subset))
# A document's vector is looked up in a trained model by its tag,
# i.e. <model>.docvecs[doc.tags[0]]

596868
169904
85672


In [14]:
import multiprocessing
# Use one worker per CPU reported by the OS.
cores = multiprocessing.cpu_count()

# Hyperparameters passed to Doc2Vec, collected in one place for readability.
doc2vec_params = dict(size=100, window=8, min_count=5, workers=cores)

# The model is built from every document in `titles` (all splits), then
# persisted to disk.
model = Doc2Vec(titles, **doc2vec_params)
model.save('queries.doc2vec')



In [41]:
import numpy as np
import statsmodels.api as sm
from random import sample

# Score cutoff for the binary label: a document counts as positive (1) when
# doc.score > threshold and negative (0) otherwise. Raise or lower it to make
# fewer/more documents pass. Author's note: 3 sits at roughly 50% of test data.
threshold = 3

def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodels Logit on the given targets/regressors and return the fit result.

    The fit is run with disp=0. Call .summary() on the returned object to
    inspect the fit.
    """
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Fit a logistic classifier on train_set and report its errors on test_set.

    Each document is labeled 1 when doc.score > threshold (module-level
    constant) and 0 otherwise; its features are the vector stored in
    test_model.docvecs under doc.tags[0], plus a constant term.

    Parameters:
        test_model: object whose `docvecs` can be indexed by a document tag.
        train_set: documents (with `.score` and `.tags`) used to fit the classifier.
        test_set: documents (with `.score` and `.tags`) used for evaluation.
        infer, infer_steps, infer_alpha, infer_subsample: accepted for
            interface compatibility; not used by this function.

    Returns:
        (error_rate, errors, test_count, false_pos, false_neg), where
        error_rate is errors / test_count, false_pos counts documents
        predicted 1 but labeled 0, and false_neg counts documents predicted 0
        but labeled 1.
    """
    def _label(doc):
        # Binary target derived from the score cutoff.
        return 1 if doc.score > threshold else 0

    # Single pass over the training documents: targets and vectors stay aligned.
    train_targets, train_vectors = [], []
    for doc in train_set:
        train_targets.append(_label(doc))
        train_vectors.append(test_model.docvecs[doc.tags[0]])
    train_regressors = sm.add_constant(train_vectors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    # Build test features from the test_set argument (not a global), so the
    # predictions line up one-to-one with the labels they are compared to.
    test_targets, test_vectors = [], []
    for doc in test_set:
        test_targets.append(_label(doc))
        test_vectors.append(test_model.docvecs[doc.tags[0]])
    test_targets = np.array(test_targets)
    test_regressors = sm.add_constant(test_vectors)

    # Predict & evaluate: round predicted probabilities to the nearest class.
    test_predictions = predictor.predict(test_regressors)
    rounded_predictions = np.rint(test_predictions)
    wrong = rounded_predictions != test_targets

    # A wrong prediction of 1 is a false positive; a wrong prediction of 0 is
    # a false negative.
    false_pos = int(np.sum(wrong & (rounded_predictions == 1)))
    false_neg = int(np.sum(wrong & (rounded_predictions == 0)))
    errors = int(np.sum(wrong))
    test_count = len(test_predictions)
    error_rate = float(errors) / test_count
    return (error_rate, errors, test_count, false_pos, false_neg)

# Fit on the training split, evaluate on the test split, and report the result.
err, err_count, test_count, false_pos, false_neg = error_rate_for_model(model, train_docs, test_docs)
print("Error rate: " + str(err))
print("False positives: " + str(false_pos))
print("False negatives: " + str(false_neg))
# err_count was unpacked but never reported; show the total number of
# misclassified test documents alongside the breakdown above.
print("Total num wrong: " + str(err_count))

Error rate: 0.3679313023825219
False positives: 27689
False negatives: 34824
Total num wrong: 62513


625