In [3]:
import pickle
import statsmodels.api as sm
import numpy as np
def train_predictor(train_targets, train_regressors):
    """Fit a logistic regression and return the fitted result.

    Args:
        train_targets: Response values, handed to ``sm.Logit`` as its first
            argument.
        train_regressors: Design matrix, handed to ``sm.Logit`` as its second
            argument. Nothing is added to it here, so callers that want an
            intercept must include the constant column themselves.

    Returns:
        Whatever ``sm.Logit(...).fit(disp=0)`` returns.
    """
    # disp=0 asks statsmodels not to print the optimizer's convergence report.
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

def error_rate(train_targets, train_regressors, test_targets, test_regressors):
    """Train a logit model and measure how often it misclassifies a test set.

    An intercept column is added to both regressor sets, a model is fitted on
    the training data through ``train_predictor``, and the predicted
    probabilities for the test data are rounded to 0/1 labels with ``np.rint``.

    Args:
        train_targets: Training labels.
        train_regressors: Training features (without a constant column).
        test_targets: Test labels; compared positionally against the
            predictions.
        test_regressors: Test features (without a constant column).

    Returns:
        tuple: ``(error_rate, false_pos, false_neg)`` where ``error_rate`` is
        the misclassified fraction of the test set (float) and the other two
        are integer counts of predicted-1/actual-0 and predicted-0/actual-1.

    Raises:
        ValueError: If the number of predictions and test targets differ.
    """
    train_design = sm.add_constant(train_regressors)
    predictor = train_predictor(train_targets, train_design)

    # add_constant defaults to has_constant='skip', which returns the data
    # unchanged when it already contains a constant column. A test set whose
    # column happens to be constant (e.g. a single row) would then be missing
    # the intercept and no longer line up with the fitted parameters, so the
    # intercept is always added here.
    test_design = sm.add_constant(test_regressors, has_constant='add')
    test_predictions = predictor.predict(test_design)

    # Work on flat numpy arrays so the comparison is positional even when the
    # targets arrive as a pandas Series with a non-default index.
    predicted = np.rint(np.asarray(test_predictions)).ravel()
    actual = np.asarray(test_targets).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "got %d predictions for %d test targets" % (predicted.size, actual.size))

    false_pos = int(np.count_nonzero((predicted == 1) & (actual == 0)))
    false_neg = int(np.count_nonzero((predicted == 0) & (actual == 1)))
    misclassified_fraction = float(false_pos + false_neg) / len(predicted)
    return (misclassified_fraction, false_pos, false_neg)

# Pickled datasets that the experiment cells iterate over.
filenames = [
    'combined_train_test.p',
    'r_train_so_test.p',
    'so_train_r_test.p',
    'so_alone.p',
    'reddit_alone.p',
]
from collections import namedtuple
# Record type exposing the attributes (word_count, sent_count, words) that the
# feature functions read from tokenized rows.
# NOTE(review): presumably this must exist under this exact name so pickle can
# rebuild the rows stored in tokenized_dict.p -- confirm.
tokenized_row = namedtuple(
    'tokenized_row',
    ['sent_count', 'sentences', 'word_count', 'words'],
)
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import csv


In [34]:
import pickle
import csv

def baseline(filename):
    """Score the baseline model that uses ``AnswerCount`` as the only feature.

    Args:
        filename: Path to a pickle holding a ``(train, test)`` pair; each must
            support ``['answer_good']`` and ``['AnswerCount']`` lookups whose
            results expose ``.values``.

    Returns:
        The ``(error_rate, false_pos, false_neg)`` tuple from ``error_rate``.
    """
    with open(filename, 'rb') as handle:
        splits = pickle.load(handle)
    train, test = splits

    target_col = 'answer_good'
    feature_col = 'AnswerCount'
    return error_rate(
        train[target_col].values,
        train[feature_col].values,
        test[target_col].values,
        test[feature_col].values,
    )

# Pickled datasets scored by the baseline model.
filenames = [
    'combined_train_test.p',
    'r_train_so_test.p',
    'so_train_r_test.p',
    'so_alone.p',
    'reddit_alone.p',
]

# Write one CSV row per dataset; rows are written as each dataset finishes.
with open('baseline_results.csv', 'w+', newline="") as csvfile:
    fieldnames = ['Test Name', 'Success Rate', 'false +', 'false -']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for name in filenames:
        errors, false_pos, false_neg = baseline(name)
        # baseline() reports the error fraction; the CSV stores its complement.
        success_rate = 1 - errors
        writer.writerow(dict(zip(fieldnames, (name, success_rate, false_pos, false_neg))))
        

In [42]:
# Pickled datasets that the experiment cells iterate over.
filenames = [
    'combined_train_test.p',
    'r_train_so_test.p',
    'so_train_r_test.p',
    'so_alone.p',
    'reddit_alone.p',
]
from collections import namedtuple
# Record type exposing the attributes (word_count, sent_count, words) that the
# feature functions read from tokenized rows.
# NOTE(review): presumably this must exist under this exact name so pickle can
# rebuild the rows stored in tokenized_dict.p -- confirm.
tokenized_row = namedtuple(
    'tokenized_row',
    ['sent_count', 'sentences', 'word_count', 'words'],
)


def length_only(filename):
    """Score a logit model whose only features are text-length counts.

    For every row id in the train/test index, the matching entry of the
    tokenized dictionary supplies four features:
    ``(row[0].word_count, row[0].sent_count, row[1].word_count,
    row[1].sent_count)``. ``row[0]`` and ``row[1]`` appear to be the title and
    body respectively, judging by how ``words_only`` names them.

    Args:
        filename: Path to a pickle holding a ``(train, test)`` pair. The
            tokenized data is read from ``<filename minus trailing '.p'>/
            tokenized_dict.p``.

    Returns:
        The ``(error_rate, false_pos, false_neg)`` tuple from ``error_rate``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at files you trust.
    with open(filename, 'rb') as pfile:
        train, test = pickle.load(pfile)

    # Strip only a trailing '.p'. Splitting on the first '.p' anywhere would
    # truncate paths such as 'my.project/so_alone.p' to 'my'.
    directory_name = filename[:-len('.p')] if filename.endswith('.p') else filename
    with open(directory_name + "/tokenized_dict.p", 'rb') as pfile:
        train_token_dict, test_token_dict = pickle.load(pfile)

    def length_features(frame, token_dict):
        """Build one 4-tuple of word/sentence counts per row id in frame."""
        features = []
        for i in frame.index.values:
            row = token_dict[i]
            features.append((row[0].word_count, row[0].sent_count,
                             row[1].word_count, row[1].sent_count))
        return features

    train_regressors = length_features(train, train_token_dict)
    test_regressors = length_features(test, test_token_dict)
    train_targets = train['answer_good'].values
    test_targets = test['answer_good'].values
    return error_rate(train_targets, train_regressors, test_targets, test_regressors)

# Score every dataset with the length-only model, one CSV row per dataset;
# rows are written as each dataset finishes.
with open('length_only_results.csv', 'w+', newline="") as csvfile:
    fieldnames = ['Test Name', 'Success Rate', 'false +', 'false -']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for name in filenames:
        errors, false_pos, false_neg = length_only(name)
        # length_only() reports the error fraction; the CSV stores its complement.
        success_rate = 1 - errors
        writer.writerow(dict(zip(fieldnames, (name, success_rate, false_pos, false_neg))))

In [None]:
def words_only(filename):
    """Score a logit model whose features are title/body word counts.

    For every row id in the train/test index, the words of ``row[0]`` (title)
    and ``row[1]`` (body) from the tokenized dictionary are run through the
    pickled title/body vectorizers. Each result is collapsed into a single
    count vector, and the title and body vectors are concatenated into one
    fixed-length feature row.

    Args:
        filename: Path to a pickle holding a ``(train, test)`` pair. The
            tokenized data and both vectorizers are read from the directory
            ``<filename minus trailing '.p'>/``.

    Returns:
        The ``(error_rate, false_pos, false_neg)`` tuple from ``error_rate``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at files you trust.
    with open(filename, 'rb') as pfile:
        train, test = pickle.load(pfile)

    # Strip only a trailing '.p'. Splitting on the first '.p' anywhere would
    # truncate paths such as 'my.project/so_alone.p' to 'my'.
    directory_name = filename[:-len('.p')] if filename.endswith('.p') else filename
    with open(directory_name + "/tokenized_dict.p", 'rb') as pfile:
        train_token_dict, test_token_dict = pickle.load(pfile)
    with open(directory_name + "/body_vectorizer.p", 'rb') as pfile:
        body_vectorizer = pickle.load(pfile)
    with open(directory_name + "/title_vectorizer.p", 'rb') as pfile:
        title_vectorizer = pickle.load(pfile)

    def bag_of_words(vectorizer, words):
        """Collapse the per-word vectorization into one 1-D count vector."""
        # Assuming these are sklearn CountVectorizers (TODO confirm),
        # transform() returns one row per element of `words`. Summing over
        # rows gives a single vocabulary-length vector, so every sample has
        # the same width regardless of how many words it contains.
        return np.asarray(vectorizer.transform(words).sum(axis=0)).ravel()

    def word_features(frame, token_dict):
        """Build one concatenated title+body count vector per row id."""
        features = []
        for i in frame.index.values:
            row = token_dict[i]
            title_counts = bag_of_words(title_vectorizer, row[0].words)
            body_counts = bag_of_words(body_vectorizer, row[1].words)
            features.append(np.concatenate((title_counts, body_counts)))
        return features

    train_regressors = word_features(train, train_token_dict)
    test_regressors = word_features(test, test_token_dict)
    train_targets = train['answer_good'].values
    test_targets = test['answer_good'].values
    return error_rate(train_targets, train_regressors, test_targets, test_regressors)

# Score every dataset with the words-only model, one CSV row per dataset;
# rows are written as each dataset finishes.
with open('words_only_results.csv', 'w+', newline="") as csvfile:
    fieldnames = ['Test Name', 'Success Rate', 'false +', 'false -']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for name in filenames:
        errors, false_pos, false_neg = words_only(name)
        # words_only() reports the error fraction; the CSV stores its complement.
        success_rate = 1 - errors
        writer.writerow(dict(zip(fieldnames, (name, success_rate, false_pos, false_neg))))