In [1]:
from sklearn import linear_model
import numpy as np
from collections import namedtuple
tokenized_row = namedtuple('tokenized_row', 'sent_count sentences word_count words')
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import csv

def test_batch(test_regressors, test_targets, sgd, successes, false_pos, false_neg):
    test_predictions = sgd.predict(test_regressors)
    rounded_predictions = np.rint(test_predictions)
    for i in range(len(rounded_predictions)):
        if rounded_predictions[i] == 1 and test_targets[i] == 0: false_pos += 1
        if rounded_predictions[i] == 0 and test_targets[i] == 1: false_neg += 1
        if rounded_predictions[i] == test_targets[i]: successes += 1
    return successes, false_pos, false_neg

filenames = ['combined_train_test.p', 'r_train_so_test.p', 'so_train_r_test.p',
            'so_alone.p', 'reddit_alone.p']


In [None]:
def words_only(filename):
    sgd = linear_model.SGDClassifier()

    with open(filename, 'rb') as pfile:
        train, test = pickle.load(pfile)
    directory_name = filename.split('.p')[0]
    with open(directory_name + "/tokenized_dict.p", 'rb') as pfile:
        train_token_dict, test_token_dict = pickle.load(pfile)
    with open(directory_name + "/body_vectorizer.p", 'rb') as pfile:
        body_vectorizer = pickle.load(pfile) 
    with open(directory_name + "/title_vectorizer.p", 'rb') as pfile:
        title_vectorizer = pickle.load(pfile)   
    empty_response = title_vectorizer.transform([])
    title_length = empty_response.shape[1]
    empty_response = body_vectorizer.transform([])
    body_length = empty_response.shape[1]
    train_length = len(train.index.values)

    max_batch_size = 1000
    counter = 0

    train_regressors = np.empty([max_batch_size, body_length + title_length])
    train_targets = np.empty([max_batch_size, 1])

    for i in range(train_length):
        index = train.index.values[i]
        row = train_token_dict[index]
        title_words = row[0].words
        body_words = row[1].words
        if len(body_words) is 0: body_words = [""]
        title_vectorization = title_vectorizer.transform(title_words).toarray()[0]
        body_vectorization = body_vectorizer.transform(body_words).toarray()[0]
        train_regressors[counter] = np.concatenate((title_vectorization, body_vectorization))
        train_targets[counter] = train['answer_good'].values[i]
        counter += 1
        if counter == max_batch_size:
            sgd.partial_fit(train_regressors, train_targets, classes=np.array([0, 1]))
            if train_length - i < max_batch_size:
                batch_size = train_length % max_batch_size
            else:
                batch_size = max_batch_size
            train_regressors = np.empty([batch_size, body_length + title_length])
            train_targets = np.empty([batch_size, 1])
            counter = 0
    
    counter = 0
    successes, false_pos, false_neg = 0, 0, 0
    test_length = len(test.index.values)
    test_regressors = np.empty([test_length, body_length + title_length])
    train_regressors = np.empty([max_batch_size, body_length + title_length])
    train_targets = np.empty([max_batch_size, 1])
    for i in range(test_length):
        index = train.index.values[i]
        row = train_token_dict[index]
        title_words = row[0].words
        body_words = row[1].words
        if len(body_words) is 0: body_words = [""]
        title_vectorization = title_vectorizer.transform(title_words).toarray()[0]
        body_vectorization = body_vectorizer.transform(body_words).toarray()[0]
        test_regressors[counter] = np.concatenate((title_vectorization, body_vectorization))
        test_targets[counter] = test['answer_good'].values[i]
        counter += 1
        if counter == max_batch_size:
            successes, false_pos, false_neg = test_batch(test_regressors, test_targets, sgd, 
                                                        successes, false_pos, false_neg)
            if test_length - i < max_batch_size:
                batch_size = test_length % max_batch_size
            else:
                batch_size = max_batch_size
            test_regressors = np.empty([batch_size, body_length + title_length])
            test_targets = np.empty([batch_size, 1])
            counter = 0
    return successes, false_pos, false_neg

with open('words_only_results.csv', 'w+', newline="") as csvfile:
    fieldnames = ['Test Name', 'Success Rate', 'false +', 'false -']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for name in filenames:
        successes, false_pos, false_neg = words_only(name)
        success_rate = float(successes) / (successes + false_pos + false_neg)
        writer.writerow({'Test Name': name, 'Success Rate': success_rate, 
                         'false +': false_pos, 'false -': false_neg})

  y = column_or_1d(y, warn=True)


In [11]:
filename = filenames[0]
sgd = linear_model.SGDClassifier()

with open(filename, 'rb') as pfile:
    train, test = pickle.load(pfile)
directory_name = filename.split('.p')[0]
with open(directory_name + "/tokenized_dict.p", 'rb') as pfile:
    train_token_dict, test_token_dict = pickle.load(pfile)
with open(directory_name + "/body_vectorizer.p", 'rb') as pfile:
    body_vectorizer = pickle.load(pfile) 
with open(directory_name + "/title_vectorizer.p", 'rb') as pfile:
    title_vectorizer = pickle.load(pfile)   
empty_response = title_vectorizer.transform([])
title_length = empty_response.shape[1]
empty_response = body_vectorizer.transform([])
body_length = empty_response.shape[1]
train_length = len(train.index.values)

max_batch_size = 1000
counter = 0

train_regressors = np.empty([max_batch_size, body_length + title_length])
train_targets = np.empty([max_batch_size, 1])

for i in range(train_length):
    index = train.index.values[i]
    row = train_token_dict[index]
    title_words = row[0].words
    body_words = row[1].words
    if len(body_words) is 0: body_words = [""]
    title_vectorization = title_vectorizer.transform(title_words).toarray()[0]
    body_vectorization = body_vectorizer.transform(body_words).toarray()[0]
    train_regressors[counter] = np.concatenate((title_vectorization, body_vectorization))
    train_targets[counter] = train['answer_good'].values[i]
    counter += 1
    if counter == 1000:
        sgd.partial_fit(train_regressors, train_targets)
        if train_length - i < 1000:
            batch_size = train_length % 1000
        else:
            batch_size = 1000
        train_regressors = np.empty([batch_size, body_length + title_length])
        train_targets = np.empty([batch_size, 1])
        counter = 0
        print(counter)

NameError: name 'sdg' is not defined

In [6]:
body_vectorizer.transform([""]).toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [7]:
body_words

[]