In [4]:
import pickle
import statsmodels.api as sm
import numpy as np
def train_predictor(train_targets, train_regressors):
    """Fit a logistic-regression model and return the fitted result.

    Args:
        train_targets: Outcome values, passed to ``sm.Logit`` as the first
            (endogenous) argument.
        train_regressors: Design matrix, passed to ``sm.Logit`` as the second
            (exogenous) argument. It is used exactly as given; this function
            does not add an intercept column.

    Returns:
        The object returned by ``Logit.fit``.
    """
    # disp=0 keeps the optimizer's convergence report out of the cell output.
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

def error_rate(train_targets, train_regressors, test_targets, test_regressors):
    """Train a logit model and score its thresholded predictions on a test set.

    An intercept column is added to the training regressors, the model is
    fitted with ``train_predictor``, and the predictions for the test set are
    rounded with ``np.rint`` before being compared with the true labels.

    Args:
        train_targets: Training labels.
        train_regressors: Training regressors, without an intercept column.
        test_targets: True test labels; compared against 0 and 1.
        test_regressors: Test regressors, without an intercept column.

    Returns:
        A tuple ``(error_rate, false_pos, false_neg)`` where ``error_rate`` is
        the fraction of test rows counted as errors, ``false_pos`` is the
        number of rows predicted 1 whose label is 0, and ``false_neg`` is the
        number of rows predicted 0 whose label is 1.

    Raises:
        ValueError: If the number of test labels differs from the number of
            predictions.
    """
    train_design = sm.add_constant(train_regressors)
    predictor = train_predictor(train_targets, train_design)

    # add_constant's default has_constant='skip' returns its input unchanged
    # when a column already looks constant. A one-row or constant-valued test
    # set would then lack the intercept column the model was fitted with, so
    # force the column whenever the training design actually received one.
    raw_train = np.asarray(train_regressors)
    raw_columns = 1 if raw_train.ndim == 1 else raw_train.shape[1]
    intercept_added = np.shape(train_design)[1] > raw_columns
    test_design = sm.add_constant(
        test_regressors, has_constant='add' if intercept_added else 'skip')

    # Work on plain positional arrays so pandas inputs are not label-indexed.
    predicted = np.rint(np.asarray(predictor.predict(test_design))).ravel()
    actual = np.asarray(test_targets).ravel()
    if len(actual) != len(predicted):
        raise ValueError(
            "test_targets has %d entries but %d predictions were produced"
            % (len(actual), len(predicted)))

    false_pos = int(np.count_nonzero((predicted == 1) & (actual == 0)))
    false_neg = int(np.count_nonzero((predicted == 0) & (actual == 1)))
    misclassified_fraction = float(false_pos + false_neg) / len(predicted)
    return (misclassified_fraction, false_pos, false_neg)


In [34]:
import pickle
import csv

def baseline(filename, target_column='answer_good', regressor_column='AnswerCount'):
    """Score a single-regressor logit baseline on a pickled train/test split.

    Args:
        filename: Path to a pickle file that unpickles to a ``(train, test)``
            pair. Each element must support ``obj[column].values``.
        target_column: Name of the label column. Defaults to ``'answer_good'``.
        regressor_column: Name of the single regressor column. Defaults to
            ``'AnswerCount'``.

    Returns:
        The ``(error_rate, false_pos, false_neg)`` tuple returned by
        ``error_rate`` for this split.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only call this on pickles produced by a trusted source.
    with open(filename, 'rb') as pfile:
        train, test = pickle.load(pfile)
    return error_rate(
        train[target_column].values,
        train[regressor_column].values,
        test[target_column].values,
        test[regressor_column].values,
    )

# Pickled (train, test) splits to evaluate; each one yields one CSV row.
filenames = [
    'combined_train_test.p',
    'r_train_so_test.p',
    'so_train_r_test.p',
    'so_alone.p',
    'reddit_alone.p',
]

# newline="" leaves line-ending handling to the csv module.
with open('baseline_results.csv', 'w+', newline="") as csvfile:
    fieldnames = ['Test Name', 'Success Rate', 'false +', 'false -']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for name in filenames:
        errors, false_pos, false_neg = baseline(name)
        # baseline() reports an error fraction; the CSV stores its complement.
        success_rate = 1 - errors
        # Pair the values with the header names in the order declared above.
        writer.writerow(dict(zip(fieldnames,
                                 (name, success_rate, false_pos, false_neg))))
        

In [33]:
# Ad-hoc inspection: load the combined split and print the 'index' column of
# its training part.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open('combined_train_test.p', 'rb') as pfile:
    train, test = pickle.load(pfile)
print(train['index'])


3060      3060
14920    14920
5906      5906
416        416
9063      9063
37719    37719
10127    10127
2709      2709
6285      6285
19720    19720
16030    16030
7619      7619
10225    10225
3481      3481
22764    22764
40947    40947
62244    62244
54079    54079
7858      7858
8664      8664
11481    11481
10578    10578
38016    38016
5549      5549
16164    16164
404        404
39709    39709
4532      4532
8677      8677
1954      1954
         ...  
25422    25422
37879    37879
46972    46972
47155    47155
50259    50259
43531    43531
6117      6117
36256    36256
9614      9614
24227    24227
64263    64263
2638      2638
12727    12727
64386    64386
3976      3976
9862      9862
45161    45161
3404      3404
12075    12075
36446    36446
4460      4460
2638      2638
1748      1748
7667      7667
40570    40570
47370    47370
2144      2144
13158    13158
209        209
5442      5442
Name: index, Length: 37582, dtype: int64


In [29]:
# Ad-hoc inspection: print the first element of `train.values`.
# Relies on `train` already being bound by a previously executed cell.
print(train.values[0])

[1 4.0
 '<p>what specific behaviour confirmed the existence of the w and z bosons at the ua1 and ua2 experiments?</p>\n\n<p>thanks!</p>\n'
 'physics' 4 'detection of w and z bosons' 879.0 0.004550625711035267 0 1
 9840.0 0.0 1.0]
