In [1]:
import pickle
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



In [2]:
train = pd.read_pickle('train_bow_labels.pkl')
test = pd.read_pickle('test_bow_labels.pkl')

train.head()

Unnamed: 0,id,title,text_clean,pov
0,619941,Loch Katrine,infobox lake name loch katrin imag loch katrin...,False
1,3884222,Bhadayasa,infobox royalti imag filebhadrayasha coinjpg a...,False
2,4229879,Lee Jones (author),lee jone onlin poker execut author win low lim...,False
3,5320685,School District 54 Bulkley Valley,infobox school district name school district b...,False
4,9146365,Combing,interlac disambiguationinterlaceth comb hairco...,False


In [3]:
#create bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
train_data_features = vectorizer.fit_transform(train['text_clean']).toarray()
feature_matrix = pd.DataFrame(train_data_features, index = train['title'], columns=vectorizer.get_feature_names())

test_data_features = vectorizer.transform(test['text_clean']).toarray()
test_feature_matrix = pd.DataFrame(test_data_features, index = test['title'], columns=vectorizer.get_feature_names())

In [18]:
#create ensemble data frame

results = pd.DataFrame(0, index=['logistic', 'forest', 'linear_svc', 'xgboost', 'naive_bayes'], 
                       columns= ['Accuracy', 'Sensitivity', 'Specificity', 'ROC-AUC', 'Avg Logloss'])
results

Unnamed: 0,Accuracy,Sensitivity,Specificity,ROC-AUC,Avg Logloss
logistic,0,0,0,0,0
forest,0,0,0,0,0
linear_svc,0,0,0,0,0
xgboost,0,0,0,0,0
naive_bayes,0,0,0,0,0


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix

logit = LogisticRegression(C=.4) 

logit = logit.fit(feature_matrix, train['pov'])
results.loc['logistic', 'Accuracy']=logit.score(test_feature_matrix, test['pov'])

test_pred = map(lambda x: x[1], list(logit.predict_proba(test_feature_matrix)))
results.loc['logistic', 'Avg Logloss']=log_loss(test['pov'], test_pred)
results.loc['logistic', 'ROC-AUC']=roc_auc_score(test['pov'], test_pred)

tn, fp, fn, tp = confusion_matrix(test['pov'], np.round(test_pred)).ravel()
results.loc['logistic', 'Sensitivity']= float(tn)/float(tn+fp)
results.loc['logistic', 'Specificity']= float(tp)/float(tp+fn)
results

Unnamed: 0,Accuracy,Sensitivity,Specificity,ROC-AUC,Avg Logloss
logistic,0.873257,0.860165,0.888583,0.899859,0.784861
forest,0.0,0.0,0.0,0.0,0.0
linear_svc,0.0,0.0,0.0,0.0,0.0
xgboost,0.0,0.0,0.0,0.0,0.0
naive_bayes,0.0,0.0,0.0,0.0,0.0


In [33]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 100, criterion='entropy') 
forest.fit(feature_matrix, train['pov'])

results.loc['forest', 'Accuracy']=forest.score(test_feature_matrix, test['pov'])

test_pred = map(lambda x: x[1], list(forest.predict_proba(test_feature_matrix)))
results.loc['forest', 'Avg Logloss']=log_loss(test['pov'], test_pred)
results.loc['forest', 'ROC-AUC']=roc_auc_score(test['pov'], test_pred)

tn, fp, fn, tp = confusion_matrix(test['pov'], np.round(test_pred)).ravel()
results.loc['forest', 'Sensitivity']= float(tn)/float(tn+fp)
results.loc['forest', 'Specificity']= float(tp)/float(tp+fn)
results

Unnamed: 0,Accuracy,Sensitivity,Specificity,ROC-AUC,Avg Logloss
logistic,0.873257,0.860165,0.888583,0.899859,0.784861
forest,0.878961,0.86134,0.899587,0.957395,0.310083
linear_svc,0.0,0.0,0.0,0.0,0.0
xgboost,0.0,0.0,0.0,0.0,0.0
naive_bayes,0.0,0.0,0.0,0.0,0.0


In [56]:
from sklearn.svm import SVC
svc = SVC(C=100, kernel='linear', probability=True) 
svc = svc.fit(feature_matrix, train['pov'])

results.loc['linear_svc', 'Accuracy']=svc.score(test_feature_matrix, test['pov'])

results.loc['linear_svc', 'Avg Logloss']= np.nan
results.loc['linear_svc', 'ROC-AUC']= np.nan

tn, fp, fn, tp = confusion_matrix(test['pov'], svc.predict(test_feature_matrix)).ravel()
print tn, fp, fn, tp
results.loc['linear_svc', 'Sensitivity']= float(tn)/float(tn+fp)
results.loc['linear_svc', 'Specificity']= float(tp)/float(tp+fn)
results

706 145 75 652


Unnamed: 0,Accuracy,Sensitivity,Specificity,ROC-AUC,Avg Logloss
logistic,0.873257,0.860165,0.888583,0.899859,0.784861
forest,0.878961,0.86134,0.899587,0.957395,0.310083
linear_svc,0.860583,0.829612,0.896836,0.874519,0.572519
xgboost,0.886565,0.86839,0.90784,0.943337,0.340819
naive_bayes,0.756654,0.743831,0.771664,0.807589,5.148624


In [59]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB().fit(train_data_features, train['pov'])

results.loc['naive_bayes', 'Accuracy']=mnb.score(test_feature_matrix, test['pov'])

test_pred = map(lambda x: x[1], list(mnb.predict_proba(test_feature_matrix)))
results.loc['naive_bayes', 'Avg Logloss']=log_loss(test['pov'], test_pred)
results.loc['naive_bayes', 'ROC-AUC']=roc_auc_score(test['pov'], test_pred)

tn, fp, fn, tp = confusion_matrix(test['pov'], np.round(test_pred)).ravel()
results.loc['naive_bayes', 'Sensitivity']= float(tn)/float(tn+fp)
results.loc['naive_bayes', 'Specificity']= float(tp)/float(tp+fn)
results

Unnamed: 0,Accuracy,Sensitivity,Specificity,ROC-AUC,Avg Logloss
logistic,0.873257,0.860165,0.888583,0.899859,0.784861
forest,0.878961,0.86134,0.899587,0.957395,0.310083
linear_svc,0.860583,0.829612,0.896836,,
xgboost,0.886565,0.86839,0.90784,0.943337,0.340819
naive_bayes,0.756654,0.743831,0.771664,0.807589,5.148624


In [48]:
import xgboost as xgb
dtrain = xgb.DMatrix(feature_matrix, train['pov'])
d_val = xgb.DMatrix(test_feature_matrix, test['pov'])

# hyperparameters
xgb_params = {
    'eta': 0.35,
    'max_depth': 4,
    'subsample': .8,
    'colsample_bytree': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'silent': 1
}

sub_model = xgb.train(xgb_params, dtrain, num_boost_round=2000, evals=[[dtrain, 'train'], (d_val, 'val')],
                      early_stopping_rounds=50, verbose_eval=20)

[0]	train-error:0.250343	val-error:0.307351
Multiple eval metrics have been passed: 'val-error' will be used for early stopping.

Will train until val-error hasn't improved in 50 rounds.
[20]	train-error:0.147328	val-error:0.211027
[40]	train-error:0.095706	val-error:0.186312
[60]	train-error:0.067839	val-error:0.159696
[80]	train-error:0.049338	val-error:0.149556
[100]	train-error:0.034491	val-error:0.143853
[120]	train-error:0.023984	val-error:0.13308
[140]	train-error:0.01873	val-error:0.132446
[160]	train-error:0.014619	val-error:0.126109
[180]	train-error:0.011421	val-error:0.119772
[200]	train-error:0.009593	val-error:0.119138
[220]	train-error:0.007081	val-error:0.116603
[240]	train-error:0.006167	val-error:0.115336
[260]	train-error:0.005482	val-error:0.114702
[280]	train-error:0.003426	val-error:0.112167
[300]	train-error:0.002969	val-error:0.112801
[320]	train-error:0.002056	val-error:0.1109
[340]	train-error:0.000685	val-error:0.108999
[360]	train-error:0.000685	val-error:0.

In [50]:
test_pred = np.round(sub_model.predict(d_val))
tn, fp, fn, tp = confusion_matrix(test['pov'], np.round(test_pred)).ravel()

results.loc['xgboost', 'Avg Logloss']=log_loss(test['pov'], sub_model.predict(d_val))
results.loc['xgboost', 'ROC-AUC']=roc_auc_score(test['pov'], sub_model.predict(d_val))
results.loc['xgboost', 'Accuracy']= float(tn+tp)/float(tn+tp+fp+fn)
results.loc['xgboost', 'Sensitivity']= float(tn)/float(tn+fp)
results.loc['xgboost', 'Specificity']= float(tp)/float(tp+fn)
results

Unnamed: 0,Accuracy,Sensitivity,Specificity,ROC-AUC,Avg Logloss
logistic,0.873257,0.860165,0.888583,0.899859,0.784861
forest,0.878961,0.86134,0.899587,0.957395,0.310083
linear_svc,0.860583,0.948296,0.327373,0.874534,0.5672
xgboost,0.886565,0.86839,0.90784,0.943337,0.340819
naive_bayes,0.756654,0.743831,0.771664,0.807589,5.148624
