# Logistic Regression

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from matplotlib.backends.backend_pdf import PdfPages

In [5]:
# Load the cleaned dataset into a DataFrame.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file location exists.
alldata = pd.read_csv('/home/wangmk/UM/Research/SURE/cleaned_data/full_cleaned_data.csv')

In [6]:
# Agree is True when Alert matches True_state (both True or both False);
# for boolean columns this is the same as NOT(True_state XOR Alert).
alldata['Agree'] = alldata['True_state'] == alldata['Alert']

In [7]:
# Preview the first rows, including the derived 'Agree' column.
alldata.head()

Unnamed: 0,Trial,True_state,Alert,Identification,First_toggle_time,RMS,Attention,Click,Top_Left_Enemy,Top_Right_Enemy,Bottom_Left_Enemy,Bottom_Right_Enemy,Top_Left_Dark,Top_Right_Dark,Bottom_Left_Dark,Bottom_Right_Dark,Agree
0,0,True,True,True,4179,65.244487,0.914573,2,True,False,False,False,False,True,True,True,True
1,1,False,False,False,3653,67.293907,0.895,2,False,False,False,False,True,True,True,True,True
2,2,False,True,False,2186,100.119892,0.854271,2,False,False,False,False,True,False,True,False,False
3,3,False,False,False,3412,33.403009,0.889447,2,False,False,False,False,True,False,True,True,True
4,4,False,False,False,4514,70.063858,0.929648,2,False,False,False,False,True,True,True,True,True


In [8]:
# Show column dtypes and non-null counts for the loaded frame.
alldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 17 columns):
Trial                 4000 non-null int64
True_state            4000 non-null bool
Alert                 4000 non-null bool
Identification        4000 non-null bool
First_toggle_time     4000 non-null int64
RMS                   4000 non-null float64
Attention             4000 non-null float64
Click                 4000 non-null int64
Top_Left_Enemy        4000 non-null bool
Top_Right_Enemy       4000 non-null bool
Bottom_Left_Enemy     4000 non-null bool
Bottom_Right_Enemy    4000 non-null bool
Top_Left_Dark         4000 non-null bool
Top_Right_Dark        4000 non-null bool
Bottom_Left_Dark      4000 non-null bool
Bottom_Right_Dark     4000 non-null bool
Agree                 4000 non-null bool
dtypes: bool(12), float64(2), int64(3)
memory usage: 203.2 KB


In [9]:
# Partition the rows at RMS = 60: strictly above the cut-off is "abnormal",
# at or below it is "normal".
abnormal = alldata.loc[alldata['RMS'].gt(60)]
normal = alldata.loc[alldata['RMS'].le(60)]

In [10]:
# (rows, columns) of the abnormal subset, i.e. the rows with RMS > 60.
abnormal.shape

(160, 17)

Next, we try logistic regression, the first and simplest machine learning model for binary outcomes.

In [58]:
# Accumulators for the test-set metrics; the evaluation loop appends to each.
accuracy, precision, recall, auc = [], [], [], []

# Empty table with one column per column of alldata except RMS; the
# evaluation loop adds one row of fitted coefficients per run.
coefficients = pd.DataFrame(columns=alldata.columns.drop('RMS'))

In [59]:
# Repeated balanced-subsample evaluation of an L1-penalised logistic regression.
#
# For each of 10 sampling seeds, all abnormal rows are combined with an equally
# sized random draw of normal rows. Each balanced set is split 70/30 (stratified)
# with 10 different split seeds. Every run grid-searches C, stores the best
# model's coefficients as a row of `coefficients`, and appends the test-set
# precision, recall, accuracy and ROC AUC to the corresponding lists.
for seed in range(10):
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # Sampling len(abnormal) normal rows keeps the two classes balanced.
    newdata = pd.concat([abnormal, normal.sample(n=len(abnormal), random_state=seed)])

    # Target: True for rows at or below the RMS cut-off (the "normal" rows).
    newdata['performance'] = newdata['RMS'] <= 60

    # Rescale by fixed constants (presumably to bring these columns closer to
    # the scale of the other features before applying the L1 penalty).
    newdata['First_toggle_time'] = newdata['First_toggle_time'] / 10000
    newdata['Trial'] = newdata['Trial'] / 100

    # RMS defines the target, so it must not be used as a feature.
    X = newdata.drop(['RMS', 'performance'], axis=1)
    y = newdata['performance']

    for state in np.arange(20, 30):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=state, stratify=y)

        # The penalty is passed by keyword (positional use is rejected by
        # current scikit-learn). liblinear is named explicitly because the
        # present default solver (lbfgs) does not support an L1 penalty.
        logreg = LogisticRegression(penalty='l1', solver='liblinear')
        searcher = GridSearchCV(logreg, {'C': [0.001, 0.01, 0.1, 1, 10, 100]})
        searcher.fit(X_train, y_train)

        # coef_ has shape (1, n_features) for a binary target; flatten it so it
        # is stored as a single row keyed by the seed/split pair.
        run_id = str(seed) + str(state)
        coefficients.loc[run_id] = searcher.best_estimator_.coef_.ravel()

        y_pred = searcher.predict(X_test)
        y_true = y_test.values
        true_positives = np.sum(y_true & y_pred)
        # Precision is nan when no test row is predicted positive.
        precision.append(true_positives / np.sum(y_pred))
        recall.append(true_positives / np.sum(y_true))
        accuracy.append(np.sum(y_pred == y_true) / len(y_pred))

        # Column 1 of predict_proba is the probability of the True class.
        y_pred_prob = searcher.predict_proba(X_test)[:, 1]
        auc.append(roc_auc_score(y_test, y_pred_prob))

In [65]:
# Average every coefficient column over all rows of `coefficients` and keep the
# result as a one-column table indexed by feature name.
conclusion = coefficients.apply(np.mean).to_frame(name='Coefficient')

In [62]:
import pickle

In [63]:
# Bundle the four metric lists into one dict and pickle it to the working directory.
outcome = dict(accuracy=accuracy, precision=precision, recall=recall, auc=auc)
with open('outcome_logreg.pkl', 'wb+') as handle:
    pickle.dump(outcome, handle)

In [64]:
# Write the coefficient table (one row per seed/split run) to CSV.
# NOTE(review): absolute, machine-specific output path.
coefficients.to_csv('/home/wangmk/UM/Research/SURE/cleaned_data/coefficients_logreg.csv')

In [66]:
# Magnitude of each mean coefficient, used to rank features regardless of sign.
conclusion['abs_coefficient'] = conclusion['Coefficient'].abs()

In [67]:
# Display features in ascending order of absolute mean coefficient.
conclusion.sort_values('abs_coefficient')

Unnamed: 0,Coefficient,abs_coefficient
Identification,-0.008528,0.008528
Bottom_Right_Enemy,0.043128,0.043128
Alert,-0.101741,0.101741
Bottom_Right_Dark,0.120505,0.120505
Top_Left_Dark,-0.147959,0.147959
First_toggle_time,-0.178457,0.178457
Top_Right_Dark,0.195775,0.195775
Agree,0.20696,0.20696
Top_Right_Enemy,0.236424,0.236424
Bottom_Left_Dark,0.267693,0.267693
