## Logistic Regression

In [1]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [2]:
# read in the csv
df = pd.read_csv('../csv_files/Capstone_p2_final_Classification.csv',index_col=0)
df.head(2)

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.833,0.461538,0.395604,0.574169,0.506173,0.191273,1.10146,0.952,0.628049,0.463415,0.71138,0.493827,0.10989,1.135444,0
1,0.885,0.430851,0.404255,0.525924,0.538462,0.156174,1.02108,0.87,0.542683,0.414634,0.727744,0.461538,0.053191,1.146123,0


In [3]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [4]:
# Split dataset into training set and test set
# 70% training and 30% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

In [5]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_train_resample, Y_train_resample = ros.fit_resample(X_train, Y_train)



In [6]:
# initiate the logistic regression function 
logreg = LogisticRegression(solver='liblinear', random_state=2019)

# pass the training data into the model. This training data includes all of the independent variables 
logreg = logreg.fit(X_train_resample,Y_train_resample) 

# tell the model what to predict, or in this case classify, and what variables to use to predict the dependent variable 
Y_pred=logreg.predict(X_test)

In [7]:
# calculate accuracy, precision and recall? measures of the model 
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("Precision:",metrics.precision_score(Y_test, Y_pred))
print('Recall:',metrics.recall_score(Y_test, Y_pred))

Accuracy: 0.9146919431279621
Precision: 0.9417828987265009
Recall: 0.9087185488589818


In [8]:
# Calculating accuracy, F1 score, and AUC 
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print('F1_Score:', metrics.f1_score(Y_test, Y_pred, average='weighted'))
print('AUC:', metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

Accuracy: 0.9146919431279621
F1_Score: 0.9149334345968076
AUC: 0.9158050575620209


In [9]:
array = logreg.coef_
arrayT = array.T

In [10]:
coef = pd.DataFrame(X.columns)
coef['coef'] = arrayT
coef

Unnamed: 0,0,coef
0,H_FTPct,3.013265
1,H_EFGPct,18.193296
2,H_ThreePARt,-0.388624
3,H_FTR,5.556853
4,H_REBPct,8.940674
5,H_BLKPct,1.086536
6,H_AST_TOV_Ratio,12.064279
7,A_FTPct,-2.82817
8,A_EFGPct,-17.613604
9,A_ThreePARt,-0.220391


In [11]:
coef.sort_values('coef')

Unnamed: 0,0,coef
8,A_EFGPct,-17.613604
13,A_AST_TOV_Ratio,-12.070386
11,A_REBPct,-8.0399
10,A_FTR,-6.732477
7,A_FTPct,-2.82817
12,A_BLKPct,-1.174212
2,H_ThreePARt,-0.388624
9,A_ThreePARt,-0.220391
5,H_BLKPct,1.086536
0,H_FTPct,3.013265


## Evaluation of LR

In [9]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []

#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=2019)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(logreg, X_train, Y_train, cv=kfold, scoring=scoring)

    #calculate f1-score and AUC
    clf_roc_auc = roc_auc_score(Y_test, Y_pred)
    f1_score_lst.append(precision_recall_fscore_support(Y_test, Y_pred, average='weighted')[2])
    auc_lst.append(clf_roc_auc)


print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

#result=logit_model.fit()
confusion_matrix_y = confusion_matrix(Y_test, Y_pred)


#print(result.summary())
print('Accuracy of classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))

print("10-fold cross validation average accuracy of classifier: %.3f" % (results.mean()))

print('Confusion Matrix for Logistic Regression Classfier:')
print(confusion_matrix_y)

print('Classification Report for Logistic Regression Classfier:')
print(classification_report(Y_test, Y_pred))

F1 0.9149; AUC 0.9158 
Accuracy of classifier on test set: 0.91
10-fold cross validation average accuracy of classifier: 0.924
Confusion Matrix for Logistic Regression Classfier:
[[1149   96]
 [ 156 1553]]
Classification Report for Logistic Regression Classfier:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      1245
           1       0.94      0.91      0.92      1709

    accuracy                           0.91      2954
   macro avg       0.91      0.92      0.91      2954
weighted avg       0.92      0.91      0.91      2954

