## Logistic Regression

In [1]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [2]:
# read in the csv
df = pd.read_csv('../csv_files/Capstone_p1_final_Classification.csv',index_col=0)
df.head(2)

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.672228,0.357844,0.718132,0.601149,0.498663,0.265413,0.671275,0.907603,0.867653,0.864242,0.480322,0.501337,0.85586,0.773579,0
1,0.774289,0.272976,0.736725,0.499349,0.615201,0.176942,0.489684,0.749759,0.621326,0.758603,0.528431,0.384799,0.414273,0.799606,0


In [3]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [4]:
# Split dataset into training set and test set
# 70% training and 30% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

In [5]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_train_resample, Y_train_resample = ros.fit_resample(X_train, Y_train)



In [6]:
# initiate the logistic regression function 
logreg = LogisticRegression(solver='liblinear', random_state=2019)

# pass the training data into the model. This training data includes all of the independent variables 
logreg = logreg.fit(X_train_resample,Y_train_resample) 

# tell the model what to predict, or in this case classify, and what variables to use to predict the dependent variable 
Y_pred=logreg.predict(X_test)

In [7]:
# calculate accuracy, precision and recall? measures of the model 
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("Precision:",metrics.precision_score(Y_test, Y_pred))
print('Recall:',metrics.recall_score(Y_test, Y_pred))

Accuracy: 0.9170616113744076
Precision: 0.9420289855072463
Recall: 0.9128145114101814


In [8]:
# Calculating accuracy, F1 score, and AUC 
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print('F1_Score:', metrics.f1_score(Y_test, Y_pred, average='weighted'))
print('AUC:', metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

Accuracy: 0.9170616113744076
F1_Score: 0.9172727374407681
AUC: 0.9178530388376207


In [9]:
array = logreg.coef_
arrayT = array.T

In [10]:
coef = pd.DataFrame(X.columns)
coef['coef'] = arrayT
coef

Unnamed: 0,0,coef
0,H_FTPct,2.459139
1,H_EFGPct,11.635012
2,H_ThreePARt,-0.379603
3,H_FTR,4.452513
4,H_REBPct,4.126544
5,H_BLKPct,-0.009071
6,H_AST_TOV_Ratio,9.036526
7,A_FTPct,-2.269955
8,A_EFGPct,-11.085825
9,A_ThreePARt,-0.030256


In [11]:
coef.sort_values('coef')

Unnamed: 0,0,coef
8,A_EFGPct,-11.085825
13,A_AST_TOV_Ratio,-9.389043
10,A_FTR,-4.106754
11,A_REBPct,-3.765278
7,A_FTPct,-2.269955
2,H_ThreePARt,-0.379603
9,A_ThreePARt,-0.030256
5,H_BLKPct,-0.009071
12,A_BLKPct,0.036467
0,H_FTPct,2.459139


## Evaluation of LR

In [24]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []

#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=2019)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(logreg, X_train, Y_train, cv=kfold, scoring=scoring)

    #calculate f1-score and AUC
    clf_roc_auc = roc_auc_score(Y_test, Y_pred)
    f1_score_lst.append(precision_recall_fscore_support(Y_test, Y_pred, average='weighted')[2])
    auc_lst.append(clf_roc_auc)


print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

#result=logit_model.fit()
confusion_matrix_y = confusion_matrix(Y_test, Y_pred)


#print(result.summary())
print('Accuracy of classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))

print("10-fold cross validation average accuracy of classifier: %.3f" % (results.mean()))

print('Confusion Matrix for Logistic Regression Classfier:')
print(confusion_matrix_y)

print('Classification Report for Logistic Regression Classfier:')
print(classification_report(Y_test, Y_pred))

F1 0.9173; AUC 0.9179 
Accuracy of classifier on test set: 0.92
10-fold cross validation average accuracy of classifier: 0.926
Confusion Matrix for Logistic Regression Classfier:
[[1149   96]
 [ 149 1560]]
Classification Report for Logistic Regression Classfier:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      1245
           1       0.94      0.91      0.93      1709

    accuracy                           0.92      2954
   macro avg       0.91      0.92      0.92      2954
weighted avg       0.92      0.92      0.92      2954

