## Logistic Regression - Combined

In [1]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score

In [2]:
# read in the csv
df = pd.read_csv('../csv_files/Model_Ready_Classification.csv',index_col=0)
df.head(2)

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.833,0.461538,0.395604,0.32967,0.506173,0.036585,1.785714,0.952,0.628049,0.463415,0.256098,0.493827,0.10989,2.142857,0
1,0.885,0.430851,0.404255,0.276596,0.538462,0.02439,1.133333,0.87,0.542683,0.414634,0.280488,0.461538,0.053191,2.266667,0


In [3]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [4]:
# Split dataset into training set and test set
# 70% training and 30% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

In [5]:
# initiate the logistic regression function 
logreg = LogisticRegression(solver='liblinear')

# pass the training data into the model. This training data includes all of the independent variables 
logreg = logreg.fit(X_train,Y_train) 

# tell the model what to predict, or in this case classify, and what variables to use to predict the dependent variable 
Y_pred=logreg.predict(X_test)

In [6]:
X.columns

Index(['H_FTPct', 'H_EFGPct', 'H_ThreePARt', 'H_FTR', 'H_REBPct', 'H_BLKPct',
       'H_AST_TOV_Ratio', 'A_FTPct', 'A_EFGPct', 'A_ThreePARt', 'A_FTR',
       'A_REBPct', 'A_BLKPct', 'A_AST_TOV_Ratio'],
      dtype='object')

In [7]:
array = logreg.coef_
arrayT = array.T

In [8]:
coef = pd.DataFrame(X.columns)
coef['coef'] = arrayT
coef

Unnamed: 0,0,coef
0,H_FTPct,3.006877
1,H_EFGPct,16.985417
2,H_ThreePARt,-0.075831
3,H_FTR,5.45886
4,H_REBPct,8.340661
5,H_BLKPct,1.966318
6,H_AST_TOV_Ratio,1.324493
7,A_FTPct,-2.66899
8,A_EFGPct,-16.485376
9,A_ThreePARt,-0.237957


In [9]:
coef.sort_values('coef')

Unnamed: 0,0,coef
8,A_EFGPct,-16.485376
11,A_REBPct,-8.53419
10,A_FTR,-5.108213
7,A_FTPct,-2.66899
13,A_AST_TOV_Ratio,-1.429018
12,A_BLKPct,-0.820469
9,A_ThreePARt,-0.237957
2,H_ThreePARt,-0.075831
6,H_AST_TOV_Ratio,1.324493
5,H_BLKPct,1.966318


In [10]:
print(array.T)

[[  3.00687701]
 [ 16.98541736]
 [ -0.0758306 ]
 [  5.45886042]
 [  8.34066052]
 [  1.9663178 ]
 [  1.32449325]
 [ -2.66899028]
 [-16.4853755 ]
 [ -0.23795665]
 [ -5.10821253]
 [ -8.53419026]
 [ -0.82046932]
 [ -1.429018  ]]


In [11]:
logreg.coef_

array([[  3.00687701,  16.98541736,  -0.0758306 ,   5.45886042,
          8.34066052,   1.9663178 ,   1.32449325,  -2.66899028,
        -16.4853755 ,  -0.23795665,  -5.10821253,  -8.53419026,
         -0.82046932,  -1.429018  ]])

In [12]:
# calculate accuracy, precision and recall? measures of the model 
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("Precision:",metrics.precision_score(Y_test, Y_pred))
print('Recall:',metrics.recall_score(Y_test, Y_pred))

Accuracy: 0.915487849111353
Precision: 0.9211643420254699
Recall: 0.9364981504315659


In [13]:
# Calculating accuracy, F1 score, and AUC 
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print('F1_Score:', metrics.f1_score(Y_test, Y_pred, average='weighted'))
print('AUC:', metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

Accuracy: 0.915487849111353
F1_Score: 0.9153279904321571
AUC: 0.9109803527488226


In [19]:
# Calculating R^2
print('R^2:', metrics.r2_score(Y_test, Y_pred))

R^2: 0.6510638413445086


## Evaluation of LR

In [8]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []

#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=2019)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(logreg, X_train, Y_train, cv=kfold, scoring=scoring)

    #calculate f1-score and AUC
    clf_roc_auc = roc_auc_score(Y_test, Y_pred)
    f1_score_lst.append(precision_recall_fscore_support(Y_test, Y_pred, average='weighted')[2])
    auc_lst.append(clf_roc_auc)


print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

#result=logit_model.fit()
confusion_matrix_y = confusion_matrix(Y_test, Y_pred)


#print(result.summary())
print('Accuracy of classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))

print("10-fold cross validation average accuracy of classifier: %.3f" % (results.mean()))

print('Confusion Matrix for Logistic Regression Classfier:')
print(confusion_matrix_y)

print('Classification Report for Logistic Regression Classfier:')
print(classification_report(Y_test, Y_pred))

F1 0.9050; AUC 0.9006 
Accuracy of classifier on test set: 0.91
10-fold cross validation average accuracy of classifier: 0.916
Confusion Matrix for Logistic Regression Classfier:
[[1085  160]
 [ 120 1589]]
Classification Report for Logistic Regression Classfier:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      1245
           1       0.91      0.93      0.92      1709

    accuracy                           0.91      2954
   macro avg       0.90      0.90      0.90      2954
weighted avg       0.91      0.91      0.90      2954

