## Logistic Regression - Away Stats

In [14]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [15]:
# read in the csv
df = pd.read_csv('../csv_files/Away_Only_Classification.csv',index_col=0)
df.head(2)

Unnamed: 0,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.952,0.628049,0.463415,0.256098,0.493827,0.10989,2.142857,1
1,0.87,0.542683,0.414634,0.280488,0.461538,0.053191,2.266667,1


In [16]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [17]:
# Split dataset into training set and test set
# 70% training and 30% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

In [18]:
# initiate the logistic regression function 
logreg = LogisticRegression(solver='liblinear')

# pass the training data into the model. This training data includes all of the independent variables 
logreg = logreg.fit(X_train,Y_train) 

# tell the model what to predict, or in this case classify, and what variables to use to predict the dependent variable 
Y_pred=logreg.predict(X_test)

In [19]:
X.columns

Index(['A_FTPct', 'A_EFGPct', 'A_ThreePARt', 'A_FTR', 'A_REBPct', 'A_BLKPct',
       'A_AST_TOV_Ratio'],
      dtype='object')

In [20]:
array = logreg.coef_
arrayT = array.T

In [21]:
coef = pd.DataFrame(X.columns)
coef['coef'] = arrayT
coef

Unnamed: 0,0,coef
0,A_FTPct,1.200421
1,A_EFGPct,11.091904
2,A_ThreePARt,-0.897082
3,A_FTR,3.37159
4,A_REBPct,10.756754
5,A_BLKPct,6.180153
6,A_AST_TOV_Ratio,0.745166


In [22]:
coef.sort_values('coef')

Unnamed: 0,0,coef
2,A_ThreePARt,-0.897082
6,A_AST_TOV_Ratio,0.745166
0,A_FTPct,1.200421
3,A_FTR,3.37159
5,A_BLKPct,6.180153
4,A_REBPct,10.756754
1,A_EFGPct,11.091904


In [23]:
print(array.T)

[[ 1.20042113]
 [11.09190425]
 [-0.89708172]
 [ 3.37159037]
 [10.75675384]
 [ 6.1801531 ]
 [ 0.74516558]]


In [24]:
logreg.coef_

array([[ 1.20042113, 11.09190425, -0.89708172,  3.37159037, 10.75675384,
         6.1801531 ,  0.74516558]])

In [25]:
# calculate accuracy, precision and recall? measures of the model 
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("Precision:",metrics.precision_score(Y_test, Y_pred))
print('Recall:',metrics.recall_score(Y_test, Y_pred))

Accuracy: 0.7782667569397427
Precision: 0.7842003853564548
Recall: 0.6538152610441768


In [26]:
# Calculating accuracy, F1 score, and AUC 
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print('F1_Score:', metrics.f1_score(Y_test, Y_pred, average='weighted'))
print('AUC:', metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

Accuracy: 0.7782667569397427
F1_Score: 0.7745453278145638
AUC: 0.7613722297028959


In [27]:
# Model Accuracy, how often is the Decision Tree correct?
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

# We are going to look at the classification report and also the confusion matrix for the Decision Tree  
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))

Accuracy: 0.7782667569397427
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      1709
           1       0.78      0.65      0.71      1245

    accuracy                           0.78      2954
   macro avg       0.78      0.76      0.77      2954
weighted avg       0.78      0.78      0.77      2954

[[1485  224]
 [ 431  814]]


## Evaluation of LR

In [8]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []

#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=2019)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(logreg, X_train, Y_train, cv=kfold, scoring=scoring)

    #calculate f1-score and AUC
    clf_roc_auc = roc_auc_score(Y_test, Y_pred)
    f1_score_lst.append(precision_recall_fscore_support(Y_test, Y_pred, average='weighted')[2])
    auc_lst.append(clf_roc_auc)


print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

#result=logit_model.fit()
confusion_matrix_y = confusion_matrix(Y_test, Y_pred)


#print(result.summary())
print('Accuracy of classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))

print("10-fold cross validation average accuracy of classifier: %.3f" % (results.mean()))

print('Confusion Matrix for Logistic Regression Classfier:')
print(confusion_matrix_y)

print('Classification Report for Logistic Regression Classfier:')
print(classification_report(Y_test, Y_pred))

F1 0.9050; AUC 0.9006 
Accuracy of classifier on test set: 0.91
10-fold cross validation average accuracy of classifier: 0.916
Confusion Matrix for Logistic Regression Classfier:
[[1085  160]
 [ 120 1589]]
Classification Report for Logistic Regression Classfier:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      1245
           1       0.91      0.93      0.92      1709

    accuracy                           0.91      2954
   macro avg       0.90      0.90      0.90      2954
weighted avg       0.91      0.91      0.90      2954

