In [34]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import seaborn as sns
import statsmodels.api as sm

In [35]:
# Read data
df = pd.read_csv("./All Data.csv")

Unnamed: 0,AGT,K,D,KD,CKPM,GSPD,GD15,FB%,FT%,F3T%,...,BN%,LNE%,JNG%,WPM,CWPM,WCPM,win_rate,EGR,MLR,year
count,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,...,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0
mean,34.390476,125.087719,125.428571,1.012005,0.746416,-0.012301,-135.273183,0.444862,0.43589,0.425915,...,0.440426,0.447987,0.453461,3.219173,0.975764,1.288195,44.597018,32.896491,-2.355388,2017.949875
std,3.457307,92.600155,84.89899,0.53649,0.146694,0.075798,1237.030261,0.245878,0.241694,0.259136,...,0.233307,0.152075,0.141766,1.076066,0.439101,0.473416,25.363616,24.840955,15.627106,2.068547
min,24.5,5.0,8.0,0.17,0.33,-0.326,-5932.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-56.3,2015.0
25%,32.05,54.0,66.0,0.72,0.65,-0.049,-744.5,0.31,0.305,0.25,...,0.29,0.4905,0.4605,3.075,0.695,1.14,26.39,0.0,-8.95,2016.0
50%,34.3,104.0,95.0,0.95,0.74,0.0,0.0,0.5,0.46,0.45,...,0.46,0.498,0.492,3.44,1.1,1.39,42.86,41.1,0.0,2018.0
75%,36.4,176.5,161.5,1.22,0.85,0.0285,558.0,0.61,0.61,0.61,...,0.6,0.505,0.517,3.77,1.3,1.575,60.555,53.2,1.2,2019.0
max,48.5,477.0,460.0,7.0,1.13,0.269,3129.0,1.0,1.0,1.0,...,1.0,0.521,0.613,5.41,1.78,2.14,100.0,82.3,53.4,2021.0


In [16]:
# Convert W%
# 1 for >= 40% W%, 0 for < 40%

# Get unique winrates
wins = list(set(data.get('win_rate'))) 
# Extract winrates lower and higher than 40%
low = tuple([i for i in wins if i < 0.4])
high = tuple([i for i in wins if i >= 0.4])

# Replace winrates with binary 0/1
data.replace({low: 0, high: 1}, inplace = True)
data.head()

Unnamed: 0,AGT,K,D,KD,CKPM,GSPD,GD15,FB%,FT%,F3T%,...,BN%,LNE%,JNG%,WPM,CWPM,WCPM,win_rate,EGR,MLR,year
0,42.7,239,235,1.02,0.85,0.0,0.0,0.0,0.0,0.0,...,0.4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2015
1,35.8,214,269,0.8,0.96,-0.062,-1303.0,0.5,0.36,0.29,...,0.53,0.495,0.446,2.21,0.32,0.69,1.0,0.0,0.0,2015
2,39.3,80,121,0.66,0.85,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2015
3,33.9,291,175,1.66,0.98,0.153,867.0,0.43,0.79,0.86,...,0.88,0.52,0.506,2.37,0.34,0.82,1.0,0.0,0.0,2015
4,27.6,27,110,0.25,0.99,-0.326,-5932.0,0.2,0.0,0.0,...,0.0,0.457,0.387,2.82,0.27,0.6,0.0,0.0,0.0,2015


In [17]:
# SKLearn Logistic Regression and statsmodels process derived from https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

# Get features and remove target
features = list(data.columns)
features.remove('win_rate') 

# Separating out the features
X = pd.DataFrame(data.loc[:, features])
# Separating out the target
y = pd.DataFrame(data.loc[:,['win_rate']])

# Split data into train and test
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [18]:
# Instantiate the model
logMod = LogisticRegression(max_iter = 1000)

# Fit the model
logMod.fit(X_train, y_train.to_numpy().ravel())

# Get prediction
yPred = logMod.predict(X_test)

In [19]:
cnf_matrix = metrics.confusion_matrix(y_test, yPred)
cnf_matrix

array([[12,  2],
       [ 0, 86]], dtype=int64)

In [20]:
print("Accuracy:",metrics.accuracy_score(y_test, yPred))
print("Precision:",metrics.precision_score(y_test, yPred))
print("Recall:",metrics.recall_score(y_test, yPred))

Accuracy: 0.98
Precision: 0.9772727272727273
Recall: 1.0


In [23]:
rfe = RFE(logMod)
rfe = rfe.fit(X, y.values.ravel())

print(rfe.support_)
print(rfe.ranking_)

[ True False False  True  True False  True  True  True False False False]
[1 4 2 1 1 6 1 1 1 3 7 5]


In [24]:
X = X.loc[:, rfe.support_]
logit_model=sm.Logit(y,X)
result=logit_model.fit(method = 'bfgs')
print(result.summary2())

         Current function value: 0.205945
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.440     
Dependent Variable: win_rate         AIC:              176.3444  
Date:               2022-04-29 18:32 BIC:              200.2782  
No. Observations:   399              Log-Likelihood:   -82.172   
Df Model:           5                LL-Null:          -146.64   
Df Residuals:       393              LLR p-value:      3.9957e-26
Converged:          0.0000           Scale:            1.0000    
-------------------------------------------------------------------
           Coef.    Std.Err.      z      P>|z|     [0.025    0.975]
-------------------------------------------------------------------
GSPD      12.3789     2.6112    4.7408   0.0000    7.2611   17.4966
F3T%      -2.1634     1.1196   -1.9323   0.0533   -4.3579    0.0310
HLD%       0.9864     0.

