In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import seaborn as sns

In [2]:
# Read data and remove nan and irrelevant columns
df = pd.read_csv("./Data/2018/LCS/NA LCS 2018 Spring Playoffs - Team Stats - OraclesElixir.csv")
data = df.drop(['STL', 'Player', 'Team', 'Pos', 'D%P15'], axis = 1)
df.head()

Unnamed: 0,Player,Team,Pos,GP,W%,CTR%,K,D,A,KDA,...,CS%P15,DPM,DMG%,D%P15,EGPM,GOLD%,STL,WPM,CWPM,WCPM
0,Adrian,Echo Fox,Support,7,57%,71%,1,17,39,2.4,...,4.0%,164,8.2%,,111,9.1%,,1.3,0.41,0.21
1,Altec,Echo Fox,ADC,7,57%,57%,15,15,18,2.2,...,27.9%,504,23.9%,,274,23.2%,,0.38,0.09,0.28
2,aphromoo,100 Thieves,Support,8,38%,63%,1,18,37,2.1,...,4.9%,130,8.2%,,97,8.9%,,1.1,0.4,0.29
3,Apollo,Clutch Gaming,ADC,12,42%,58%,27,16,50,4.8,...,29.4%,561,31.3%,,272,26.3%,,0.46,0.17,0.31
4,Bjergsen,TSM,Middle,4,25%,75%,8,3,12,6.7,...,29.8%,546,32.3%,,281,27.3%,,0.47,0.21,0.25


In [3]:
# Convert percents to decimal form
for col in data:
    if data.dtypes[col] == 'object':
        data[col] = data[col].astype(str).str[:-1].astype('float') / 100.0
data.head()

Unnamed: 0,GP,W%,CTR%,K,D,A,KDA,KP,KS%,DTH%,...,CSD10,CSPM,CS%P15,DPM,DMG%,EGPM,GOLD%,WPM,CWPM,WCPM
0,7,0.57,0.71,1,17,39,2.4,0.556,0.014,0.202,...,-4.6,1.3,0.04,164,0.082,111,0.091,1.3,0.41,0.21
1,7,0.57,0.57,15,15,18,2.2,0.458,0.208,0.179,...,1.7,9.5,0.279,504,0.239,274,0.232,0.38,0.09,0.28
2,8,0.38,0.63,1,18,37,2.1,0.623,0.016,0.228,...,5.3,1.6,0.049,130,0.082,97,0.089,1.1,0.4,0.29
3,12,0.42,0.58,27,16,50,4.8,0.74,0.26,0.142,...,-2.9,9.5,0.294,561,0.313,272,0.263,0.46,0.17,0.31
4,4,0.25,0.75,8,3,12,6.7,0.87,0.348,0.061,...,3.0,10.3,0.298,546,0.323,281,0.273,0.47,0.21,0.25


In [4]:
# Convert W%
# 1 for >= 40% W%, 0 for < 40%

# Get unique winrates
wins = list(set(data.get('W%'))) 
# Extract winrates lower and higher than 40%
low = tuple([i for i in wins if i < 0.4])
high = tuple([i for i in wins if i >= 0.4])

# Replace winrates with binary 0/1
data.replace({low: 0, high: 1}, inplace = True)
data.head()

Unnamed: 0,GP,W%,CTR%,K,D,A,KDA,KP,KS%,DTH%,...,CSD10,CSPM,CS%P15,DPM,DMG%,EGPM,GOLD%,WPM,CWPM,WCPM
0,7,1.0,0.71,1,17,39,2.4,0.556,0.014,0.202,...,-4.6,1.3,0.04,164,0.082,111,0.091,1.3,0.41,0.21
1,7,1.0,1.0,15,15,18,2.2,0.458,0.208,0.179,...,1.7,9.5,0.279,504,0.239,274,0.232,0.0,0.09,0.28
2,8,0.0,0.63,1,18,37,2.1,0.623,0.016,0.228,...,5.3,1.6,0.049,130,0.082,97,0.089,1.1,0.4,0.29
3,12,1.0,0.58,27,16,50,4.8,0.74,0.26,0.142,...,-2.9,9.5,0.294,561,0.313,272,0.263,0.46,0.17,0.31
4,4,0.0,0.75,8,3,12,6.7,0.87,0.348,0.061,...,3.0,10.3,0.298,546,0.323,281,0.273,0.47,0.21,0.0


In [10]:
# SKLearn Logistic Regression process derived from https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

# Get features and remove target
features = list(data.columns)
features.remove('W%') 

# Separating out the features
X = data.loc[:, features].values 
# Separating out the target
y = data.loc[:,['W%']].values

# Split data into train and test
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [11]:
# Instantiate the model
logMod = LogisticRegression(max_iter = 1000)

# Fit the model
logMod.fit(X_train, y_train.ravel())

# Get prediction
yPred = logMod.predict(X_test)

In [12]:
cnf_matrix = metrics.confusion_matrix(y_test, yPred)
cnf_matrix

array([[4, 1],
       [1, 2]], dtype=int64)

In [13]:
print("Accuracy:",metrics.accuracy_score(y_test, yPred))
print("Precision:",metrics.precision_score(y_test, yPred))
print("Recall:",metrics.recall_score(y_test, yPred))

Accuracy: 0.75
Precision: 0.6666666666666666
Recall: 0.6666666666666666
