In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import seaborn as sns

In [2]:
# Read data
df = pd.read_csv("./Data/2018/LCS/NA LCS 2018 Spring Playoffs - Team Stats - OraclesElixir.csv")
# Replace NaN percents with 0%
df = df.fillna('0%')
# Calculate Win%
df['W%'] = (df['W'] / (df['W'] + df['L'])) * 100
df.head()

# Remove Team, Win, Loss columns
data = df.drop(['Team', 'W', 'L'], axis = 1)
data.head()

Unnamed: 0,GP,AGT,K,D,KD,CKPM,GPR,GSPD,EGR,MLR,...,DRG%,ELD%,FBN%,BN%,LNE%,JNG%,WPM,CWPM,WCPM,W%
0,8,37.0,61,79,0.77,0.47,-0.2,-3.4%,43.7,-6.2,...,39%,50%,25%,36%,50.4%,48.9%,3.48,1.09,1.65,37.5
1,3,38.8,22,30,0.73,0.45,0.07,-4.3%,53.7,-53.7,...,42%,50%,33%,20%,50.7%,48.1%,2.98,1.09,1.83,0.0
2,12,35.0,104,113,0.92,0.52,-0.6,-5.0%,42.9,-1.2,...,40%,50%,50%,47%,48.9%,45.6%,4.29,1.27,1.39,41.666667
3,7,27.9,72,84,0.86,0.8,1.53,6.9%,67.5,-10.4,...,60%,0%,43%,50%,50.1%,59.9%,3.57,0.85,1.05,57.142857
4,10,31.9,138,65,2.12,0.64,0.15,6.3%,54.8,35.2,...,74%,50%,80%,79%,50.5%,50.0%,3.95,1.24,1.32,90.0


In [3]:
# Convert percents to decimal form
for col in data:
    if data.dtypes[col] == 'object':
        data[col] = data[col].astype(str).str[:-1].astype('float') / 100.0
data.head()

Unnamed: 0,GP,AGT,K,D,KD,CKPM,GPR,GSPD,EGR,MLR,...,DRG%,ELD%,FBN%,BN%,LNE%,JNG%,WPM,CWPM,WCPM,W%
0,8,37.0,61,79,0.77,0.47,-0.2,-0.034,43.7,-6.2,...,0.39,0.5,0.25,0.36,0.504,0.489,3.48,1.09,1.65,37.5
1,3,38.8,22,30,0.73,0.45,0.07,-0.043,53.7,-53.7,...,0.42,0.5,0.33,0.2,0.507,0.481,2.98,1.09,1.83,0.0
2,12,35.0,104,113,0.92,0.52,-0.6,-0.05,42.9,-1.2,...,0.4,0.5,0.5,0.47,0.489,0.456,4.29,1.27,1.39,41.666667
3,7,27.9,72,84,0.86,0.8,1.53,0.069,67.5,-10.4,...,0.6,0.0,0.43,0.5,0.501,0.599,3.57,0.85,1.05,57.142857
4,10,31.9,138,65,2.12,0.64,0.15,0.063,54.8,35.2,...,0.74,0.5,0.8,0.79,0.505,0.5,3.95,1.24,1.32,90.0


In [4]:
# Convert W%
# 1 for >= 40% W%, 0 for < 40%

# Get unique winrates
wins = list(set(data.get('W%'))) 
# Extract winrates lower and higher than 40%
low = tuple([i for i in wins if i < 0.4])
high = tuple([i for i in wins if i >= 0.4])

# Replace winrates with binary 0/1
data.replace({low: 0, high: 1}, inplace = True)
data.head()

Unnamed: 0,GP,AGT,K,D,KD,CKPM,GPR,GSPD,EGR,MLR,...,DRG%,ELD%,FBN%,BN%,LNE%,JNG%,WPM,CWPM,WCPM,W%
0,8,37.0,61,79,0.77,0.47,-0.2,-0.034,43.7,-6.2,...,0.39,0.5,0.25,0.36,0.504,0.489,3.48,1.09,1.65,1.0
1,3,38.8,22,30,0.73,0.45,0.07,-0.043,53.7,-53.7,...,0.42,0.5,0.33,0.2,0.507,0.481,2.98,1.09,1.83,0.0
2,12,35.0,104,113,0.92,0.52,-0.6,-0.05,42.9,-1.2,...,0.4,0.5,0.5,0.47,0.489,0.456,4.29,1.27,1.39,1.0
3,7,27.9,72,84,0.86,0.8,1.53,0.069,67.5,-10.4,...,0.6,0.0,0.43,0.5,0.501,0.599,3.57,0.85,1.05,1.0
4,10,31.9,138,65,2.12,0.64,0.15,0.063,54.8,35.2,...,0.74,0.5,0.8,0.79,0.505,0.5,3.95,1.24,1.32,1.0


In [5]:
# SKLearn Logistic Regression process derived from https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

# Get features and remove target
features = list(data.columns)
features.remove('W%') 

# Separating out the features
X = data.loc[:, features].values 
# Separating out the target
y = data.loc[:,['W%']].values

# Split data into train and test
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [6]:
# Instantiate the model
logMod = LogisticRegression(max_iter = 1000)

# Fit the model
logMod.fit(X_train, y_train.ravel())

# Get prediction
yPred = logMod.predict(X_test)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0

In [None]:
cnf_matrix = metrics.confusion_matrix(y_test, yPred)
cnf_matrix

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, yPred))
print("Precision:",metrics.precision_score(y_test, yPred))
print("Recall:",metrics.recall_score(y_test, yPred))