In [103]:
# this is a test of classification algorithms with scikit
#  the basic application is to predict whether or not a team will win based on the ADC character matchup, and year
LEAGUE_PATH = "datasets/lol/LeagueofLegends.csv"

import pandas as pd

def load_lol_data(file_path=LEAGUE_PATH):
    return pd.read_csv(file_path)

In [104]:
matches = load_lol_data()
matches.head()

Unnamed: 0,MatchHistory,League,Season,Year,blueTeamTag,bResult,rResult,redTeamTag,gamelength,blueTop,...,redTop,redTopChamp,redJungle,redJungleChamp,redMiddle,redMiddleChamp,redADC,redADCChamp,redSupportChamp,redSupport
0,http://matchhistory.na.leagueoflegends.com/en/...,North_America,Spring_Season,2015,TSM,1,0,C9,40,Dyrus,...,Balls,Gnar,Meteos,Elise,Hai,Fizz,Sneaky,Sivir,Thresh,LemonNation
1,http://matchhistory.na.leagueoflegends.com/en/...,North_America,Spring_Season,2015,CST,0,1,DIG,38,Cris,...,Gamsu,Irelia,Crumbzz,JarvanIV,Shiphtur,Azir,CoreJJ,Corki,Annie,KiWiKiD
2,http://matchhistory.na.leagueoflegends.com/en/...,North_America,Spring_Season,2015,WFX,1,0,GV,40,Flaresz,...,Hauntzer,Sion,Saintvicious,LeeSin,Keane,Azir,Cop,Corki,Janna,BunnyFuFuu
3,http://matchhistory.na.leagueoflegends.com/en/...,North_America,Spring_Season,2015,TIP,0,1,TL,41,Rhux,...,Quas,Gnar,IWDominate,Nunu,Fenix,Lulu,KEITH,KogMaw,Janna,Xpecial
4,http://matchhistory.na.leagueoflegends.com/en/...,North_America,Spring_Season,2015,CLG,1,0,T8,35,Benny,...,CaliTrlolz8,Sion,Porpoise8,RekSai,Slooshi8,Lulu,Maplestreet8,Corki,Annie,Dodo8


In [105]:
from typing import Any, List

def process_line(line: List[Any]) -> List[Any]:
    """
    processes a single row in the above and table and turns it into the lists
    to add to the new processed data set
    
    format to return:
    ["Year", "Win", "perADCChamp", "oppADCChamp"]
    """
    
    red_row = [line[6], line[3], line[26], line[16]]
    blue_row = [line[5], line[3], line[16], line[26]]
    
    return [red_row]
    
    

In [106]:
# loop through dataframes object to get thing we want

processed_games = []
max_len = 0
for idx, row in matches.iterrows():
    result = process_line(row)
    processed_games.extend(result)
    if idx > max_len:
        max_len = idx

max_len += 1 # because its 0-index

print("Length of unprocessed games: %d" % max_len)
print("Length of processed games: %d" % len(processed_games))

Length of unprocessed games: 3645
Length of processed games: 3645


In [107]:
processed_games[1]

[1, 2015, 'Corki', 'Caitlyn']

In [108]:
# turn it back into dataframes so we can do some analysis

matches_panda = pd.DataFrame(processed_games, columns=['Win', 'Year', 'OwnADC', 'OppADC'])

In [109]:
matches_panda.head()

Unnamed: 0,Win,Year,OwnADC,OppADC
0,0,2015,Sivir,Jinx
1,1,2015,Corki,Caitlyn
2,0,2015,Corki,Sivir
3,1,2015,KogMaw,Sivir
4,0,2015,Corki,Tristana


In [110]:
matches_panda["OwnADC"].value_counts()

Sivir          615
Lucian         472
Ashe           395
Ezreal         366
Jhin           362
Corki          298
Kalista        257
Caitlyn        185
Varus          150
Tristana       105
KogMaw         101
Jinx            95
Vayne           65
Graves          60
Twitch          38
Urgot           32
Draven          15
Ziggs           13
MissFortune     10
Kennen           4
Kindred          2
Jayce            2
Lulu             1
Quinn            1
Janna            1
Name: OwnADC, dtype: int64

In [111]:
matches_panda["OppADC"].value_counts()

Sivir          556
Lucian         468
Ashe           366
Jhin           364
Ezreal         343
Kalista        325
Corki          269
Caitlyn        188
Varus          159
KogMaw         115
Tristana       114
Graves          81
Jinx            80
Vayne           72
Twitch          31
Urgot           29
MissFortune     29
Ziggs           26
Draven          15
Kennen           8
Mordekaiser      7
Name: OppADC, dtype: int64

In [112]:
# time to set up data pipeline to get this show on the road
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

ready_attrs = ["Win", "Year"]
own_adc_carry = ["OwnADC"]
opp_adc_carry = ["OppADC"]    

norm_pipeline = Pipeline([
    ('selector', DataFrameSelector(ready_attrs)),
])

own_adc_carry_pipe = Pipeline([
    ('selector', DataFrameSelector(own_adc_carry)),
    ('label_binarizer', LabelBinarizer()),
])

opp_adc_carry_pipe = Pipeline([
    ('selector', DataFrameSelector(opp_adc_carry)),
    ('label_binarizer', LabelBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('norm_pipeline', norm_pipeline),
    ('own_adc_pipeline', own_adc_carry_pipe),
    ('opp_adc_pipeline', opp_adc_carry_pipe),
])

In [113]:
ready_data = full_pipeline.fit_transform(matches_panda)

In [114]:
ready_data.shape

(3645, 48)

In [115]:
# grabs ONLY the first column (which is label)
match_labels = ready_data[:,0].copy()

In [116]:
# grabs all data EXCEPT for first column (which is label)
match_data = ready_data[:,1:].copy()

In [117]:
# time to shuffle the data up
import numpy as np

shuffle_index = np.random.permutation(3645)
shuffled_X_train, shuffled_y_train = match_data[shuffle_index], match_labels[shuffle_index]

In [118]:
# time to fit a basic model!!

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(shuffled_X_train, shuffled_y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [119]:
sgd_clf.predict(shuffled_X_train[21])



array([1])

In [120]:
# let's do some cross validation
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, shuffled_X_train, shuffled_y_train, cv=3, scoring="accuracy")

array([ 0.45476974,  0.45514403,  0.54530478])

In [121]:
# well that was gad, let's mess with grid search
from sklearn.model_selection import GridSearchCV

parameters = {
    'loss': ('log', 'hinge'),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.0001, 0.00001, 0.000001]
}

classifier = SGDClassifier()

grid_search = GridSearchCV(classifier, parameters, cv=5, scoring='accuracy')
grid_search.fit(shuffled_X_train, shuffled_y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.001, 0.0001, 1e-05, 1e-06], 'penalty': ['l1', 'l2', 'elasticnet'], 'loss': ('log', 'hinge')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [122]:
# let's look at eaulation scores
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.472976680384 {'loss': 'log', 'penalty': 'l1', 'alpha': 0.001}
0.527023319616 {'loss': 'log', 'penalty': 'l2', 'alpha': 0.001}
0.490809327846 {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.001}
0.542661179698 {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.001}
0.527023319616 {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.001}
0.513580246914 {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.001}
0.508916323731 {'loss': 'log', 'penalty': 'l1', 'alpha': 0.0001}
0.509190672154 {'loss': 'log', 'penalty': 'l2', 'alpha': 0.0001}
0.527023319616 {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.0001}
0.504526748971 {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.0001}
0.490809327846 {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.0001}
0.491083676269 {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.0001}
0.508916323731 {'loss': 'log', 'penalty': 'l1', 'alpha': 1e-05}
0.508916323731 {'loss': 'log', 'penalty': 'l2', 'alpha': 1e-05}
0.491083676269 {'loss': 'log', 'penalty': 'elasticnet'

In [123]:
grid_search.best_estimator_

SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [124]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(grid_search.best_estimator_, shuffled_X_train, shuffled_y_train, cv=3)

In [130]:
from sklearn.metrics import confusion_matrix
confusion_matrix(shuffled_y_train, y_train_pred)

array([[ 662, 1325],
       [ 553, 1105]])