# 2022 LOL Analysis

**Name(s)**: Owen Shi

**Website Link**: https://oowenn.github.io/2022lolanalysis/

In [306]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.graph_objects as go
import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.

pd.set_option('display.max_columns', None)

# pd.set_option('display.max_rows', len(s))
# print(s)
# pd.reset_option('display.max_rows')

## Step 1: Introduction

In [307]:
df = pd.read_csv('2022_LoL_esports_match_data_from_OraclesElixir.csv', low_memory=False)

### Questions

- Does side give a significant advantage in a game?

- Looking at tier-one professional leagues, which league has the most “action-packed” games? Is the amount of “action” in this league significantly different than in other leagues? We can define action using factors including kills per minute, multikills, and/or objectives taken. (pick this one probably)



## Step 2: Data Cleaning and Exploratory Data Analysis

### Cleaning

In [309]:
# isolate the team rows
teams = df.loc[(df['position'] == 'team')]

# isolate the tier 1 leagues
abbrs = [
    'LCK',
    'LPL',
    'LEC',
    'LCS',
    'PCS',
    'VCS',
    'CBLOL',
    'LLA'
]
teams = teams.loc[teams['league'].isin(abbrs)]

# drop cols relating to individual players
cols_to_drop = [
    'playername', 
    'playerid', 
    'champion', 
    'firstbloodkill',
    'firstbloodassist', 
    'firstbloodvictim', 
    'damageshare',
    'earnedgoldshare',
    'kills',
    'deaths'
]
teams = teams.drop(columns=cols_to_drop)

# add diffs columns
teams['killsdif'] = teams['teamkills'] - teams['teamdeaths']
teams['eldersdif'] = teams['elders'] - teams['opp_elders']
teams['heraldsdif'] = teams['heralds'] - teams['opp_heralds']
teams['baronsdif'] = teams['barons'] - teams['opp_barons']
teams['towersdif'] = teams['towers'] - teams['opp_towers']
teams['inhibsdif'] = teams['inhibitors'] - teams['opp_inhibitors']

# rename team kpm to kpm
teams = teams.rename(columns={'team kpm': 'kpm'})

# cast appropriate columns to bool
bool_cols = [
    'result',
    'firstblood',
    'firstdragon',
    'firstherald',
    'firstbaron',
    'firsttower',
    'firstmidtower',
    'firsttothreetowers'
]
teams[bool_cols] = teams[bool_cols].fillna(0).astype(bool)

# change 'side' column to 'red_side' bool column
teams['red_side'] = (teams['side'] == 'Red').astype(bool)
teams = teams.drop(columns=['side'])

# insert multikills column
teams['multikills'] = teams[['doublekills', 'triplekills', 'quadrakills', 'pentakills']].sum(axis=1)

# insert objectives captured column
teams['objectives captured'] = teams[['dragons', 'heralds', 'barons']].sum(axis=1)

# insert picks and bans columns
teams['picks'] = teams.apply(lambda x: [x[f'pick{i}'] for i in range(1, 6)] , axis=1)
teams['bans'] = teams.apply(lambda x: [x[f'ban{i}'] for i in range(1, 6)] , axis=1)
teams = teams.drop(columns=['pick1', 'pick2', 'pick3', 'pick4', 'pick5', 
                            'ban1', 'ban2', 'ban3', 'ban4', 'ban5'])

# # remove incomplete rows
# teams = teams.loc[teams['datacompleteness'] == 'complete']

# remove irrelevant columns
teams = teams.drop(
            columns=['url',
                     'gameid', 
                     'year', 
                     'split', 
                     'playoffs',
                     'date',
                     'game',
                     'patch',
                     'participantid',
                     'position',
                     'void_grubs', # did not exist in 2022
                     'opp_void_grubs', # did not exist in 2022
            ]
        )

# fill na values
cols = ['dragons (type unknown)']
teams[cols] = teams[cols].fillna(0)

teams.head()

Unnamed: 0,datacompleteness,league,teamname,teamid,gamelength,result,assists,teamkills,teamdeaths,doublekills,triplekills,quadrakills,pentakills,firstblood,kpm,ckpm,firstdragon,dragons,opp_dragons,elementaldrakes,opp_elementaldrakes,infernals,mountains,clouds,oceans,chemtechs,hextechs,dragons (type unknown),elders,opp_elders,firstherald,heralds,opp_heralds,firstbaron,barons,opp_barons,firsttower,towers,opp_towers,firstmidtower,firsttothreetowers,turretplates,opp_turretplates,inhibitors,opp_inhibitors,damagetochampions,dpm,damagetakenperminute,damagemitigatedperminute,wardsplaced,wpm,wardskilled,wcpm,controlwardsbought,visionscore,vspm,totalgold,earnedgold,earned gpm,goldspent,gspd,gpr,total cs,minionkills,monsterkills,monsterkillsownjungle,monsterkillsenemyjungle,cspm,goldat10,xpat10,csat10,opp_goldat10,opp_xpat10,opp_csat10,golddiffat10,xpdiffat10,csdiffat10,killsat10,assistsat10,deathsat10,opp_killsat10,opp_assistsat10,opp_deathsat10,goldat15,xpat15,csat15,opp_goldat15,opp_xpat15,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15,killsdif,eldersdif,heraldsdif,baronsdif,towersdif,inhibsdif,red_side,multikills,objectives captured,picks,bans
34,partial,LPL,Oh My God,oe:team:f4c4528c6981e104a11ea7548630c23,1365,True,35,13,6,,,,,False,0.57,0.84,False,2.0,1.0,,,,,,,,,2.0,,,False,,,False,1.0,0.0,False,8.0,3.0,False,False,,,1.0,0.0,40086.0,1762.02,2263.25,,79.0,3.47,33.0,1.45,32.0,162.0,7.12,45468,30167.0,1326.02,36908.0,-0.00586,,,,172.0,98.0,18.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7,,,1.0,5.0,1.0,False,0.0,3.0,"[Jinx, Jarvan IV, Nautilus, Syndra, Gwen]","[Renekton, Lee Sin, Caitlyn, Jayce, Camille]"
35,partial,LPL,ThunderTalk Gaming,oe:team:df80f468a3f9a722df056fe9104f052,1365,False,11,6,13,,,,,True,0.26,0.84,False,1.0,2.0,,,,,,,,,1.0,,,False,,,False,0.0,1.0,False,3.0,8.0,False,False,,,0.0,1.0,30417.0,1337.01,2541.89,,64.0,2.81,34.0,1.49,26.0,155.0,6.81,38538,23237.0,1021.41,37125.0,0.00586,,,,116.0,94.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-7,,,-1.0,-5.0,-1.0,True,0.0,1.0,"[Xin Zhao, Thresh, Aphelios, Vex, Jax]","[Samira, Diana, Akali, LeBlanc, Rumble]"
58,partial,LPL,Oh My God,oe:team:f4c4528c6981e104a11ea7548630c23,1444,True,40,22,9,,,,,True,0.91,1.25,False,2.0,1.0,,,,,,,,,2.0,,,False,,,False,1.0,0.0,False,9.0,2.0,False,False,,,1.0,0.0,59746.0,2482.52,3026.05,,67.0,2.78,32.0,1.33,38.0,180.0,7.48,54283,38176.0,1586.26,50858.0,0.299,,,,178.0,88.0,41.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13,,,1.0,7.0,1.0,False,0.0,3.0,"[Jinx, Xin Zhao, Rakan, Rumble, Corki]","[Renekton, Caitlyn, Thresh, Jayce, Camille]"
59,partial,LPL,ThunderTalk Gaming,oe:team:df80f468a3f9a722df056fe9104f052,1444,False,16,8,22,,,,,False,0.33,1.25,False,1.0,2.0,,,,,,,,,1.0,,,False,,,False,0.0,1.0,False,2.0,9.0,False,False,,,0.0,1.0,35129.0,1459.65,3107.16,,82.0,3.41,21.0,0.87,29.0,159.0,6.61,41155,25048.0,1040.78,37638.0,-0.299,,,,115.0,88.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-14,,,-1.0,-7.0,-1.0,True,0.0,1.0,"[Lee Sin, Leona, Ziggs, Gangplank, Twisted Fate]","[Samira, Diana, Jarvan IV, LeBlanc, Akali]"
82,partial,LPL,FunPlus Phoenix,oe:team:33d17f3717f58e12a3da80b377221fb,1893,True,25,12,8,,,,,False,0.38,0.63,False,4.0,1.0,,,,,,,,,4.0,,,False,,,False,2.0,0.0,False,10.0,3.0,False,False,,,3.0,0.0,54264.0,1719.94,2528.49,,100.0,3.17,59.0,1.87,41.0,274.0,8.68,64011,43324.0,1373.19,58619.0,0.149,,,,264.0,162.0,34.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,,,2.0,7.0,3.0,False,0.0,6.0,"[Jinx, Viego, Thresh, Corki, Graves]","[LeBlanc, Twisted Fate, Aphelios, Nautilus, Le..."


### Univariate Analysis

In [310]:
col = 'kpm'
fig = px.histogram(teams, x=col, title=f'Distribution of {col}')
fig.show()
fig.write_html('assets/kpm_dist.html', include_plotlyjs='cdn')

In [311]:
col = 'multikills'
fig = px.histogram(teams, x=col, title=f'Distribution of {col}')
fig.show()
fig.write_html('assets/multikills_dist.html', include_plotlyjs='cdn')

In [312]:
cols = ['doublekills', 'triplekills', 'quadrakills', 'pentakills']
fig = make_subplots(rows=1, cols=len(cols), subplot_titles=cols)
for i, col in enumerate(cols, 1):
    fig.add_trace(go.Histogram(x=df[col], name=col), row=1, col=i)

fig.update_layout(
    title_text="Histograms of Multikills",
    title_x=0.5,
    title_font=dict(size=24),
    showlegend=False,
    margin=dict(t=100),
)
fig.show()
fig.write_html('assets/all_multikills_dist.html', include_plotlyjs='cdn')

### Bivariate Analsis

In [313]:
fig = px.box(teams, x='league', y='kpm', title='kpm grouped by league')
fig.show()
fig.write_html('assets/kpm_by_league.html', include_plotlyjs='cdn')

In [314]:
fig = px.box(teams, x='league', y='multikills', title='multikills grouped by league')
fig.show()
fig.write_html('assets/multikills_by_league.html', include_plotlyjs='cdn')

In [315]:
fig = px.box(teams, x='league', y='objectives captured', title='objectives captured grouped by league')
fig.show()
fig.write_html('assets/objectives_by_league.html', include_plotlyjs='cdn')

### Interesting Aggregates

In [316]:
# calculate the pick rates of each champion across different leagues
teams_copy = teams.explode('picks')
teams_copy['champion'] = teams_copy['picks']
pt = pd.pivot_table(teams_copy, index='picks', columns='league', values='champion', aggfunc='size').fillna(0)
pt = pt.div(pt.sum(axis=0), axis=1) * 100
pt

league,CBLOL,LCK,LCS,LEC,LLA,LPL,PCS,VCS
picks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Aatrox,0.53,0.41,1.57,0.91,0.70,0.39,0.81,0.79
Ahri,3.46,2.91,2.45,2.26,1.93,2.48,3.03,3.27
Akali,0.58,1.48,0.85,1.07,0.91,0.97,0.66,1.68
Akshan,0.04,0.11,0.13,0.25,0.00,0.04,0.00,0.00
Alistar,0.95,0.47,0.85,0.37,1.28,0.85,1.07,1.06
...,...,...,...,...,...,...,...,...
Zeri,2.43,2.23,3.37,2.47,2.35,2.13,2.77,3.04
Ziggs,0.00,0.17,0.03,0.04,0.11,0.20,0.15,0.07
Zilean,0.16,0.21,0.56,0.58,0.05,0.04,0.26,0.07
Zoe,0.58,0.39,0.33,0.45,0.53,0.50,0.70,0.03


In [317]:
pt.idxmax()

league
CBLOL    Nautilus
LCK      Aphelios
LCS          Jinx
LEC      Aphelios
LLA      Aphelios
LPL      Aphelios
PCS      Nautilus
VCS      Nautilus
dtype: object

## Step 3: Assessment of Missingness

In [318]:
s = teams.isna().sum()

pd.set_option('display.max_rows', len(s))
print(s)
pd.reset_option('display.max_rows')

datacompleteness               0
league                         0
teamname                       0
teamid                         0
gamelength                     0
result                         0
assists                        0
teamkills                      0
teamdeaths                     0
doublekills                 1572
triplekills                 1572
quadrakills                 1572
pentakills                  1572
firstblood                     0
kpm                            0
ckpm                           0
firstdragon                    0
dragons                        0
opp_dragons                    0
elementaldrakes             1660
opp_elementaldrakes         1660
infernals                   1572
mountains                   1572
clouds                      1572
oceans                      1572
chemtechs                   1660
hextechs                    1660
dragons (type unknown)         0
elders                      1572
opp_elders                  1572
firstheral

### NMAR Missingness

There do not seem to be any columns that have NMAR missingness mechanisms

### Missingness Dependency

Inspect the missingness of `'doublekills'` depending on `'league'` and `'teamname'`

First let's inspect the missingess of `'doublekills'` depending on `'league'`

In [319]:
def two_tail_pval(observed, values):
    values = np.array(values)
    mean = values.mean()
    std = values.std()
    values = (values - mean) / std
    observed = (observed - mean) / std
    return (np.abs(values) >= observed).mean()
    

In [320]:
def tvd_test_stat(df, group1, group2, col):
    missing_true = df.loc[group1, col].value_counts()
    missing_true = missing_true / missing_true.sum()
    missing_true = missing_true.reindex(df[col].unique(), fill_value=0)
    missing_false = df.loc[group2, col].value_counts()
    missing_false = missing_false / missing_false.sum()
    missing_false = missing_false.reindex(shuffled[col].unique(), fill_value=0)
    tvd = 0.5 * np.abs(missing_true - missing_false).sum()
    return tvd

In [321]:
col = 'league'
n_reps = 500
shuffled = teams.copy()
shuffled['doublekills_misisng'] = shuffled['doublekills'].isna()
group1 = shuffled['doublekills_misisng'] == True
group2 = shuffled['doublekills_misisng'] == False

tvds = []
observed = tvd_test_stat(shuffled, group1, group2, col)

for _ in range(500):
    shuffled[col] = np.random.permutation(shuffled[col])
    tvd = tvd_test_stat(shuffled, group1, group2, col)
    tvds.append(tvd)
    
pval = two_tail_pval(observed, tvds)
pval

0.0

In [322]:
fig = px.histogram(tvds)
fig.add_vline(x=observed, line_width=3, line_dash='dash', line_color='red', annotation_text=f'Observed: {observed}', annotation_position='top right')
fig.update_layout(
    xaxis_title='Total Variation Distance',
    yaxis_title='Density',
    title="'doublekills' Missingness on 'league' Hypothesis Test"
)
fig.update_traces(showlegend=False)
fig.show()
fig.write_html('assets/doublekills_on_league.html', include_plotlyjs='cdn')

Running a permutation test to compare the distributions of `'league'` when `'doublekills'` is missing or not we get a p-value of 0.0. Thus, we can conclude that `'doublekills'` is MAR depending on `'league'`

Now let's inspect the missingness of `'doublekills'` depending on `'teamname'`

In [323]:
def abs_dif_of_means(df, group1, group2, col):
    m1 = df.loc[group1, col].mean()
    m2 = df.loc[group2, col].mean()
    return np.abs(m1 - m2)

def dif_of_means(df, group1, group2, col):
    m1 = df.loc[group1, col].mean()
    m2 = df.loc[group2, col].mean()
    return m1 - m2

In [324]:
col = 'dpm'
n_reps = 500
shuffled = teams.copy()
shuffled['doublekills_misisng'] = shuffled['doublekills'].isna()
group1 = shuffled['doublekills_misisng'] == True
group2 = shuffled['doublekills_misisng'] == False

doms = []
observed = abs_dif_of_means(shuffled, group1, group2, col)

for _ in range(500):
    shuffled[col] = np.random.permutation(shuffled[col])
    dom = abs_dif_of_means(shuffled, group1, group2, col)
    doms.append(dom)
    
pval = two_tail_pval(observed, doms)
pval

0.056

In [325]:
fig = px.histogram(doms)
fig.add_vline(x=observed, line_width=3, line_dash='dash', line_color='red', annotation_text=f'Observed: {observed}', annotation_position='top right')
fig.update_layout(
    xaxis_title='Absolute Difference of Means',
    yaxis_title='Density',
    title="'doublekills' Missingness on 'dpm' Hypothesis Test"
)
fig.update_traces(showlegend=False)
fig.show()
fig.write_html('assets/doublekills_on_dpm.html', include_plotlyjs='cdn')

Running a permutation test to compare the distributions of `'kpm'` when `'doublekills'` is missing or not we got a p-value of 0.434. Thus, we can conclude that the missingness of `'doublekills'` is not dependent on `'kpm'`.

## Step 4: Hypothesis Testing

### Hypothesis 1

Null: The mean kpm (kills per minute) between the LPL league and LCK league is the same

Alternative: The mean kpm (kills per minute) of the LPL league is higher than the LCK league

Test Statistic: difference of means

In [326]:
col = 'kpm'
n_reps = 500
shuffled = teams.copy()
shuffled = shuffled.loc[shuffled['league'].isin(['LPL', 'LCK'])]
group1 = shuffled['league'] == 'LPL'
group2 = shuffled['league'] == 'LCK'

doms = []
observed = dif_of_means(shuffled, group1, group2, col)

for _ in range(500):
    shuffled[col] = np.random.permutation(shuffled[col])
    dom = dif_of_means(shuffled, group1, group2, col)
    doms.append(dom)
    
pval = (np.array(doms) >= observed).mean()
pval

0.0

In [327]:
fig = px.histogram(doms)
fig.add_vline(x=observed, line_width=3, line_dash='dash', line_color='red', annotation_text=f'Observed: {observed}', annotation_position='top right')
fig.update_layout(
    xaxis_title='Difference of Means',
    yaxis_title='Density',
    title="KPM difference between LPL and LCK Hypothesis Test"
)
fig.update_traces(showlegend=False)
fig.show()
fig.write_html('assets/kpm_lck_lpl.html', include_plotlyjs='cdn')

With a p-value of 0, we have evidence to conclude that the mean kpm of the LPL is greater than that of the LCK

### Hypothesis 2

Null: The mean result (winrate) between Red side and Blue side is the same

Alternative: The mean result (winrate) between Red side and Blue side is not the same

Test Statistic: difference of means

In [328]:
col = 'result'
n_reps = 500
shuffled = teams.copy()
group1 = shuffled['red_side'] == 1
group2 = shuffled['red_side'] == 0

doms = []
observed = abs_dif_of_means(shuffled, group1, group2, col)

for _ in range(500):
    shuffled[col] = np.random.permutation(shuffled[col])
    dom = abs_dif_of_means(shuffled, group1, group2, col)
    doms.append(dom)
    
pval = two_tail_pval(observed, doms)
pval

0.0

In [329]:
fig = px.histogram(doms)
fig.add_vline(x=observed, line_width=3, line_dash='dash', line_color='red', annotation_text=f'Observed: {observed}', annotation_position='top right')
fig.update_layout(
    xaxis_title='Absolute Difference of Means',
    yaxis_title='Density',
    title="Winrate Difference Between Red Side and Blue Side Hypothesis Test"
)
fig.update_traces(showlegend=False)
fig.show()
fig.write_html('assets/red_vs_blue.html', include_plotlyjs='cdn')

With a p-value of 0, we have evidence to conclude that the winrate on the Red side is higher than that of the Blue side.

## Step 5: Framing a Prediction Problem

### Question

Can we predict if a team will win/lose the game given all of the information available? (maybe only availabe by 15min like stats and first drag/herald)

- binary classification

- predict `'result'` column

- measure with accuracy

## Step 6: Baseline Model

In [332]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [333]:
df.head()

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,participantid,side,position,playername,playerid,teamname,teamid,champion,ban1,ban2,ban3,ban4,ban5,pick1,pick2,pick3,pick4,pick5,gamelength,result,kills,deaths,assists,teamkills,teamdeaths,doublekills,triplekills,quadrakills,pentakills,firstblood,firstbloodkill,firstbloodassist,firstbloodvictim,team kpm,ckpm,firstdragon,dragons,opp_dragons,elementaldrakes,opp_elementaldrakes,infernals,mountains,clouds,oceans,chemtechs,hextechs,dragons (type unknown),elders,opp_elders,firstherald,heralds,opp_heralds,void_grubs,opp_void_grubs,firstbaron,barons,opp_barons,firsttower,towers,opp_towers,firstmidtower,firsttothreetowers,turretplates,opp_turretplates,inhibitors,opp_inhibitors,damagetochampions,dpm,damageshare,damagetakenperminute,damagemitigatedperminute,wardsplaced,wpm,wardskilled,wcpm,controlwardsbought,visionscore,vspm,totalgold,earnedgold,earned gpm,earnedgoldshare,goldspent,gspd,gpr,total cs,minionkills,monsterkills,monsterkillsownjungle,monsterkillsenemyjungle,cspm,goldat10,xpat10,csat10,opp_goldat10,opp_xpat10,opp_csat10,golddiffat10,xpdiffat10,csdiffat10,killsat10,assistsat10,deathsat10,opp_killsat10,opp_assistsat10,opp_deathsat10,goldat15,xpat15,csat15,opp_goldat15,opp_xpat15,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
0,ESPORTSTMNT01_2690210,complete,,LCKC,2022,Spring,0,2022-01-10 07:44:08,1,12.01,1,Blue,top,Soboro,oe:player:38e0af7278d6769d0c81d7c4b47ac1e,Fredit BRION Challengers,oe:team:68911b3329146587617ab2973106e23,Renekton,Karma,Caitlyn,Syndra,Thresh,Lulu,,,,,,1713,0,2,3,2,9,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.98,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,0.0,0.0,15768.0,552.29,0.28,1072.4,777.79,8.0,0.28,6.0,0.21,5.0,26.0,0.91,10934,7164.0,250.93,0.25,10275.0,,,231.0,220.0,11.0,,,8.09,3228.0,4909.0,89.0,3176.0,4953.0,81.0,52.0,-44.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,5025.0,7560.0,135.0,4634.0,7215.0,121.0,391.0,345.0,14.0,0.0,1.0,0.0,0.0,1.0,0.0
1,ESPORTSTMNT01_2690210,complete,,LCKC,2022,Spring,0,2022-01-10 07:44:08,1,12.01,2,Blue,jng,Raptor,oe:player:637ed20b1e41be1c51bd1a4cb211357,Fredit BRION Challengers,oe:team:68911b3329146587617ab2973106e23,Xin Zhao,Karma,Caitlyn,Syndra,Thresh,Lulu,,,,,,1713,0,2,5,6,9,19,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.32,0.98,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,0.0,1.0,11765.0,412.08,0.21,944.27,650.16,6.0,0.21,18.0,0.63,6.0,48.0,1.68,9138,5368.0,188.02,0.19,8750.0,,,148.0,33.0,115.0,,,5.18,3429.0,3484.0,58.0,2944.0,3052.0,63.0,485.0,432.0,-5.0,1.0,2.0,0.0,0.0,0.0,1.0,5366.0,5320.0,89.0,4825.0,5595.0,100.0,541.0,-275.0,-11.0,2.0,3.0,2.0,0.0,5.0,1.0
2,ESPORTSTMNT01_2690210,complete,,LCKC,2022,Spring,0,2022-01-10 07:44:08,1,12.01,3,Blue,mid,Feisty,oe:player:d1ae0e2f9f3ac1e0e0cdcb86504ca77,Fredit BRION Challengers,oe:team:68911b3329146587617ab2973106e23,LeBlanc,Karma,Caitlyn,Syndra,Thresh,Lulu,,,,,,1713,0,2,2,3,9,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.98,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,0.0,0.0,14258.0,499.4,0.25,581.65,227.78,19.0,0.67,7.0,0.25,7.0,29.0,1.02,9715,5945.0,208.23,0.21,8725.0,,,193.0,177.0,16.0,,,6.76,3283.0,4556.0,81.0,3121.0,4485.0,81.0,162.0,71.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,5118.0,6942.0,120.0,5593.0,6789.0,119.0,-475.0,153.0,1.0,0.0,3.0,0.0,3.0,3.0,2.0
3,ESPORTSTMNT01_2690210,complete,,LCKC,2022,Spring,0,2022-01-10 07:44:08,1,12.01,4,Blue,bot,Gamin,oe:player:998b3e49b01ecc41eacc392477a98cf,Fredit BRION Challengers,oe:team:68911b3329146587617ab2973106e23,Samira,Karma,Caitlyn,Syndra,Thresh,Lulu,,,,,,1713,0,2,4,2,9,19,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.32,0.98,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,0.0,0.0,11106.0,389.0,0.2,463.85,218.88,12.0,0.42,6.0,0.21,4.0,25.0,0.88,10605,6835.0,239.4,0.24,10425.0,,,226.0,208.0,18.0,,,7.92,3600.0,3103.0,78.0,3304.0,2838.0,90.0,296.0,265.0,-12.0,1.0,1.0,0.0,0.0,0.0,0.0,5461.0,4591.0,115.0,6254.0,5934.0,149.0,-793.0,-1343.0,-34.0,2.0,1.0,2.0,3.0,3.0,0.0
4,ESPORTSTMNT01_2690210,complete,,LCKC,2022,Spring,0,2022-01-10 07:44:08,1,12.01,5,Blue,sup,Loopy,oe:player:e9741b3a238723ea6380ef2113fae63,Fredit BRION Challengers,oe:team:68911b3329146587617ab2973106e23,Leona,Karma,Caitlyn,Syndra,Thresh,Lulu,,,,,,1713,0,1,5,6,9,19,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.32,0.98,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,0.0,0.0,3663.0,128.3,0.06,475.03,490.12,29.0,1.02,14.0,0.49,11.0,69.0,2.42,6678,2908.0,101.86,0.1,6395.0,,,42.0,42.0,0.0,,,1.47,2678.0,2161.0,16.0,2150.0,2748.0,15.0,528.0,-587.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,3836.0,3588.0,28.0,3393.0,4085.0,21.0,443.0,-497.0,7.0,1.0,2.0,2.0,0.0,6.0,2.0


In [334]:
features = ['red_side',
            'golddiffat15', 
            'xpdiffat15', 
            'csdiffat15', 
            'killsat15', 
            'assistsat15', 
            'deathsat15'
]
teams_dropped = teams[features + ['result']].dropna()
X_train, X_test, y_train, y_test = train_test_split(teams_dropped.drop(columns=['result']), teams_dropped['result'])
dt = DecisionTreeClassifier(max_depth=2, criterion='entropy')
dt.fit(X_train, y_train)
print(dt.score(X_train, y_train))
dt.score(X_test, y_test)

0.74640522875817


0.7284313725490196

## Step 7: Final Model

In [335]:
%%time
features = ['red_side',
            'killsdif',
            'objectives captured',
            'earned gpm'
]
teams_dropped = teams[features + ['result']].dropna()
X_train, X_test, y_train, y_test = train_test_split(teams_dropped.drop(columns=['result']), teams_dropped['result'])
rfc = RandomForestClassifier()
hyperparameters = {
    'max_depth': np.arange(2, 5, 1),
    'criterion': ['gini'],
    'min_samples_split': np.arange(40, 60, 2)
}
grids = GridSearchCV(
    rfc,
    n_jobs=-1,
    param_grid=hyperparameters,
    cv=5
)
grids.fit(X_train, y_train)
grids.best_params_

CPU times: user 410 ms, sys: 136 ms, total: 546 ms
Wall time: 5.11 s


{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 48}

In [336]:
rfc = RandomForestClassifier(**grids.best_params_)
rfc.fit(X_train, y_train)
print(rfc.score(X_train, y_train))
rfc.score(X_test, y_test)

0.9752300070771408


0.9723991507430998

In [337]:
features = ['red_side',
            'killsdif',
            'objectives captured',
            'earned gpm',
]
best_params = {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 56}
teams_dropped = teams[features + ['result']].dropna()
X_train, X_test, y_train, y_test = train_test_split(teams_dropped.drop(columns=['result']), teams_dropped['result'])
rfc = RandomForestClassifier(**best_params)
rfc.fit(X_train, y_train)
print(rfc.score(X_train, y_train))
rfc.score(X_test, y_test)


0.9733427695211134


0.9723991507430998

## Step 8: Fairness Analysis

In [338]:
from sklearn.metrics import precision_score, recall_score, f1_score

Group 1: Red Side Teams

Group 2: Blue Side Teams

Null: The model is fair. Its F1-score is the same for teams playing Red side and Blue side.

Alternative: The model is unfair. Its F1-score is higher for teams playing on Red side over Blue side.

In [339]:
def test_statistic(df, group_ser, model):
    group1 = df.loc[group_ser == True]
    group2 = df.loc[group_ser == False]
    group1_predict = model.predict(group1.drop(columns=['result']))
    group2_predict = model.predict(group2.drop(columns=['result']))
    group1_score = f1_score(group1_predict, group1['result'])
    group2_score = f1_score(group2_predict, group2['result'])
    return group1_score - group2_score


In [340]:
observed = test_statistic(teams_dropped, teams_dropped['red_side'], rfc)
scores = []
for _ in range(500):
    shuffled = np.random.permutation(teams_dropped['red_side'])
    tstat = test_statistic(teams_dropped, shuffled, rfc)
    scores.append(tstat)
    
pval = (np.array(scores) >= observed).mean()
pval

0.678

In [341]:
fig = px.histogram(scores)
fig.add_vline(x=observed, line_width=3, line_dash='dash', line_color='red', annotation_text=f'Observed: {observed}', annotation_position='top right')
fig.update_layout(
    xaxis_title='F1-Scores',
    yaxis_title='Density',
    title="Red Side and Blue Side Difference of Mean F1-Scores Hypothesis Test"
)
fig.update_traces(showlegend=False)
fig.show()
fig.write_html('assets/red_blue_f1.html', include_plotlyjs='cdn')

We fail to reject the null hypothesis. 