# Exploratory Data Analysis

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import n_colors
import seaborn as sns

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# Turn off pandas' chained-assignment check for the whole notebook.
# NOTE(review): this silences SettingWithCopyWarning everywhere, so genuine
# chained-assignment mistakes will no longer be reported.
pd.set_option('mode.chained_assignment', None) # default='warn'

# Use seaborn's "whitegrid" style for seaborn/matplotlib figures
sns.set_style("whitegrid")

In [3]:
# Hero feature table (one row per hero)
df_features = pd.read_csv('../data/features.csv')

# Combined match data: df_raw keeps the data exactly as loaded,
# df is the working copy used by the cells below
df_raw = pd.read_csv('../data/combined.csv')
df = df_raw.copy()

### Quality Checks

In [3]:
# Snippet of df: first five matches, to eyeball the columns and value formats
df.head()

Unnamed: 0,match_id,match_seq_num,radiant_win,start_time,duration,avg_mmr,num_mmr,lobby_type,game_mode,avg_rank_tier,num_rank_tier,cluster,hero0_pick,hero1_pick,hero2_pick,hero3_pick,hero4_pick,hero5_pick,hero6_pick,hero7_pick,hero8_pick,hero9_pick,hero0_slot,hero1_slot,hero2_slot,hero3_slot,hero4_slot,hero5_slot,hero6_slot,hero7_slot,hero8_slot,hero9_slot
0,6447015200,5388383445,True,1645660804,2502,3667.0,3.0,7,22,47,6,172,94.0,87.0,40.0,71.0,22.0,128.0,48.0,57.0,4.0,137.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0
1,6447015219,5388383380,False,1645660804,2487,2360.0,1.0,7,22,32,6,236,137.0,136.0,63.0,75.0,12.0,36.0,94.0,76.0,14.0,5.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0
2,6447015314,5388392814,False,1645660809,3611,3992.0,2.0,7,22,52,5,182,137.0,32.0,7.0,79.0,74.0,1.0,47.0,22.0,14.0,8.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0
3,6447015315,5388383693,True,1645660809,2511,2698.0,1.0,7,22,33,5,184,113.0,23.0,50.0,59.0,119.0,11.0,84.0,137.0,19.0,53.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0
4,6447015300,5388382266,False,1645660809,2431,3300.0,1.0,0,4,23,3,273,25.0,30.0,70.0,4.0,7.0,88.0,1.0,68.0,2.0,96.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0


In [38]:
# Snippet of df_features: first five heroes and their attribute columns
# NOTE(review): primary_attribute shows the spelling 'Agilitiy' in these rows;
# confirm the exact label before filtering or grouping on it.
df_features.head()

Unnamed: 0,hero_id,hero,attack_type,primary_attribute,strength,strength_rate,agility,agility_rate,intelligence,intelligence_rate,movement_speed,armor,damage_min,damage_max,range,attack_speed,base_attack_time,attack_point,attack_backswing,vision_day,vision_night,turn_rate,collision_size,health_regen
0,102,Abaddon,Melee,Strength,22,2.8,23,1.5,18,2.0,325,2.83,50,60,150,120,1.7,0.56,0.41,1800,800,0.6,24,3.2
1,73,Alchemist,Melee,Strength,25,2.9,22,1.5,25,1.8,305,2.67,52,58,150,100,1.7,0.35,0.65,1800,800,0.6,24,2.75
2,68,Ancient Apparition,Ranged,Intelligence,20,1.9,20,2.2,23,3.4,285,2.33,44,54,675,100,1.7,0.45,0.3,1800,800,0.6,24,2.25
3,1,Anti-Mage,Melee,Agilitiy,23,1.6,24,2.8,12,1.8,310,4.0,53,57,150,100,1.4,0.3,0.6,1800,800,0.6,24,2.55
4,113,Arc Warden,Ranged,Agilitiy,22,2.6,20,2.5,24,2.6,285,2.33,47,57,625,100,1.7,0.3,0.7,1800,800,0.7,24,2.45


In [41]:
# Count nulls per feature column and show only the columns that have any
missing = df_features.isna().sum()
missing.loc[missing.ne(0)]

attack_backswing    2
dtype: int64

In [42]:
# Check max, min in each feature (summary statistics of the numeric columns)
df_features.describe()

Unnamed: 0,hero_id,strength,strength_rate,agility,agility_rate,intelligence,intelligence_rate,movement_speed,armor,damage_min,damage_max,range,attack_speed,base_attack_time,attack_point,attack_backswing,vision_day,vision_night,turn_rate,collision_size,health_regen
count,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,121.0,123.0,123.0,123.0,123.0,123.0
mean,63.455285,21.398374,2.722764,18.170732,2.129268,19.878049,2.429268,300.853659,3.520244,49.837398,56.463415,347.682927,101.666667,1.697561,0.403659,0.570603,1790.243902,826.01626,0.645528,23.219512,2.670325
std,37.024859,2.949687,0.608924,4.431089,0.791422,3.878451,0.873258,14.580294,1.541329,7.441988,7.941686,201.092602,6.162641,0.079407,0.097604,0.232533,91.807563,142.482325,0.091662,3.460637,0.671823
min,1.0,16.0,1.5,0.0,0.0,12.0,1.0,275.0,0.0,33.0,36.0,150.0,90.0,1.4,0.17,0.0,800.0,800.0,0.6,8.0,1.85
25%,32.5,19.0,2.3,15.0,1.6,17.0,1.75,290.0,2.5,45.0,52.0,150.0,100.0,1.7,0.3,0.44,1800.0,800.0,0.6,24.0,2.25
50%,63.0,21.0,2.6,18.0,2.0,20.0,2.1,300.0,3.33,49.0,57.0,330.0,100.0,1.7,0.4,0.54,1800.0,800.0,0.6,24.0,2.5
75%,93.5,23.0,3.1,22.0,2.6,23.0,3.2,310.0,4.33,53.0,60.0,550.0,100.0,1.7,0.5,0.7,1800.0,800.0,0.6,24.0,2.85
max,137.0,30.0,4.6,34.0,4.4,30.0,5.2,330.0,8.67,87.0,95.0,700.0,125.0,2.0,0.65,1.5,1800.0,1800.0,0.9,24.0,5.55


In [4]:
# A complete match has slots 0-4 and 128-132, so the ten slot columns
# (hero0_slot to hero9_slot) should add up to 0+1+2+3+4+128+129+130+131+132 = 660
slot_columns = [f'hero{slot}_slot' for slot in range(10)]
df['hero_slot_sum'] = df[slot_columns].sum(axis=1)
slot_sum_counts = df['hero_slot_sum'].value_counts()
print('Counts of total of hero slot values:')
print(slot_sum_counts)

Counts of total of hero slot values:
660.0    5600742
656.0          4
528.0          2
129.0          1
524.0          1
789.0          1
128.0          1
Name: hero_slot_sum, dtype: int64


In [5]:
# Check if hero slots 0-4 are always 0-4 and hero slots 5-9 are always 128-132
# Rows: expected slot values; columns: hero{i}_slot; cells: number of matches.
expected_slots = [0,1,2,3,4,128,129,130,131,132]
slot_counts = {
    f'hero{i}_slot': df[f'hero{i}_slot'].value_counts()
    for i in range(0,10)
}
# Building the frame from a dict keeps the column names independent of how
# value_counts names its result, and reindexing keeps every expected slot
# value as a row (with count 0) even if one column never takes that value.
results = (
    pd.DataFrame(slot_counts)
    .reindex(expected_slots)
    .fillna(0)
    .astype(int)
)
print('Hero slot values:')
display(results)

Hero slot values:


Unnamed: 0,hero0_slot,hero1_slot,hero2_slot,hero3_slot,hero4_slot,hero5_slot,hero6_slot,hero7_slot,hero8_slot,hero9_slot
0.0,5599799,59,46,44,29,622,52,43,42,39
1.0,33,5599796,61,41,42,27,620,51,41,39
2.0,40,29,5599795,65,40,42,27,620,51,41
3.0,45,39,29,5599795,66,40,42,26,618,50
4.0,52,45,39,29,5599789,66,38,42,26,618
128.0,619,60,46,39,35,5599789,59,38,41,26
129.0,27,619,50,45,39,35,5599798,58,38,41
130.0,41,26,619,50,44,39,35,5599800,58,38
131.0,38,41,26,618,50,42,39,35,5599803,58
132.0,58,38,41,26,618,50,42,39,34,5599801


In [6]:
# Check if all heroes picked are valid ids:
# for each pick column, list the picked values that are not a known hero_id
print('Invalid or no hero picks: (empty set means all valid)')
ids = df_features['hero_id']
valid_ids = set(ids)
for i in range(0,10):
    # dropna=False so a missing pick would also show up as a value
    picks = df[f'hero{i}_pick'].value_counts(dropna=False).index
    diff = set(picks) - valid_ids
    print(f'hero{i}_pick: {diff}')

Invalid or no hero picks: (empty set means all valid)
hero0_pick: set()
hero1_pick: set()
hero2_pick: {0.0}
hero3_pick: {0.0}
hero4_pick: {0.0}
hero5_pick: {0.0}
hero6_pick: {0.0}
hero7_pick: {0.0}
hero8_pick: {0.0}
hero9_pick: {0.0}


In [7]:
# Count number of times hero id 0 picked, per pick column
for i in range(0,10):
    is_zero_pick = df[f'hero{i}_pick'].eq(0)
    count = is_zero_pick.sum()
    print(f'hero{i}_pick id=0 count: {count}')

hero0_pick id=0 count: 0
hero1_pick id=0 count: 0
hero2_pick id=0 count: 1
hero3_pick id=0 count: 4
hero4_pick id=0 count: 5
hero5_pick id=0 count: 4
hero6_pick id=0 count: 3
hero7_pick id=0 count: 2
hero8_pick id=0 count: 4
hero9_pick id=0 count: 12


### Insights

In [48]:
# Number of games per hero
print('Picks by hero:')
ids = df_features['hero_id']
pick_columns = [f'hero{i}_pick' for i in range(0,10)]

# One column of pick counts per player slot, one row per known hero id.
# Building the frame from a dict and reindexing on the hero ids (instead of
# chaining inner merges) keeps heroes that were never picked in some slot
# (count 0) and does not depend on how value_counts names its result.
results = (
    pd.DataFrame({col: df[col].value_counts() for col in pick_columns})
    .reindex(ids)
    .fillna(0)
    .astype(int)
)

# Create total column, summing all player slots
results['hero_total_picks'] = results.sum(axis=1)

# Merging the hero names into the df, sorting from most commonly played to least
df_features_names = df_features[['hero_id','hero']].set_index('hero_id')
results = pd.merge(results, df_features_names, left_index=True, right_index=True)
results = results.sort_values(by='hero_total_picks', ascending=False)

# Creating hero picks percentage columns
# Number of matches is estimated as (total counted picks) / (10 picks per match)
matches_count = np.round(results['hero_total_picks'].sum()/10,0)
results['hero_total_picks_percent'] = (results['hero_total_picks']/matches_count*100).round(2)

display(results)

# Plot
n = 10 # plot top and bottom n heroes
# results is sorted by hero_total_picks in descending order, so head(n) is
# the n most-picked heroes and tail(n) the n least-picked ones. tail(n)
# includes the very last row, which the previous index arithmetic
# (np.arange(-n-1,-1)) skipped.
top_and_bottom = pd.concat([results.head(n), results.tail(n)])
fig = px.bar(
    top_and_bottom,
    x='hero',
    y='hero_total_picks_percent',
    template='none',
    labels={
        'hero': 'Hero',
        'hero_total_picks_percent': 'Picked in % of matches'
    },
    height=400
    )
fig.update_yaxes(tick0=0, dtick=5)
fig.update_xaxes(tickangle=45)
# Extra bottom margin so the rotated hero names are not cut off
fig.update_layout(margin = dict(b = 160))
fig.show()
fig.write_image('../images/hero_pick_rate.png', scale=5)

Picks by hero:


Unnamed: 0,hero0_pick,hero1_pick,hero2_pick,hero3_pick,hero4_pick,hero5_pick,hero6_pick,hero7_pick,hero8_pick,hero9_pick,hero_total_picks,hero,hero_total_picks_percent
14.0,152270,148290,147355,149102,150063,150143,145640,144900,146222,146784,1480769,Pudge,26.44
26.0,109841,113640,114259,113268,116491,110198,114936,114297,113912,117094,1137936,Lion,20.32
35.0,117137,110847,112527,110734,114857,115690,109188,110948,109458,113602,1124988,Sniper,20.09
84.0,105405,110024,109402,109369,113298,106452,111527,110913,110805,115067,1102262,Ogre Magi,19.68
8.0,107191,106612,105812,106901,103837,108238,107113,106188,106861,103051,1061804,Juggernaut,18.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38.0,7220,7275,7160,7262,7125,7021,7128,7025,7112,6951,71279,Beastmaster,1.27
66.0,6972,7182,7270,7121,7045,6877,7214,7309,7178,6964,71132,Chen,1.27
103.0,6642,6956,6694,6879,6806,6493,6762,6782,6755,6759,67528,Elder Titan,1.21
61.0,6007,5736,5821,5742,5623,5867,5725,5759,5729,5579,57588,Broodmother,1.03


In [17]:
# Re-inspect df after the quality checks (it now includes hero_slot_sum)
df.head()

Unnamed: 0,match_id,match_seq_num,radiant_win,start_time,duration,avg_mmr,num_mmr,lobby_type,game_mode,avg_rank_tier,num_rank_tier,cluster,hero0_pick,hero1_pick,hero2_pick,hero3_pick,hero4_pick,hero5_pick,hero6_pick,hero7_pick,hero8_pick,hero9_pick,hero0_slot,hero1_slot,hero2_slot,hero3_slot,hero4_slot,hero5_slot,hero6_slot,hero7_slot,hero8_slot,hero9_slot,hero_slot_sum
0,6447015200,5388383445,True,1645660804,2502,3667.0,3.0,7,22,47,6,172,94.0,87.0,40.0,71.0,22.0,128.0,48.0,57.0,4.0,137.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0,660.0
1,6447015219,5388383380,False,1645660804,2487,2360.0,1.0,7,22,32,6,236,137.0,136.0,63.0,75.0,12.0,36.0,94.0,76.0,14.0,5.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0,660.0
2,6447015314,5388392814,False,1645660809,3611,3992.0,2.0,7,22,52,5,182,137.0,32.0,7.0,79.0,74.0,1.0,47.0,22.0,14.0,8.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0,660.0
3,6447015315,5388383693,True,1645660809,2511,2698.0,1.0,7,22,33,5,184,113.0,23.0,50.0,59.0,119.0,11.0,84.0,137.0,19.0,53.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0,660.0
4,6447015300,5388382266,False,1645660809,2431,3300.0,1.0,0,4,23,3,273,25.0,30.0,70.0,4.0,7.0,88.0,1.0,68.0,2.0,96.0,0.0,1.0,2.0,3.0,4.0,128.0,129.0,130.0,131.0,132.0,660.0


In [18]:
# Wins by hero
# Player slots for radiant and dire teams
radiant_slots = [0,1,2,3,4]
dire_slots = [128,129,130,131,132]

pick_columns = [f'hero{i}_pick' for i in range(0,10)]
slot_columns = [f'hero{i}_slot' for i in range(0,10)]


def _winning_picks(matches, winning_slots):
    """Return the hero ids picked by the winning side, one row per pick.

    matches: rows of df that were all won by the same team.
    winning_slots: player-slot values that belong to that team.

    Returns a long-format frame with columns 'hero_picks' (name of the
    source pick column) and 'hero_id'; rows whose id is 0 are removed.
    Works on new frames only, so nothing is assigned into a filtered
    slice of df.
    """
    # True where the player recorded in hero{i}_slot was on the winning team
    on_winning_team = matches[slot_columns].isin(winning_slots).to_numpy()
    # pick * True keeps the hero id, pick * False turns it into 0
    picks = matches[pick_columns] * on_winning_team
    # Melt so all hero ids are in a single column
    picks = picks.melt(var_name='hero_picks', value_name='hero_id')
    # Drop the zeroed (losing-side) entries
    return picks[picks['hero_id'] != 0]


# Winning picks for matches won by radiant and by dire respectively
df_radiant_win = _winning_picks(df[df['radiant_win']==True], radiant_slots)
df_dire_win = _winning_picks(df[df['radiant_win']==False], dire_slots)

# Number of wins per hero id. Naming the Series explicitly (instead of
# renaming a column called 'hero_id') gives the same column name regardless
# of how the installed pandas names the value_counts result.
radiant_wins = df_radiant_win['hero_id'].value_counts().sort_index().rename('radiant_wins').to_frame()
dire_wins = df_dire_win['hero_id'].value_counts().sort_index().rename('dire_wins').to_frame()

# Merge dataframes and calculate total wins per hero
total_wins = pd.merge(radiant_wins, dire_wins, left_index=True, right_index=True)
total_wins['total_wins'] = total_wins['radiant_wins'] + total_wins['dire_wins']

# Add hero names to dataframe
df_features_names = df_features[['hero_id','hero']].set_index('hero_id')
total_wins = pd.merge(total_wins, df_features_names, left_index=True, right_index=True)

In [None]:
# Preview wins per hero (radiant, dire, and combined)
total_wins.head()

Unnamed: 0,radiant_wins,dire_wins,total_wins,hero
1.0,188388,192001,380389,Anti-Mage
2.0,223459,224713,448172,Axe
3.0,50787,51978,102765,Bane
4.0,67278,66930,134208,Bloodseeker
5.0,141641,141392,283033,Crystal Maiden


In [30]:
# Radiant / Dire win share
total_wins_sum = df['radiant_win'].value_counts()
# The counts are labelled by the boolean values of radiant_win, so look them
# up with True/False rather than the integers 1/0 (which only work if the
# index happens to treat 1 as True, or by position). Default to 0 if one
# side never won.
total_wins_radiant = total_wins_sum.get(True, 0)
total_wins_dire = total_wins_sum.get(False, 0)
total_win_count = total_wins_radiant + total_wins_dire
proportion_wins_radiant = np.round(total_wins_radiant/total_win_count*100,2)
proportion_wins_dire = np.round(total_wins_dire/total_win_count*100,2)

print(f'Total/Proportion of games won (Radiant): {total_wins_radiant} / {proportion_wins_radiant}%')
print(f'Total/Proportion of games won (Dire): {total_wins_dire} / {proportion_wins_dire}%')

Total/Proportion of games won (Radiant): 2782548 / 49.68%
Total/Proportion of games won (Dire): 2818204 / 50.32%


In [None]:
# Win percentage by hero (divide wins by number of matches by hero)
# Stack the ten pick columns into one hero_id column and count appearances
df_picks_only = df[[f'hero{i}_pick' for i in range(0,10)]]
df_picks_only = df_picks_only.melt(var_name='hero_picks', value_name='hero_id')
# Name the counts explicitly so the column is always 'matches_played',
# regardless of how the installed pandas names the value_counts result
matches_played = (
    df_picks_only['hero_id']
    .value_counts()
    .sort_index()
    .rename('matches_played')
    .to_frame()
)
# Inner join on hero id: only heroes present in total_wins are kept
win_percentage = pd.merge(total_wins, matches_played, left_index=True, right_index=True)
win_percentage['win_percentage'] = (win_percentage['total_wins']/win_percentage['matches_played']*100).round(2)
win_percentage.sort_values(by='win_percentage', ascending=False)

Unnamed: 0,radiant_wins,dire_wins,total_wins,hero,matches_played,win_percentage
59.0,182849,179798,362647,Huskar,661363,54.83
82.0,34393,33154,67547,Meepo,124658,54.19
85.0,163056,165613,328669,Undying,606849,54.16
16.0,158575,161527,320102,Sand King,593430,53.94
57.0,70399,72115,142514,Omniknight,264447,53.89
...,...,...,...,...,...,...
106.0,115839,117932,233771,Ember Spirit,516301,45.28
18.0,51263,53169,104432,Sven,232553,44.91
65.0,31768,32510,64278,Batrider,144422,44.51
61.0,13027,12562,25589,Broodmother,57588,44.43


In [31]:
# Number of players with skill defined (MMR)
print('Number of players with skill defined (MMR):')
# dropna=False keeps matches where num_mmr is missing as their own row
num_mmr_counts = df['num_mmr'].value_counts(dropna=False)
total_mmr_counts = num_mmr_counts.sum()

# Name the counts column explicitly so it is 'num_mmr' regardless of how the
# installed pandas names the value_counts result
df_mmr_counts = num_mmr_counts.rename('num_mmr').to_frame()
df_mmr_counts['num_mmr_percent'] = (df_mmr_counts['num_mmr']/total_mmr_counts*100).round(3)
df_mmr_counts


Number of players with skill defined (MMR):


Unnamed: 0,num_mmr,num_mmr_percent
,1754285,31.322
1.0,1667086,29.765
2.0,1106377,19.754
3.0,605001,10.802
4.0,289189,5.163
5.0,119583,2.135
6.0,42618,0.761
7.0,12675,0.226
8.0,3198,0.057
9.0,642,0.011


In [None]:
# Distribution of player skill (MMR)
# Bin count is derived from the observed MMR range and the target bin width
bin_width= 50
mmr_span = df["avg_mmr"].max() - df["avg_mmr"].min()
nbins = int(np.ceil(mmr_span / bin_width))
fig = px.histogram(df, x="avg_mmr", nbins=nbins, template='none', height=300)
fig.update_layout(xaxis_title='Match-average MMR', yaxis_title="Matches")
fig.show()
# fig.write_image('../images/mmr_dist.svg')
fig.write_image('../images/mmr_dist.png', scale=5)

In [9]:
# Summary statistics of match-average MMR (count covers non-null values only)
df["avg_mmr"].describe()

count    3.846467e+06
mean     2.944129e+03
std      9.710240e+02
min      1.000000e+00
25%      2.339000e+03
50%      3.014000e+03
75%      3.590000e+03
max      1.002100e+04
Name: avg_mmr, dtype: float64

In [None]:
# Distribution of average MMR for number of players with MMR
# colors: gradient with one entry per ranked-player count (1..10)
# colors2: single colour used for the "All matches" trace
colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', 10, colortype='rgb')
colors2 = n_colors('rgb(214, 214, 24)', 'rgb(214, 214, 24)', 2, colortype='rgb')
fig = go.Figure()

# First trace: avg_mmr over all matches
data = df['avg_mmr']
fig.add_trace(go.Violin(x=data.values, line_color=colors2[0], name='All matches'))

# One trace per number of players with MMR defined
for players in range(1,11):
    data = df.loc[df['num_mmr']==players, 'avg_mmr']
    fig.add_trace(go.Violin(x=data.values, line_color=colors[players-1], name=f'{players} ranked'))

fig.update_traces(meanline_visible=True, orientation='h', side='positive', width=3, points=False)
fig.update_layout(template='none', xaxis_showgrid=False, xaxis_zeroline=False, xaxis_title="Match-average MMR")
fig.show()
fig.write_image('../images/mmr_dist_vs_ranked_players.svg')
fig.write_image('../images/mmr_dist_vs_ranked_players.png', scale=5)

In [None]:
# Distribution of match lengths
# duration is divided by 60 to express it in minutes
duration_mins = df['duration'].values/60
df_plot = pd.DataFrame({'duration_mins': duration_mins})
bin_width = 1
duration_span = df_plot["duration_mins"].max() - df_plot["duration_mins"].min()
nbins = int(np.ceil(duration_span / bin_width))
fig = px.histogram(df_plot, x="duration_mins", nbins=nbins, template='none', height=300)
fig.update_layout(xaxis_title='Match duration (mins)', yaxis_title="Matches")
# Only the 0-90 minute range is shown on the x axis
fig.update_xaxes(range=[0, 90])
fig.show()
fig.write_image('../images/duration_dist.png', scale=5)

In [36]:
# Summary statistics of match duration in minutes
df_plot["duration_mins"].describe()

count    5.600752e+06
mean     3.926928e+01
std      9.189535e+00
min      6.016667e+00
25%      3.293333e+01
50%      3.833333e+01
75%      4.451667e+01
max      1.849000e+02
Name: duration_mins, dtype: float64

In [41]:
# How many matches fall outside the 0-90 minute range, and their share of all matches
is_over_90 = df_plot["duration_mins"] > 90
greater_than_90 = int(is_over_90.sum())
proportion_greater_than_90 = np.round(greater_than_90/len(df)*100,4)
print(f'Number of matches with duration >90mins: {greater_than_90} ({proportion_greater_than_90}% of total matches)')

Number of matches with duration >90mins: 551 (0.0098% of total matches)


In [17]:
# Game modes: number of matches and share of all matches per game_mode id
# The index and the counts are named explicitly before reset_index, so the
# resulting columns are always 'game_mode' and 'matches' regardless of how
# the installed pandas names the value_counts result.
df_game_modes = (
    df['game_mode']
    .value_counts()
    .sort_index()
    .rename_axis('game_mode')
    .rename('matches')
    .reset_index()
)
total = df_game_modes['matches'].sum()
df_game_modes['proportion'] = (df_game_modes['matches']/total*100).round(3)
df_game_modes

Unnamed: 0,game_mode,matches,proportion
0,1,40,0.001
1,2,8000,0.143
2,3,318282,5.683
3,4,266571,4.76
4,5,9934,0.177
5,16,367,0.007
6,22,4997558,89.23
