In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', None) # default='warn'

In [3]:
# Load combined data
df_raw = pd.read_csv('../data/combined.csv')
df = df_raw.copy()

# Load hero feature data
df_features = pd.read_csv('../data/features.csv')

### Standard filter

In [11]:
# All slots add up to 660
df['slot_total'] = df[[f'hero{i}_slot' for i in range(0,10)]].sum(axis=1)
filt_1 = (df['slot_total']==660).values
# print(len(filt_1[filt_1==True]))
# print(len(filt_1[filt_1==False]))

# Valid hero ids (from exploratory, we know only invalid choice is 0)
filt_2 = [True for i in range(0,len(df))]
for i in range(0,10):
    filt_2 = filt_2 & (df[f'hero{i}_pick']!=0).values
# print(len(filt_2[filt_2==True]))
# print(len(filt_2[filt_2==False]))

# Match duration from 26-56 minutes
min_duration = 26*60 # minutes to seconds
max_duration = 56*60 # minutes to seconds
filt_3 = (df['duration']>=min_duration).values & (df['duration']<=max_duration).values
# print(len(filt_3[filt_3==True]))
# print(len(filt_3[filt_3==False]))

filt_std = filt_1 & filt_2 & filt_3 # standard filter is all filters &'d
print(f'Matches kept: {len(filt_std[filt_std==True])}')
print(f'Matches removed: {len(filt_std[filt_std==False])}')

# df_filt_std = pd.DataFrame(filt_std, columns=['filter_standard']) # convert standard filter to df
# filter_standard.to_csv('../data/filter.csv', index=False)

Matches kept: 5078155
Matches removed: 522597


### MMR filter

In [12]:
# Matches with defined MMRs
filt_4_0 = ~(df['avg_mmr'].isna()).values

# MMR groups
filt_4_1 = (df['avg_mmr']>=1).values & (df['avg_mmr']<1000).values
filt_4_2 = (df['avg_mmr']>=1000).values & (df['avg_mmr']<2000).values
filt_4_3 = (df['avg_mmr']>=2000).values & (df['avg_mmr']<3000).values
filt_4_4 = (df['avg_mmr']>=3000).values & (df['avg_mmr']<4000).values
filt_4_5 = (df['avg_mmr']>=4000).values & (df['avg_mmr']<5000).values
filt_4_6 = (df['avg_mmr']>=5000).values

# Combining MMR groups with standard filters
filt_4_0_std = filt_4_0 & filt_std
filt_4_1_std = filt_4_1 & filt_std
filt_4_2_std = filt_4_2 & filt_std
filt_4_3_std = filt_4_3 & filt_std
filt_4_4_std = filt_4_4 & filt_std
filt_4_5_std = filt_4_5 & filt_std
filt_4_6_std = filt_4_6 & filt_std

# Total match stats
total_matches = len(df)
total_matches_std = len(filt_std[filt_std==True])
total_matches_std_with_mmr = len(filt_4_0_std[filt_4_0_std==True])
percentage_matches_with_mmr = np.round(total_matches_std_with_mmr/total_matches_std*100,2)

print(f'Total matches: {total_matches}')
print(f'Total matches std: {total_matches_std}')
print(f'Total matches std with MMR: {total_matches_std_with_mmr} ({percentage_matches_with_mmr}%)\n')

# Number of matches after standard filter and MMR filters
num_matches_4_1 = len(filt_4_1_std[filt_4_1_std==True])
num_matches_4_2 = len(filt_4_2_std[filt_4_2_std==True])
num_matches_4_3 = len(filt_4_3_std[filt_4_3_std==True])
num_matches_4_4 = len(filt_4_4_std[filt_4_4_std==True])
num_matches_4_5 = len(filt_4_5_std[filt_4_5_std==True])
num_matches_4_6 = len(filt_4_6_std[filt_4_6_std==True])

print(f'MMR 1-999: {num_matches_4_1} ({np.round(num_matches_4_1/total_matches_std*100,2)}% total)  ({np.round(num_matches_4_1/total_matches_std_with_mmr*100,2)}% with MMR)')
print(f'MMR 1000-1999: {num_matches_4_2} ({np.round(num_matches_4_2/total_matches_std*100,2)}% total)  ({np.round(num_matches_4_2/total_matches_std_with_mmr*100,2)}% with MMR)')
print(f'MMR 2000-2999: {num_matches_4_3} ({np.round(num_matches_4_3/total_matches_std*100,2)}% total)  ({np.round(num_matches_4_3/total_matches_std_with_mmr*100,2)}% with MMR)')
print(f'MMR 3000-3999: {num_matches_4_4} ({np.round(num_matches_4_4/total_matches_std*100,2)}% total)  ({np.round(num_matches_4_4/total_matches_std_with_mmr*100,2)}% with MMR)')
print(f'MMR 4000-4999: {num_matches_4_5} ({np.round(num_matches_4_5/total_matches_std*100,2)}% total)  ({np.round(num_matches_4_5/total_matches_std_with_mmr*100,2)}% with MMR)')
print(f'MMR 5000+: {num_matches_4_6} ({np.round(num_matches_4_6/total_matches_std*100,2)}% total)  ({np.round(num_matches_4_6/total_matches_std_with_mmr*100,2)}% with MMR)')

Total matches: 5600752
Total matches std: 5078155
Total matches std with MMR: 3488678 (68.7%)

MMR 1-999: 117735 (2.32% total)  (3.37% with MMR)
MMR 1000-1999: 450537 (8.87% total)  (12.91% with MMR)
MMR 2000-2999: 1158591 (22.82% total)  (33.21% with MMR)
MMR 3000-3999: 1332106 (26.23% total)  (38.18% with MMR)
MMR 4000-4999: 385337 (7.59% total)  (11.05% with MMR)
MMR 5000+: 44372 (0.87% total)  (1.27% with MMR)


### Duration filter

In [15]:
# Matches with defined durations
filt_5_0 = ~(df['duration'].isna()).values

# Duration groups
filt_5_1 = (df['duration']>=(26*60)).values & (df['duration']<(31*60)).values
filt_5_2 = (df['duration']>=(31*60)).values & (df['duration']<(36*60)).values
filt_5_3 = (df['duration']>=(36*60)).values & (df['duration']<(41*60)).values
filt_5_4 = (df['duration']>=(41*60)).values & (df['duration']<(46*60)).values
filt_5_5 = (df['duration']>=(46*60)).values & (df['duration']<(51*60)).values
filt_5_6 = (df['duration']>=(51*60)).values & (df['duration']<(56*60)).values

# Combining MMR groups with standard filters
filt_5_0_std = filt_5_0 & filt_std
filt_5_1_std = filt_5_1 & filt_std
filt_5_2_std = filt_5_2 & filt_std
filt_5_3_std = filt_5_3 & filt_std
filt_5_4_std = filt_5_4 & filt_std
filt_5_5_std = filt_5_5 & filt_std
filt_5_6_std = filt_5_6 & filt_std

# Total match stats
total_matches = len(df)
total_matches_std = len(filt_std[filt_std==True])
total_matches_std_with_duration = len(filt_5_0_std[filt_5_0_std==True])
percentage_matches_with_duration = np.round(total_matches_std_with_duration/total_matches_std*100,2)

print(f'Total matches: {total_matches}')
print(f'Total matches std: {total_matches_std}')
print(f'Total matches std with duration: {total_matches_std_with_duration} ({percentage_matches_with_duration}%)\n')

# Number of matches after standard filter and MMR filters
num_matches_5_1 = len(filt_5_1_std[filt_5_1_std==True])
num_matches_5_2 = len(filt_5_2_std[filt_5_2_std==True])
num_matches_5_3 = len(filt_5_3_std[filt_5_3_std==True])
num_matches_5_4 = len(filt_5_4_std[filt_5_4_std==True])
num_matches_5_5 = len(filt_5_5_std[filt_5_5_std==True])
num_matches_5_6 = len(filt_5_6_std[filt_5_6_std==True])

print(f'Duration 26-31: {num_matches_5_1} ({np.round(num_matches_5_1/total_matches_std*100,2)}% total)  ({np.round(num_matches_5_1/total_matches_std_with_duration*100,2)}% with duration)')
print(f'Duration 31-36: {num_matches_5_2} ({np.round(num_matches_5_2/total_matches_std*100,2)}% total)  ({np.round(num_matches_5_2/total_matches_std_with_duration*100,2)}% with duration)')
print(f'Duration 36-41: {num_matches_5_3} ({np.round(num_matches_5_3/total_matches_std*100,2)}% total)  ({np.round(num_matches_5_3/total_matches_std_with_duration*100,2)}% with duration)')
print(f'Duration 41-46: {num_matches_5_4} ({np.round(num_matches_5_4/total_matches_std*100,2)}% total)  ({np.round(num_matches_5_4/total_matches_std_with_duration*100,2)}% with duration)')
print(f'Duration 46-51: {num_matches_5_5} ({np.round(num_matches_5_5/total_matches_std*100,2)}% total)  ({np.round(num_matches_5_5/total_matches_std_with_duration*100,2)}% with duration)')
print(f'Duration 51-56: {num_matches_5_6} ({np.round(num_matches_5_6/total_matches_std*100,2)}% total)  ({np.round(num_matches_5_6/total_matches_std_with_duration*100,2)}% with duration)')

Total matches: 5600752
Total matches std: 5078155
Total matches std with duration: 5078155 (100.0%)

Duration 26-31: 654790 (12.89% total)  (12.89% with duration)
Duration 31-36: 1254998 (24.71% total)  (24.71% with duration)
Duration 36-41: 1231033 (24.24% total)  (24.24% with duration)
Duration 41-46: 1036095 (20.4% total)  (20.4% with duration)
Duration 46-51: 573267 (11.29% total)  (11.29% with duration)
Duration 51-56: 327147 (6.44% total)  (6.44% with duration)


### Write filters to file

In [None]:
# Write filters to csv (filters not 'and'ed with standard)
df_mmr_duration_filt = pd.DataFrame(data=dict(
    filt_std = filt_std,
    filt_mmr_1 = filt_4_1,
    filt_mmr_2 = filt_4_2,
    filt_mmr_3 = filt_4_3,
    filt_mmr_4 = filt_4_4,
    filt_mmr_5 = filt_4_5,
    filt_mmr_6 = filt_4_6,
    filt_duration_1 = filt_5_1,
    filt_duration_2 = filt_5_2,
    filt_duration_3 = filt_5_3,
    filt_duration_4 = filt_5_4,
    filt_duration_5 = filt_5_5,
    filt_duration_6 = filt_5_6,
))
df_mmr_duration_filt.to_csv('../models/filters.csv', index=False)