# Predict match outcomes by first 15 minutes

## Content

1. Intro
2. Setup
3. Prepare data
4. Train model & choose param
5. Test model

## Intro

Second part is dedicated to building ML model for Dota2 prediction.

___

## Setup

In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
sklearn.__version__

'1.1.2'

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

___

## Prepare data

In [5]:
df_match = pd.read_csv('../archive/match.csv')

In [6]:
Q3 = np.quantile(df_match['duration'], 0.75)
Q1 = np.quantile(df_match['duration'], 0.25)
IQR = Q3 - Q1
lowerRange = Q1 - 1.5 * IQR
upperRange = Q3 + 1.5 * IQR

In [7]:
dfIqr = df_match[df_match['duration'].between(lowerRange, upperRange)]

In [8]:
df_match_for_analysis = dfIqr[dfIqr['duration'] > 900][['match_id', 'first_blood_time', 'radiant_win']]

In [9]:
df_player_time = pd.read_csv('../archive/player_time.csv')

In [10]:
df_player_time_5_min = df_player_time[df_player_time['times'] < 301]

In [11]:
df_player_time_5_min['gold_t_r'] = df_player_time_5_min['gold_t_1'] + df_player_time_5_min['gold_t_2'] + df_player_time_5_min['gold_t_3'] + df_player_time_5_min['gold_t_4'] + df_player_time_5_min['gold_t_0']
df_player_time_5_min['lh_t_r'] = df_player_time_5_min['lh_t_0'] + df_player_time_5_min['lh_t_1'] + df_player_time_5_min['lh_t_2'] + df_player_time_5_min['lh_t_3'] + df_player_time_5_min['lh_t_4']
df_player_time_5_min['xp_t_r'] = df_player_time_5_min['xp_t_0'] + df_player_time_5_min['xp_t_1'] + df_player_time_5_min['xp_t_2'] + df_player_time_5_min['xp_t_3'] + df_player_time_5_min['xp_t_4']

df_player_time_5_min['gold_t_d'] = df_player_time_5_min['gold_t_128'] + df_player_time_5_min['gold_t_129'] + df_player_time_5_min['gold_t_130'] + df_player_time_5_min['gold_t_131'] + df_player_time_5_min['gold_t_132']
df_player_time_5_min['lh_t_d'] = df_player_time_5_min['lh_t_128'] + df_player_time_5_min['lh_t_129'] + df_player_time_5_min['lh_t_130'] + df_player_time_5_min['lh_t_131'] + df_player_time_5_min['lh_t_132']
df_player_time_5_min['xp_t_d'] = df_player_time_5_min['xp_t_128'] + df_player_time_5_min['xp_t_129'] + df_player_time_5_min['xp_t_130'] + df_player_time_5_min['xp_t_131'] + df_player_time_5_min['xp_t_132']

In [12]:
df_player_time_5_min = df_player_time_5_min[['match_id', 'times', 'gold_t_r', 'lh_t_r','xp_t_r','gold_t_d','lh_t_d','xp_t_d']]

In [13]:
df_player_time_5_min_sum = df_player_time_5_min.groupby("match_id").sum()

In [14]:
df_match_5_min = df_match_for_analysis.merge(df_player_time_5_min_sum, on='match_id')

In [15]:
df_objectives = pd.read_csv('../archive/objectives.csv')
df_objectives_5_min = df_objectives[df_objectives['time'] < 301]

In [16]:
def team_r(row, message):
    if row['subtype'] == message:
        if row['player1'] < 5:
            return 1
        else:
            return 0

In [17]:
def team_d(row, message):
    if row['subtype'] == message:
        if row['player1'] > 4:
            return 1
        else:
            return 0

In [18]:
df_objectives_5_min['roshan_radiant'] = df_objectives_5_min.apply(lambda x: team_r(x, 'CHAT_MESSAGE_ROSHAN_KILL'), axis = 1)
df_objectives_5_min['roshan_dire'] = df_objectives_5_min.apply(lambda x: team_d(x, 'CHAT_MESSAGE_ROSHAN_KILL'), axis = 1)
#
df_objectives_5_min['firstblood_radiant'] = df_objectives_5_min.apply(lambda x: team_r(x, 'CHAT_MESSAGE_FIRSTBLOOD'), axis = 1)
df_objectives_5_min['firstblood_dire'] = df_objectives_5_min.apply(lambda x: team_d(x, 'CHAT_MESSAGE_FIRSTBLOOD'), axis = 1)
#
df_objectives_5_min['tower_radiant'] = df_objectives_5_min.apply(lambda x: team_r(x, 'CHAT_MESSAGE_TOWER_KILL'), axis = 1)
df_objectives_5_min['tower_dire'] = df_objectives_5_min.apply(lambda x: team_d(x, 'CHAT_MESSAGE_TOWER_KILL'), axis = 1)

In [19]:
df_objectives_5_min = df_objectives_5_min[['match_id', 'roshan_radiant', 'roshan_dire', 'firstblood_radiant', 'firstblood_dire', 'tower_radiant', 'tower_dire']]

In [20]:
df_objectives_5_min.fillna(0, inplace=True)

In [21]:
df_objectives_5_min_sum = df_objectives_5_min.groupby('match_id').sum()

In [22]:
df = df_match_5_min.merge(df_objectives_5_min_sum, on='match_id')

In [23]:
df['radiant_win'] = df['radiant_win'].map({True: 1, False: 0})

In [24]:
df.head()

Unnamed: 0,match_id,first_blood_time,radiant_win,times,gold_t_r,lh_t_r,xp_t_r,gold_t_d,lh_t_d,xp_t_d,roshan_radiant,roshan_dire,firstblood_radiant,firstblood_dire,tower_radiant,tower_dire
0,0,1,1,900,17971,114,18471,20637,200,18134,0.0,0.0,1.0,0.0,0.0,0.0
1,1,221,0,900,15743,163,15001,19164,169,17249,0.0,0.0,1.0,0.0,0.0,0.0
2,2,190,0,900,15690,147,17671,13179,100,16446,0.0,0.0,0.0,1.0,0.0,0.0
3,3,40,0,900,14252,88,15816,15398,121,16217,0.0,0.0,0.0,1.0,0.0,0.0
4,4,58,1,900,19958,187,16293,15805,123,14025,0.0,0.0,1.0,0.0,0.0,0.0


___

## Train model

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import time
import datetime

In [26]:
X = df.drop(columns=['radiant_win'])
y = df['radiant_win']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [27]:
kf = KFold(shuffle=True, n_splits = 5, random_state = 666)

In [28]:
for i in range(10, 110, 10):
    start = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=i, max_depth=4, random_state=0)
    score = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=kf, scoring='roc_auc', n_jobs = -1)
    
    print("Time", datetime.datetime.now() - start)
    print("n_estimators", i)
    print("Score", score.mean())
    print("")

Time 0:00:02.420068
n_estimators 10
Score 0.6341349794322302

Time 0:00:02.635402
n_estimators 20
Score 0.6349305146484665

Time 0:00:03.247094
n_estimators 30
Score 0.6350410822316982

Time 0:00:03.986205
n_estimators 40
Score 0.6346840767892783

Time 0:00:04.895952
n_estimators 50
Score 0.634590511689059

Time 0:00:05.730692
n_estimators 60
Score 0.6344901442219351

Time 0:00:06.843534
n_estimators 70
Score 0.6345189368254338

Time 0:00:07.677200
n_estimators 80
Score 0.6342071336504803

Time 0:00:08.594706
n_estimators 90
Score 0.6339317818170874

Time 0:00:09.501513
n_estimators 100
Score 0.6338896873107289



Not the best results. Lets try with __Random Forest__:

In [29]:
score_bag = cross_val_score(estimator=RandomForestClassifier(max_depth=4),
                            X=X_train, y=y_train, cv=kf, scoring='roc_auc', n_jobs = -1)
score_bag.mean()

0.6339277827211219

64% accurasy is not the best result. <br>
The main reason is poor data quality. I think that main reason it is so low - in data we cannot filter hero kills by time.<br>
I would try to get original data source and enrich data from there<br>
__TBD__

___