# Real-time Dota predictions

### Imports

In [74]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, plot_roc_curve, roc_auc_score

# interactive viz
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from scipy.stats import shapiro
from statsmodels.graphics.gofplots import qqplot

from sklearn.preprocessing import PowerTransformer

%matplotlib inline

In [5]:
# Reset matplotlib to its built-in 'default' style sheet for the figures drawn below.
plt.style.use('default')

In [9]:
# Load the three raw CSV tables from the local data directory.
time_df = pd.read_csv('../data/player_time.csv')
matches_df = pd.read_csv("../data/match.csv")
player_df = pd.read_csv("../data/players.csv")

# set_index returns a new frame instead of modifying matches_df, so the result
# has to be assigned back. Later cells look matches up by id through the index
# (matches_df.at[match, 'radiant_win']).
matches_df = matches_df.set_index('match_id')

### Helper methods

In [10]:
# TODO: move these helpers into a separate module

def _slot_columns(prefix, slots):
    """Return the column names ``<prefix>_<slot>`` for every slot in ``slots``."""
    return [f'{prefix}_{slot}' for slot in slots]


def _mean_difference(df, first_cols, second_cols):
    """Row-wise mean of ``first_cols`` minus row-wise mean of ``second_cols``."""
    first_mean = df[first_cols].mean(axis=1)
    second_mean = df[second_cols].mean(axis=1)
    return first_mean - second_mean


# Gold and experience columns for the two groups of five player slots
# (slots 0-4 and slots 128-132).
RAD_GLD_COLS = _slot_columns('gold_t', range(0, 5))
DIR_GLD_COLS = _slot_columns('gold_t', range(128, 133))

RAD_XP_COLS = _slot_columns('xp_t', range(0, 5))
DIR_XP_COLS = _slot_columns('xp_t', range(128, 133))


def get_mean_gold_difference(df):
    """Per-row mean of the RAD gold columns minus mean of the DIR gold columns.

    Args:
        df: DataFrame containing every column in ``RAD_GLD_COLS`` and ``DIR_GLD_COLS``.

    Returns:
        A Series aligned with ``df``'s index.
    """
    return _mean_difference(df, RAD_GLD_COLS, DIR_GLD_COLS)


def get_mean_xp_difference(df):
    """Per-row mean of the RAD experience columns minus mean of the DIR experience columns.

    Args:
        df: DataFrame containing every column in ``RAD_XP_COLS`` and ``DIR_XP_COLS``.

    Returns:
        A Series aligned with ``df``'s index.
    """
    return _mean_difference(df, RAD_XP_COLS, DIR_XP_COLS)

### Data pre-configuration

In [11]:
# Per-row team differences (RAD columns minus DIR columns) for gold and experience.
gold_diff = get_mean_gold_difference(time_df)
xp_diff = get_mean_xp_difference(time_df)
time_df['mean_gold_diff'] = gold_diff
time_df['mean_xp_diff'] = xp_diff

# Interpret the raw 'times' values as seconds and turn them into timestamps.
time_df['times'] = pd.to_datetime(time_df['times'], unit='s')

# Wide layout: one row per match_id, one column per timestamp, cells hold the gold difference.
match_time_df = time_df.pivot(values='mean_gold_diff', columns='times', index='match_id')

### Data Exploration on a particular match

In [12]:
@widgets.interact()
def show_match(match=(1, 50)):
    """Plot the mean gold and experience difference over time for one match.

    Args:
        match: Id of the match to display. The ``(1, 50)`` default makes
            ``widgets.interact`` render the argument as an integer slider.

    Returns:
        The ``match`` value that was plotted.
    """
    _, ax = plt.subplots(figsize=(20, 6))
    ax.grid(True)

    some_match = time_df[time_df['match_id'] == match]
    # NOTE(review): .at resolves `match` against matches_df's index, so this is
    # only the requested match when that index holds match ids - confirm.
    does_win = matches_df.at[match, 'radiant_win']
    winner = 'Radiant' if does_win else 'Dire'

    sns.lineplot(data=some_match, y='mean_gold_diff', x='times',
                 label='Teams Mean Gold Difference', ax=ax)
    sns.lineplot(data=some_match, y='mean_xp_diff', x='times',
                 label='Teams Mean Experience Difference', ax=ax)

    # Tick labels show only the minute component of the timestamp.
    ax.xaxis.set_major_formatter(DateFormatter("%M"))

    # The backslash is escaped explicitly: a bare '\ ' is an invalid escape
    # sequence that Python only tolerates with a warning.
    ax.set(xlabel=f'minutes \\ {winner} wins',
           ylabel='Radiant \\ Dire Gold and Experience Difference')
    return match

interactive(children=(IntSlider(value=25, description='match', max=50, min=1), Output()), _dom_classes=('widge…

## Explore the predictors on a 5-minute interval

In [76]:
@widgets.interact()
def plot_mean_gold_difference(minute = (1, 60)):
    """Draw the distribution of the gold difference held in one column of ``match_time_df``.

    Args:
        minute: Positional index of the column to plot; the ``(1, 60)``
            default makes ``widgets.interact`` render it as an integer slider.

    Returns:
        The ``minute`` value that was plotted.
    """
    gold_diff_at_minute = match_time_df.iloc[:, minute]
    sns.distplot(gold_diff_at_minute)
    return minute

interactive(children=(IntSlider(value=30, description='minute', max=60, min=1), Output()), _dom_classes=('widg…

In [79]:
## Interactive QQ plot

In [72]:
@widgets.interact()
def qqplot_of_gold_by_minute(minute = (1, 60)):
    """Draw a Q-Q plot of the gold difference held in one column of ``match_time_df``.

    Args:
        minute: Positional index of the column to plot; the ``(1, 60)``
            default makes ``widgets.interact`` render it as an integer slider.

    Returns:
        The ``minute`` value that was plotted.
    """
    # Drop missing values first, as the Shapiro-Wilk cell does, so the
    # quantiles are computed from observed values only.
    observed = match_time_df.iloc[:, minute].dropna()
    qqplot(observed)
    return minute

interactive(children=(IntSlider(value=30, description='minute', max=60, min=1), Output()), _dom_classes=('widg…

## Test for normality using the Shapiro–Wilk test

In [77]:
@widgets.interact()
def shapiro_wilk_test_by_minute(minute=(1, 60)):
    """Run ``shapiro`` on one column of ``match_time_df`` and print the outcome.

    Args:
        minute: Positional index of the column to test; the ``(1, 60)``
            default makes ``widgets.interact`` render it as an integer slider.
    """
    # Significance level used to interpret the p-value.
    alpha = 0.05

    # Missing values are removed before the test is run.
    sample = match_time_df.iloc[:, minute].dropna().to_numpy()
    stat, p = shapiro(sample)
    print('Statistics=%.3f, p=%.3f' % (stat, p))

    verdict = (
        'Sample looks Gaussian (fail to reject H0)'
        if p > alpha
        else 'Sample does not look Gaussian (reject H0)'
    )
    print(verdict)


interactive(children=(IntSlider(value=30, description='minute', max=60, min=1), Output()), _dom_classes=('widg…

## Logistic regression on a time window of 5 minutes

In [80]:
def train(X_train, y_train, minute, model=None):
    """Fit a classifier on the window of columns that precedes ``minute``.

    The window is the positional column slice ``[minute - 6, minute - 1)`` of
    ``X_train``, clipped to the frame, i.e. at most five columns. Rows with a
    missing value anywhere in that window are left out of the fit.

    Args:
        X_train: DataFrame of features; columns are selected by position.
        y_train: Series of labels sharing ``X_train``'s index.
        minute: Positional column index that anchors the window.
        model: Estimator exposing ``fit(X, y)``. When omitted, a new
            ``LogisticRegression`` is created for this call. A default
            instance in the signature would be created once and shared, so
            every call would refit and return the same object.

    Returns:
        Whatever ``model.fit`` returns (the fitted estimator for scikit-learn models).
    """
    if model is None:
        model = LogisticRegression()

    ncols = X_train.shape[1]
    start = max(0, minute - 6)
    end = min(ncols - 1, minute - 1)

    window = X_train.iloc[:, start:end]
    # Keep only the rows that are fully observed inside the window.
    is_present = window.notna().all(axis=1)

    return model.fit(window[is_present], y_train[is_present])

In [81]:
# NOTE(review): test_size=0.8 holds out 80% of the matches and fits on the
# remaining 20% - confirm this split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    match_time_df, matches_df['radiant_win'], test_size=0.8, random_state=1
)

test_match = X_test.iloc[10:11, :]
# Positional lookup: y_test keeps the shuffled index labels of the original
# series, so y_test[0] would search for the *label* 0 and raise KeyError
# whenever that row landed in the training split.
does_win = y_test.iloc[0]

model = train(X_train, y_train, 26, LogisticRegression())

game_minutes = range(6, 40)
# A fresh estimator is passed for every minute so each entry holds its own
# fitted model instead of all entries sharing one refitted instance.
models = [
    (train(X_train, y_train, minute, LogisticRegression()), minute)
    for minute in game_minutes
]
probs = [
    fitted.predict_proba(X_test.iloc[6:7, i-6:i-1].dropna())[0]
    for fitted, i in models
]

## Probability of winning visualization


In [82]:
# Work with the first held-out match. X_test and y_test come from the same
# train_test_split call, so row `match_pos` of both describes the same match.
match_pos = 0
match_id = X_test.index[match_pos]

# match_id is an index label, not a row position, so the row is selected by
# position here rather than with iloc[match_id:match_id + 1].
match = X_test.iloc[match_pos:match_pos + 1, :]
does_win = y_test.iloc[match_pos]

# Predict with each per-minute model, skipping minutes whose feature window has
# missing values (predict_proba cannot run on the empty frame dropna() would leave).
probs = []
minutes_with_data = []
for fitted, minute in models:
    window = match.iloc[:, minute - 6:minute - 1]
    if window.isna().to_numpy().any():
        continue
    probs.append(fitted.predict_proba(window)[0])
    minutes_with_data.append(minute)

probs_df = pd.DataFrame({
    'minute': minutes_with_data,
    'radiant_win_probability': [prob[1] for prob in probs],
    'dire_win_probability': [prob[0] for prob in probs],
})

winner = 'Radiant' if does_win else 'Dire'
g1 = sns.lineplot(data=probs_df, y='dire_win_probability', x='minute', label='Dire Wins')
g2 = sns.lineplot(data=probs_df, y='radiant_win_probability', x='minute', label='Radiant Wins')

# The backslash is escaped explicitly: a bare '\ ' is an invalid escape sequence.
g1.set(xlabel=f'minutes of the game \\ {winner} wins', ylabel='Teams win probability');

interactive(children=(Dropdown(description='match_id', options=(26247, 35067, 34590, 16668, 12196, 2600, 9047,…