# Predicting the winning side of Dota2

### 1. Libraries imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor

from sklearn.utils import check_array
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Model assement functionality
from sklearn.model_selection import LeaveOneOut, KFold, train_test_split, cross_val_score

### Game Modes

| id | Name                   |
|----|------------------------|
| 0  | No Game Mode           |
| 1  | All Pick               |
| 2  | Captain's Mode         |
| 3  | Random Draft           |
| 4  | Single Draft           |
| 5  | All Random             |
| 7  | Diretide               |
| 8  | Reverse Captain's Mode |
| 9  | Greeviling             |
| 10 | Tutorial               |
| 11 | Mid Only               |
| 12 | Least Played           |
| 13 | New Player Pool        |
| 14 | Compendium Matchmaking |
| 15 | Custom                 |
| 16 | Captain's Draft        |
| 17 | Balanced Draft         |
| 18 | Ability Draft          |
| 20 | All Random Deathmatch  |
| 21 | Solo Mid 1v1           |
| 22 | Ranked All Pick        |

1, 2, 3, 4, 5, 12, 16

### Data imports

In [138]:
players_data = pd.read_csv("./data/players.csv")
matches_data = pd.read_csv("./data/match.csv")
player_time_data = pd.read_csv('./data/player_time.csv')
players_data = players_data.merge(matches_data[["radiant_win", "match_id", "game_mode", 'duration']], on="match_id")

### Model selection and assesment

##### Assumptions of the model
* 1. All players play equally every hero
* 2. All players in a match are playing similiarly good
* 3. The player doesn't progress with time (k-fold estimate)

* The game mode is either all pick, single draft, all random, random draft, captain’s draft, captain’s mode, or least played.

* The skill level of the players is “very-high,” which corresponds to roughly the top 8% of players. 
* No players leave the match before the game is completed.

### Data preparation for feature vector of heroes

#### Remove data where some of the players has left or is abandoned

In [79]:
# leavers_data = players_data[players_data.leaver_status != 0]
# match_ids_having_left = leavers_data.match_id.unique()
# left_mask = players_data['match_id'].isin(match_ids_having_left)
# players_data = players_data[left_mask == False]

#### Remove those are playing too long

In [145]:
df = players_data[['hero_id', 'match_id', 'player_slot', 'radiant_win', 'leaver_status']]

heroes_number = df.hero_id.max()
rad_games = df[df['player_slot'] == 0][['match_id', 'hero_id']]
dir_games = df[df['player_slot'] == 1][['match_id', 'hero_id']]

# clean those who have left
# rad_games = rad_games[rad_games.leaver_status == 0]
# dir_games = dir_games[dir_games.leaver_status == 0]

rad_heroes = pd.pivot_table(rad_games, index='match_id', aggfunc=len, columns='hero_id', fill_value=0)

dir_heroes = pd.pivot_table(dir_games, index='match_id', aggfunc=len, columns='hero_id', fill_value=0)
dir_heroes.columns += heroes_number + 1

features_df = rad_heroes.merge(dir_heroes, on='match_id')

responses = df.drop_duplicates(subset='match_id', keep="last")['radiant_win']

In [146]:
dir_heroes

hero_id,113,114,115,116,117,118,119,120,121,122,...,215,216,217,218,219,220,222,223,224,225
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [133]:
def predict(vector):
    rad_prob = lr.predict(X_test)

### Model fitting and assesement

In [258]:
X_train, X_test, y_train, y_test = train_test_split(features_df, responses,
                                                    test_size=0.10, random_state=42)
lr = LogisticRegression()
clf = lr.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5412

In [262]:
reversed_indices = np.r_[111:222 , :111]
X_test2 = X_test.iloc[:, reversed_indices]

rad_probs = clf.predict_proba(X_test)[:, 1]
dir_probs = clf.predict_proba(X_test2)[:, 1]
ove_probs = (rad_probs + (dir_probs)) / 2
is_winning = ove_probs > 0.5
# # is_winning
sum(is_winning == y_test) / y_test.shape[0]
# X_test2

0.5436

In [3]:
%%html
<style>
table {float:left}
</style>