# Imports and reading data

In [1]:
import os
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer

In [2]:
data_path = Path(os.getcwd()).parent/"data"/"dota"

df = pd.read_csv(data_path/"features.csv", index_col="match_id")
df_test = pd.read_csv(data_path/"features_test.csv", index_col="match_id")

y = df.loc[:, "radiant_win"].astype(float).to_numpy() # extract answers for df

# drop "future" features
df.drop(columns=["start_time",
                 "radiant_win",
                 "tower_status_radiant",
                 "tower_status_dire",
                 "barracks_status_radiant",
                 "barracks_status_dire",
                 "duration"], inplace=True)

# EDA

Давайте посмотрим на то какие фичи у нас есть

In [3]:
df.describe()

Unnamed: 0,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_hero,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
count,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,...,97230.0,95394.0,81087.0,96554.0,71132.0,97230.0,97230.0,97230.0,97230.0,95404.0
mean,2.630999,51.517104,3.442672,1233.405801,1147.899702,11.231996,0.357009,0.362285,8.271315,52.183452,...,0.71625,-6.875747,127.215028,-80.191893,214.870536,2.965566,3.349553,2.448339,0.689119,-6.901922
std,2.835761,32.564211,1.111741,566.588895,464.111662,9.04162,0.663889,0.626704,2.497575,32.674077,...,0.725331,39.50865,62.442018,15.26195,34.137158,1.907288,1.155609,0.813459,0.710122,40.701397
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,-236.0,-45.0,-90.0,180.0,0.0,0.0,0.0,0.0,-84.0
25%,1.0,22.0,3.0,767.0,746.0,2.0,0.0,0.0,7.0,25.0,...,0.0,-31.0,83.0,-86.0,185.0,2.0,3.0,2.0,0.0,-31.0
50%,1.0,50.0,3.0,1175.0,1113.0,11.0,0.0,0.0,8.0,50.0,...,1.0,-15.0,131.0,-84.0,203.0,3.0,3.0,2.0,1.0,-16.0
75%,7.0,75.0,4.0,1704.0,1479.0,19.0,1.0,1.0,10.0,75.0,...,1.0,9.0,165.0,-79.0,238.0,4.0,4.0,3.0,1.0,8.0
max,7.0,112.0,6.0,3319.0,4332.0,47.0,8.0,5.0,34.0,112.0,...,25.0,300.0,300.0,296.0,300.0,21.0,9.0,9.0,13.0,300.0


In [4]:
df.head()

Unnamed: 0_level_0,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_hero,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,7,11,5,2098,1489,20,0,0,7,67,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,0,42,4,1188,1033,9,0,1,12,49,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,7,33,4,1319,1270,22,0,0,12,98,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1,29,4,1779,1056,14,0,0,5,30,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,7,13,4,1431,1090,8,1,0,8,27,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0


Какие колонки имеют пропущенные значения?

In [5]:
def get_missing(df):
    has_missing = []
    for col_name, cnt in zip(df.columns, df.count()):
        if cnt < len(df):
            has_missing.append({"col_name": col_name, "n_missing": len(df) - cnt})
    return has_missing

In [6]:
get_missing(df)

[{'col_name': 'first_blood_time', 'n_missing': 19553},
 {'col_name': 'first_blood_team', 'n_missing': 19553},
 {'col_name': 'first_blood_player1', 'n_missing': 19553},
 {'col_name': 'first_blood_player2', 'n_missing': 43987},
 {'col_name': 'radiant_bottle_time', 'n_missing': 15691},
 {'col_name': 'radiant_courier_time', 'n_missing': 692},
 {'col_name': 'radiant_flying_courier_time', 'n_missing': 27479},
 {'col_name': 'radiant_first_ward_time', 'n_missing': 1836},
 {'col_name': 'dire_bottle_time', 'n_missing': 16143},
 {'col_name': 'dire_courier_time', 'n_missing': 676},
 {'col_name': 'dire_flying_courier_time', 'n_missing': 26098},
 {'col_name': 'dire_first_ward_time', 'n_missing': 1826}]

In [7]:
get_missing(df_test)

[{'col_name': 'first_blood_time', 'n_missing': 3552},
 {'col_name': 'first_blood_team', 'n_missing': 3552},
 {'col_name': 'first_blood_player1', 'n_missing': 3552},
 {'col_name': 'first_blood_player2', 'n_missing': 7766},
 {'col_name': 'radiant_bottle_time', 'n_missing': 2895},
 {'col_name': 'radiant_courier_time', 'n_missing': 127},
 {'col_name': 'radiant_flying_courier_time', 'n_missing': 4885},
 {'col_name': 'radiant_first_ward_time', 'n_missing': 330},
 {'col_name': 'dire_bottle_time', 'n_missing': 2842},
 {'col_name': 'dire_courier_time', 'n_missing': 130},
 {'col_name': 'dire_flying_courier_time', 'n_missing': 4524},
 {'col_name': 'dire_first_ward_time', 'n_missing': 263}]

# Data preparation

In [3]:
df.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

In [4]:
assert len(get_missing(df)) == 0
assert len(get_missing(df_test)) == 0

NameError: name 'get_missing' is not defined

In [5]:
X = df.to_numpy()

# Gradient boosting

In [6]:
import time

def train_clf(Classifier, X, y, params):
    cv = KFold(n_splits=5, shuffle=True, random_state=241)

    scores = {}

    for param in tqdm(params):
        score = np.zeros(5)
        durs = np.zeros(5)
        i = 0
        for train_index, test_index in cv.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = Classifier(**param, random_state=241)

            start = time.time()

            clf.fit(X_train, y_train)

            dur = time.time() - start

            y_pred = clf.predict_proba(X_test)[:, 1]
            score[i] = roc_auc_score(y_test, y_pred)
            durs[i] = dur
            i += 1

        scores[" ".join(map(str, param.items()))] = {"score": score.mean(), "duration": durs.sum()}
    return scores

In [12]:
clf_params = [{"n_estimators": 10}, {"n_estimators": 20}, {"n_estimators": 30}]
scores = train_clf(GradientBoostingClassifier, X, y, clf_params)
scores

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




{"('n_estimators', 10)": {'score': 0.6643877206345741,
  'duration': 48.40032339096069},
 "('n_estimators', 20)": {'score': 0.6828535735340823,
  'duration': 85.42267179489136},
 "('n_estimators', 30)": {'score': 0.6894962060591201,
  'duration': 154.79795503616333}}

# Logistic regression

In [13]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [14]:
clf_params = [{"penalty": "l2", "C": 0.001},
              {"penalty": "l2", "C": 0.01},
              {"penalty": "l2", "C": 0.1},
              {"penalty": "l2", "C": 1.},
              {"penalty": "l2", "C": 10.},
              {"penalty": "l2", "C": 1000.},
              {"penalty": "l2", "C": 10000.},
              {"penalty": "l2", "C": 100000.},
              {"penalty": 'none'}]
scores = train_clf(LogisticRegression, X, y, clf_params)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))




In [15]:
scores

{"('penalty', 'l2') ('C', 0.001)": {'score': 0.7160840366228188,
  'duration': 1.9151926040649414},
 "('penalty', 'l2') ('C', 0.01)": {'score': 0.7162598557965987,
  'duration': 3.5079455375671387},
 "('penalty', 'l2') ('C', 0.1)": {'score': 0.716233028946313,
  'duration': 4.0903143882751465},
 "('penalty', 'l2') ('C', 1.0)": {'score': 0.7162292494059109,
  'duration': 3.0014796257019043},
 "('penalty', 'l2') ('C', 10.0)": {'score': 0.7162283088307158,
  'duration': 3.235761880874634},
 "('penalty', 'l2') ('C', 1000.0)": {'score': 0.7162281796391324,
  'duration': 2.8909637928009033},
 "('penalty', 'l2') ('C', 10000.0)": {'score': 0.7162281838815987,
  'duration': 2.900258779525757},
 "('penalty', 'l2') ('C', 100000.0)": {'score': 0.7162281817636524,
  'duration': 3.161609649658203},
 "('penalty', 'none')": {'score': 0.7162281817636524,
  'duration': 3.706479787826538}}

In [16]:
del X

In [7]:
scaler = StandardScaler()
df_without_cat = df.drop(columns=["lobby_type", "r1_hero", "r2_hero", "r3_hero", "r4_hero", "r5_hero", "d1_hero", "d2_hero", "d3_hero", "d4_hero", "d5_hero"])
X = df_without_cat.to_numpy()
X = scaler.fit_transform(X)

In [18]:
clf_params = [{"penalty": "l2", "C": 0.001},
              {"penalty": "l2", "C": 0.01},
              {"penalty": "l2", "C": 0.1},
              {"penalty": "l2", "C": 1.},
              {"penalty": "l2", "C": 10.},
              {"penalty": "l2", "C": 1000.},
              {"penalty": "l2", "C": 10000.},
              {"penalty": "l2", "C": 100000.},
              {"penalty": 'none'}]
scores = train_clf(LogisticRegression, X, y, clf_params)
scores

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))




{"('penalty', 'l2') ('C', 0.001)": {'score': 0.7161378059340825,
  'duration': 2.097487211227417},
 "('penalty', 'l2') ('C', 0.01)": {'score': 0.7163215664424338,
  'duration': 2.554262399673462},
 "('penalty', 'l2') ('C', 0.1)": {'score': 0.7162966624026562,
  'duration': 2.6589972972869873},
 "('penalty', 'l2') ('C', 1.0)": {'score': 0.7162927662392431,
  'duration': 2.885206460952759},
 "('penalty', 'l2') ('C', 10.0)": {'score': 0.7162922705279733,
  'duration': 2.6871306896209717},
 "('penalty', 'l2') ('C', 1000.0)": {'score': 0.7162922577318527,
  'duration': 2.715484142303467},
 "('penalty', 'l2') ('C', 10000.0)": {'score': 0.7162922513714152,
  'duration': 3.5428035259246826},
 "('penalty', 'l2') ('C', 100000.0)": {'score': 0.7162922492537789,
  'duration': 2.870666980743408},
 "('penalty', 'none')": {'score': 0.7162922492537789,
  'duration': 2.7602288722991943}}

In [9]:
n_heroes = df.filter(like="hero").max().max()
n_heroes

112

In [10]:
X_pick = np.zeros((df.shape[0], n_heroes))

for i, match_id in enumerate(df.index):
    for p in range(5):
        X_pick[i, df.loc[match_id, f'r{p+1}_hero']-1] = 1
        X_pick[i, df.loc[match_id, f'd{p+1}_hero']-1] = -1

In [11]:

X = np.hstack([X, X_pick])
X = scaler.fit_transform(X)
X.shape

(97230, 202)

In [12]:
clf_params = [{"penalty": "l2", "C": 0.001},
              {"penalty": "l2", "C": 0.01},
              {"penalty": "l2", "C": 0.1},
              {"penalty": "l2", "C": 1.},
              {"penalty": "l2", "C": 10.},
              {"penalty": "l2", "C": 1000.},
              {"penalty": "l2", "C": 10000.},
              {"penalty": "l2", "C": 100000.},
              {"penalty": 'none'}]
scores = train_clf(LogisticRegression, X, y, clf_params)
scores

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))




{"('penalty', 'l2') ('C', 0.001)": {'score': 0.7515256704904653,
  'duration': 3.915085554122925},
 "('penalty', 'l2') ('C', 0.01)": {'score': 0.7518281840948673,
  'duration': 5.507117509841919},
 "('penalty', 'l2') ('C', 0.1)": {'score': 0.7517849643867451,
  'duration': 6.085991382598877},
 "('penalty', 'l2') ('C', 1.0)": {'score': 0.7517765427330695,
  'duration': 6.939167022705078},
 "('penalty', 'l2') ('C', 10.0)": {'score': 0.7517755992405005,
  'duration': 6.4167468547821045},
 "('penalty', 'l2') ('C', 1000.0)": {'score': 0.7517763093385121,
  'duration': 6.6445839405059814},
 "('penalty', 'l2') ('C', 10000.0)": {'score': 0.7517763050966866,
  'duration': 6.404962778091431},
 "('penalty', 'l2') ('C', 100000.0)": {'score': 0.7517763050966866,
  'duration': 6.971964359283447},
 "('penalty', 'none')": {'score': 0.7517763050966866,
  'duration': 6.066238641738892}}

In [13]:
clf = LogisticRegression()
clf.fit(X, y)

LogisticRegression()

In [34]:
del X

# Predict test

In [14]:
df_no_cat_test = df_test.drop(columns=["start_time", "lobby_type", "r1_hero", "r2_hero", "r3_hero", "r4_hero", "r5_hero", "d1_hero", "d2_hero", "d3_hero", "d4_hero", "d5_hero"])
X = df_no_cat_test.to_numpy()

X_pick = np.zeros((df_test.shape[0], n_heroes))

for i, match_id in enumerate(df_test.index):
    for p in range(5):
        X_pick[i, df_test.loc[match_id, f'r{p+1}_hero']-1] = 1
        X_pick[i, df_test.loc[match_id, f'd{p+1}_hero']-1] = -1

X = np.hstack([X, X_pick])
X = scaler.transform(X)
X.shape

(17177, 202)

In [16]:
y_pred = clf.predict_proba(X)[:, 1]

df_ans = pd.DataFrame({"match_id": df_test.index, "radiant_win": y_pred})
df_ans.set_index("match_id", inplace=True)
df_ans.to_csv(data_path/"preds.csv", index="match_id")