## Предсказание победителя в DotA 2

disable warnings and import all needed libraries:

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, ShuffleSplit, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

Для начала загрузим признаки:

In [3]:
%%time
features = pd.read_csv('data/features.csv', index_col='match_id')
features.head()

Wall time: 20.4 s


Удалим признаки, которых нет в тестовой выборке <br>
Узнаем есть ли в признаках пропуски (пропуски скорее всего означают, что события не произошло за первые 5 минут) <br>
Заполним пропуски нулями

In [4]:
train_df = features.drop(columns=['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire',
                                  'barracks_status_radiant', 'barracks_status_dire'])
print(train_df.columns[train_df.isna().any()].tolist())
train_df = train_df.fillna(0)

['first_blood_time', 'first_blood_team', 'first_blood_player1', 'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time', 'radiant_flying_courier_time', 'radiant_first_ward_time', 'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time', 'dire_first_ward_time']


Выделим **X_train** и **y_train**

In [5]:
X_train = train_df.values
y_train = features['radiant_win'].values

#### 1. Gradient Boosting 5-Fold cross-validation
Обучим модель на разном количестве деревьев 

In [7]:
%%time
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=17)

model_gb1 = GradientBoostingClassifier(n_estimators=10)

# calcuate ROC-AUC for each split
cv_scores_gb1 = cross_val_score(model_gb1, X_train, y_train, cv=cv, scoring='roc_auc')

Wall time: 1min 47s


На 10 деревьях 0.6638

In [8]:
print(cv_scores_gb1.mean(), cv_scores_gb1.std())

0.6638212118591542 0.0027055733382682304


In [18]:
%%time
model_gb2 = GradientBoostingClassifier(n_estimators=20)
cv_scores_gb2 = cross_val_score(model_gb2, X_train, y_train, cv=cv, scoring='roc_auc')

model_gb3 = GradientBoostingClassifier(n_estimators=30)
cv_scores_gb3 = cross_val_score(model_gb3, X_train, y_train, cv=cv, scoring='roc_auc')

Wall time: 7min 9s


In [21]:
%%time
model_gb250 = GradientBoostingClassifier(n_estimators=250)
cv_scores_gb250 = cross_val_score(model_gb250, X_train, y_train, cv=cv, scoring='roc_auc')

In [24]:
print(cv_scores_gb2.mean(), cv_scores_gb2.std())
print(cv_scores_gb3.mean(), cv_scores_gb3.std())

0.6809035362077415 0.002869911864414055
0.688028167191805 0.002920434580041873


In [25]:
print(cv_scores_gb250.mean(), cv_scores_gb250.std())

0.7148684121164571 0.0018273112133065678


In [26]:
%%time
model_gb3 = GradientBoostingClassifier(n_estimators=30)
cv_scores_gb3 = cross_val_score(model_gb3, X_train, y_train, cv=cv, scoring='roc_auc')

Wall time: 4min 12s


In [27]:
%%time
model_gb3_1 = GradientBoostingClassifier(n_estimators=30, max_depth=2)
cv_scores_gb3_1 = cross_val_score(model_gb3_1, X_train, y_train, cv=cv, scoring='roc_auc')

Wall time: 2min 30s


In [28]:
cv_scores_gb3_1.mean() #max_depth=2, n_estimators=30 => 0.6812550642543101

0.6812550642543101

При увеличении количества деревьев растет качество, а так же время обучения модели. Ускорить обучение можно уменьшив другие параметры (например, max_depth), но из-за этого пострадает качество <br>

Let's try **Logistic Regression** <br>
Функция для нормировки признаков и получения cross_val_score по ROC_AUC:

In [9]:
def get_logreg_score(X):
    X_train_scaled = StandardScaler().fit_transform(X)
    logit = LogisticRegression(C=1, penalty='l2')
    cv_scores_logit = cross_val_score(logit, X_train_scaled, y_train, cv=cv, scoring='roc_auc')
    return [cv_scores_logit.mean(), cv_scores_logit.std()]

Сходу получаем 71%

In [10]:
%%time
get_logreg_score(X_train)

Wall time: 43.8 s


[0.7152442684469731, 0.0023618816025120838]

Вспомним тот факт, что в выборке присутвуют категориальные признаки, попробуем удалить их и замерить качество еще раз:

In [11]:
%%time
cat_features = ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
X_train = train_df.drop(columns=cat_features).values
print(get_logreg_score(X_train))

[0.7153294734641201, 0.0022947329444929403]
Wall time: 31.7 s


Сколько всего различных героев существует в выборке?

In [59]:
heroes = ['r1_hero', 'r2_hero', 'r3_hero','r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
len(np.unique(train_df[heroes].values))
#108 или 112

108

Сформируем 112 признаков. i-й признак будет равен нулю, если герой не участвовал в матче, единице - если играл за Radiant, минус единице - если играл за Dire

In [13]:
X_pick = np.zeros((train_df.shape[0], 112))

for i, match_id in enumerate(train_df.index):
    for p in range(5):
        X_pick[i, train_df.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, train_df.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

Добавим перекодированные признаки к X_train

In [14]:
X_train = np.concatenate([X_train, X_pick], axis=1)

Получаем `ROC-AUC = 0.75105`

In [15]:
get_logreg_score(X_train)

[0.751054575291998, 0.0010964869190840853]

Посмотрим на корреляцию признаков между друг другом

In [16]:
#correlation matrix
corr = train_df.corr().abs()

s = corr.unstack()
so = s.sort_values(kind="quicksort")
print (so[(so < 1) & (so > 0.8)])

d1_lh                d1_gold                0.844783
d1_gold              d1_lh                  0.844783
d4_gold              d4_lh                  0.847389
d4_lh                d4_gold                0.847389
d2_lh                d2_gold                0.847995
d2_gold              d2_lh                  0.847995
d5_lh                d5_gold                0.848907
d5_gold              d5_lh                  0.848907
d3_lh                d3_gold                0.849217
d3_gold              d3_lh                  0.849217
r1_lh                r1_gold                0.851178
r1_gold              r1_lh                  0.851178
r2_lh                r2_gold                0.855202
r2_gold              r2_lh                  0.855202
r3_gold              r3_lh                  0.855882
r3_lh                r3_gold                0.855882
r5_lh                r5_gold                0.857561
r5_gold              r5_lh                  0.857561
r4_gold              r4_lh                  0.

Мы видим, что присутствуют сильно скоррелированные между собой признаки: **xp и level, first_blood_player1 и first_blood_team, lh и gold**. Это может привести к неустойчивости финальной модели при использовании логистической регрессии, так что попробуем удалить эти признаки

In [17]:
corr_features = ['r1_level', 'r2_level', 'r3_level', 'r4_level', 'r5_level',
                 'd1_level', 'd2_level', 'd3_level', 'd4_level', 'd5_level',
                'first_blood_player1', 'r1_lh', 'r2_lh', 'r3_lh', 'r4_lh', 'r5_lh',
                 'd1_lh', 'd2_lh', 'd3_lh', 'd4_lh', 'd5_lh']

In [18]:
%%time
X_train = train_df.drop(columns=cat_features+corr_features).values
X_train = np.concatenate([X_train, X_pick], axis=1)
print(get_logreg_score(X_train))

[0.7481035489702818, 0.001151820035315145]
Wall time: 47.6 s


Как можно заметить, качество даже немного ухудшилось. Попробуем создать агрегированные признаки для каждой команды: просуммируем каждый признак по всем пяти героям команды и посмотрим как изменится скор

In [49]:
%%time
train_df['r_level_sum'] = 0
train_df['d_level_sum'] = 0
train_df['r_xp_sum'] = 0 
train_df['d_xp_sum'] = 0
train_df['r_gold_sum'] = 0
train_df['d_gold_sum'] = 0
train_df['r_lh_sum'] = 0 
train_df['d_lh_sum'] = 0
train_df['r_kills_sum'] = 0
train_df['d_kills_sum'] = 0
train_df['r_deaths_sum'] = 0
train_df['d_deaths_sum'] = 0
train_df['r_items_sum'] = 0
train_df['d_items_sum'] = 0
for i in range(5):
    train_df['r_level_sum'] += train_df['r%d_level' % (i+1)]
    train_df['d_level_sum'] += train_df['d%d_level' % (i+1)]
    train_df['r_xp_sum'] += train_df['r%d_xp' % (i+1)]
    train_df['d_xp_sum'] += train_df['d%d_xp' % (i+1)]
    train_df['r_gold_sum'] += train_df['r%d_gold' % (i+1)]
    train_df['d_gold_sum'] += train_df['d%d_gold' % (i+1)]
    train_df['r_lh_sum'] += train_df['r%d_lh' % (i+1)]
    train_df['d_lh_sum'] += train_df['d%d_lh' % (i+1)]
    train_df['r_kills_sum'] += train_df['r%d_kills' % (i+1)]
    train_df['d_kills_sum'] += train_df['d%d_kills' % (i+1)]
    train_df['r_deaths_sum'] += train_df['r%d_deaths' % (i+1)]
    train_df['d_deaths_sum'] += train_df['d%d_deaths' % (i+1)]
    train_df['r_items_sum'] += train_df['r%d_items' % (i+1)]
    train_df['d_items_sum'] += train_df['d%d_items' % (i+1)]
    
player_features = ['r1_hero', 'r1_level', 'r1_xp', 'r1_gold', 'r1_lh', 'r1_kills', 'r1_deaths', 'r1_items',
                   'r2_hero', 'r2_level', 'r2_xp', 'r2_gold', 'r2_lh', 'r2_kills', 'r2_deaths', 'r2_items',
                   'r3_hero', 'r3_level', 'r3_xp', 'r3_gold', 'r3_lh', 'r3_kills', 'r3_deaths', 'r3_items',
                   'r4_hero', 'r4_level', 'r4_xp', 'r4_gold', 'r4_lh', 'r4_kills', 'r4_deaths', 'r4_items',
                   'r5_hero', 'r5_level', 'r5_xp', 'r5_gold', 'r5_lh', 'r5_kills', 'r5_deaths', 'r5_items',
                   'd1_hero', 'd1_level', 'd1_xp', 'd1_gold', 'd1_lh', 'd1_kills', 'd1_deaths', 'd1_items',
                   'd2_hero', 'd2_level', 'd2_xp', 'd2_gold', 'd2_lh', 'd2_kills', 'd2_deaths', 'd2_items',
                   'd3_hero', 'd3_level', 'd3_xp', 'd3_gold', 'd3_lh', 'd3_kills', 'd3_deaths', 'd3_items',
                   'd4_hero', 'd4_level', 'd4_xp', 'd4_gold', 'd4_lh', 'd4_kills', 'd4_deaths', 'd4_items',
                   'd5_hero', 'd5_level', 'd5_xp', 'd5_gold', 'd5_lh', 'd5_kills', 'd5_deaths', 'd5_items']
X_train = train_df.drop(columns=player_features+['lobby_type', 'first_blood_player1'])
X_train = np.concatenate([X_train, X_pick], axis=1)
print(get_logreg_score(X_train))

[0.7515533999133, 0.001150669057955364]
Wall time: 45.4 s


ROC-AUC вырос на 0.0005

Настроим параметры C и penalty с помощью GridSearchCV:

In [19]:
penalty = ['l1', 'l2']
C = np.logspace(0,4,5)
params = dict(C=C, penalty=penalty)

In [20]:
%%time
X_train_scaled = StandardScaler().fit_transform(X_train)
logit_cv = GridSearchCV(LogisticRegression(), params, cv=cv, scoring='roc_auc')
logit_cv.fit(X_train_scaled, y_train)

print("best parameters:", logit_cv.best_params_)
print("best score:", logit_cv.best_score_)

best parameters: {'C': 1.0, 'penalty': 'l1'}
best score: 0.7481104974558247
Wall time: 23min 59s


Тюнинг увеличил скор всего на 0.00001

Функция для вывода предсказаний тестовой выборки в файл (чтобы засабмититься на каггл)

In [44]:
def write_to_submission_file(predicted_labels, out_file,
                             target='radiant_win', index_label='match_id'):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = test_df.index,
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

Проделываем те же преобразования с тестовой выборкой

In [46]:
features_test = pd.read_csv('data/features_test.csv', index_col='match_id')
features_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0


In [47]:
test_df = features_test.fillna(0)

In [48]:
X_pick_test = np.zeros((test_df.shape[0], 112))

for i, match_id in enumerate(test_df.index):
    for p in range(5):
        X_pick_test[i, test_df.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, test_df.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [52]:
test_df['r_level_sum'] = 0
test_df['d_level_sum'] = 0
test_df['r_xp_sum'] = 0 
test_df['d_xp_sum'] = 0
test_df['r_gold_sum'] = 0
test_df['d_gold_sum'] = 0
test_df['r_lh_sum'] = 0 
test_df['d_lh_sum'] = 0
test_df['r_kills_sum'] = 0
test_df['d_kills_sum'] = 0
test_df['r_deaths_sum'] = 0
test_df['d_deaths_sum'] = 0
test_df['r_items_sum'] = 0
test_df['d_items_sum'] = 0
for i in range(5):
    test_df['r_level_sum'] += test_df['r%d_level' % (i+1)]
    test_df['d_level_sum'] += test_df['d%d_level' % (i+1)]
    test_df['r_xp_sum'] += test_df['r%d_xp' % (i+1)]
    test_df['d_xp_sum'] += test_df['d%d_xp' % (i+1)]
    test_df['r_gold_sum'] += test_df['r%d_gold' % (i+1)]
    test_df['d_gold_sum'] += test_df['d%d_gold' % (i+1)]
    test_df['r_lh_sum'] += test_df['r%d_lh' % (i+1)]
    test_df['d_lh_sum'] += test_df['d%d_lh' % (i+1)]
    test_df['r_kills_sum'] += test_df['r%d_kills' % (i+1)]
    test_df['d_kills_sum'] += test_df['d%d_kills' % (i+1)]
    test_df['r_deaths_sum'] += test_df['r%d_deaths' % (i+1)]
    test_df['d_deaths_sum'] += test_df['d%d_deaths' % (i+1)]
    test_df['r_items_sum'] += test_df['r%d_items' % (i+1)]
    test_df['d_items_sum'] += test_df['d%d_items' % (i+1)]
    
player_features = ['r1_hero', 'r1_level', 'r1_xp', 'r1_gold', 'r1_lh', 'r1_kills', 'r1_deaths', 'r1_items',
                   'r2_hero', 'r2_level', 'r2_xp', 'r2_gold', 'r2_lh', 'r2_kills', 'r2_deaths', 'r2_items',
                   'r3_hero', 'r3_level', 'r3_xp', 'r3_gold', 'r3_lh', 'r3_kills', 'r3_deaths', 'r3_items',
                   'r4_hero', 'r4_level', 'r4_xp', 'r4_gold', 'r4_lh', 'r4_kills', 'r4_deaths', 'r4_items',
                   'r5_hero', 'r5_level', 'r5_xp', 'r5_gold', 'r5_lh', 'r5_kills', 'r5_deaths', 'r5_items',
                   'd1_hero', 'd1_level', 'd1_xp', 'd1_gold', 'd1_lh', 'd1_kills', 'd1_deaths', 'd1_items',
                   'd2_hero', 'd2_level', 'd2_xp', 'd2_gold', 'd2_lh', 'd2_kills', 'd2_deaths', 'd2_items',
                   'd3_hero', 'd3_level', 'd3_xp', 'd3_gold', 'd3_lh', 'd3_kills', 'd3_deaths', 'd3_items',
                   'd4_hero', 'd4_level', 'd4_xp', 'd4_gold', 'd4_lh', 'd4_kills', 'd4_deaths', 'd4_items',
                   'd5_hero', 'd5_level', 'd5_xp', 'd5_gold', 'd5_lh', 'd5_kills', 'd5_deaths', 'd5_items']
X_test = test_df.drop(columns=player_features+['lobby_type', 'first_blood_player1'])
X_test = np.concatenate([X_test, X_pick_test], axis=1)

Wall time: 359 ms


Нормируем выборки, чтобы зафититься и получить оценки принадлежности для ROC-AUC

In [53]:
X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit_transform(X_test)

In [54]:
%%time
logit = LogisticRegression(C=1, penalty='l1')
logit.fit(X_train_scaled, y_train)
predictions = logit.predict_proba(X_test_scaled)[:, 1]

Wall time: 42.6 s


Максимальное и минимальное значение прогноза

In [58]:
print('Max:', predictions.max())
print ('Min:', predictions.min())

Max: 0.9963148561147639
Min: 0.008539959627259305


Наконец, запишем массив оценок для тестовой выборки в файл

In [55]:
write_to_submission_file(predictions, 'submission.csv')

Получили `ROC-AUC = 0.75545` на Public LB