# Подход 1: Градиентный бустинг

In [51]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('./features.csv', index_col='match_id')
# print (df.head())
y = df['radiant_win'] #целевая переменная
cols_num = len(df.columns)
cols_to_drop = list(df.columns.values[(cols_num - 6):])
print(f'Признаки, связанные с итогами матча: {cols_to_drop}')
df = df.drop(columns = cols_to_drop)

Признаки, связанные с итогами матча: ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']


In [42]:
def missed_labels(df) -> list:
    count_series = df.count()
    labels = []
    max_val = max(count_series)
    for i in range(0, len(count_series)):
        ind = count_series.index[i] 
        if count_series[ind] != max_val:
            labels.append(ind)
    return labels
            
print (f'Признаки с пропущенными значениями: {missed_labels(df)}')

Признаки с пропущенными значениями: ['first_blood_time', 'first_blood_team', 'first_blood_player1', 'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time', 'radiant_flying_courier_time', 'radiant_first_ward_time', 'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time', 'dire_first_ward_time']


In [54]:
def subst_missed(mode, df):
    if mode == 'zeros':
        df_out = df.fillna(0)
    elif mode == 'min':
        df_out = df.fillna(df.min())
    elif mode == 'max':
        df_out = df.fillna(df.max())
    elif mode == 'mean':
        df_out = df.fillna(df.mean())
    return df_out
        
mode = ['zeros', 'min', 'max', 'mean']#способы заполнения пропущенных значений: нулями, минимумом, максимумом, средним
# print (f'before:{missed_labels(df)}')
df_wout_miss = subst_missed(mode[0], df)
# print (f'after:{missed_labels(df_wout_miss)}')

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn import utils
import time
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

**Обработка тестовой выборки:**

In [7]:
df_test = pd.read_csv('./features_test.csv', index_col='match_id')
df_test_wout_miss = subst_missed(mode[0], df_test)
X_test = df_test_wout_miss.as_matrix()


In [8]:
y_train = y.as_matrix()
X_train = df_wout_miss.as_matrix()

In [9]:
def cvKfold(X,y):
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    kf.get_n_splits(X, y)
    return kf


In [10]:
def GB_clf_new(X_train, y_train, trees):
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(learning_rate = 0.1, n_estimators=trees, verbose=True, random_state=241, max_depth = 3)
    kf = cvKfold(X_train, y_train)
    start_time = datetime.datetime.now()
    cvs = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)
    cvs_val = cvs.mean()
    print (f'esimators: {trees}\ncross-validation score: {cvs_val}')
    print (f'Time elapsed:{datetime.datetime.now() - start_time}')

    

In [11]:
trees_num = [10, 20, 30, 40, 50, 60, 80, 100, 200, 250, 400]
for tr in trees_num:
    GB_clf_new(X_train, y_train, tr)


      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
         1           1.3784           21.91s
         1           1.3783           22.74s
         1           1.3787           22.16s
         1           1.3784           22.27s
         2           1.3730           19.28s
         2           1.3727           19.77s
         2           1.3732           19.25s
         2           1.3728           19.48s
         3           1.3678           16.83s
         3           1.3678           16.88s
         3           1.3680           16.72s
         3           1.3677           16.81s
         4           1.3634           14.52s
         4           1.3631           14.72s
         4           1.3635           14.42s
         4           1.3631           14.61s
         5           1.3584           12.17s
         5           1.3590           12.19s
      

*Проверим другие способы заполнения пропущенных значений для 10 деревьев:*

In [12]:
for m in mode:
    df_wout_miss = subst_missed(m, df)
    X_train = df_wout_miss.as_matrix()
    print (f'{m}')
    GB_clf_new(X_train, y_train, 10)

zeros
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
         1           1.3783           22.14s
         1           1.3784           21.98s
         1           1.3787           22.39s
         1           1.3784           22.18s
         2           1.3727           19.80s
         2           1.3730           20.00s
         2           1.3732           20.21s
         2           1.3728           20.03s
         3           1.3678           17.30s
         3           1.3678           17.36s
         3           1.3680           17.76s
         3           1.3677           18.03s
         4           1.3631           16.35s
         4           1.3634           16.16s
         4           1.3635           16.42s
         4           1.3631           16.35s
         5           1.3584           13.44s
         5           1.3590           13.48s


Для данной задачи заполнение максимальным значением показывает лучший результат с точки зрения оценки качества

# Подход 2: Логистическая регрессия

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

In [14]:
def LR(X_train, y_train, C):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    regr = LogisticRegression(C = C,  random_state=241, n_jobs=-1)
    kf = cvKfold(X_train, y_train)
    start_time = datetime.datetime.now()
    cvs = cross_val_score(regr, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)
    cvs_val = cvs.mean()
    print (f'C_regularization: {C}\ncross-validation score: {cvs_val}')
    print (f'Time elapsed:{datetime.datetime.now() - start_time}')


In [15]:
C_values = np.power(10.0, np.arange(-5, 6))

for C in C_values:
    LR(X_train, y_train, C)

C_regularization: 1e-05
cross-validation score: 0.6945298784714038
Time elapsed:0:00:03.197561
C_regularization: 0.0001
cross-validation score: 0.7120012896775185
Time elapsed:0:00:03.310772
C_regularization: 0.001
cross-validation score: 0.7166125177369221
Time elapsed:0:00:05.375302
C_regularization: 0.01
cross-validation score: 0.7167538783462648
Time elapsed:0:00:07.126358
C_regularization: 0.1
cross-validation score: 0.7167265024959126
Time elapsed:0:00:07.557830
C_regularization: 1.0
cross-validation score: 0.7167226184902952
Time elapsed:0:00:08.409622
C_regularization: 10.0
cross-validation score: 0.7167225358878273
Time elapsed:0:00:08.272484
C_regularization: 100.0
cross-validation score: 0.7167224426942234
Time elapsed:0:00:08.282911
C_regularization: 1000.0
cross-validation score: 0.7167224300026744
Time elapsed:0:00:08.273436
C_regularization: 10000.0
cross-validation score: 0.7167224278889277
Time elapsed:0:00:08.479520
C_regularization: 100000.0
cross-validation score: 0

C_regularization: 0.1
cross-validation score: 0.7167265024959126

Данное значение немного меньше, чем оценка качества градиентного бустинга с 400 деревьями (0.719), но логистическая регрессия обучается намного быстрее по сравнению с градиентным бустингом: 8 секунд vs 15 минут.


In [77]:
df = pd.read_csv('./features.csv', index_col='match_id')
cols_num = len(df.columns)
cols_to_drop = list(df.columns.values[(cols_num - 6):])
df = df.drop(columns = cols_to_drop)
cols_category = ['lobby_type']
for p in range(0,5):
    cols_category.append('r%d_hero' % (p+1))
    cols_category.append('d%d_hero' % (p+1))
print(f'Категориальные признаки: {cols_category}')
df_heroes = df.loc[:,cols_category[1:]]
df_1 = df.drop(columns = cols_category)
df_wout_miss = subst_missed(mode[0], df_1)

Категориальные признаки: ['lobby_type', 'r1_hero', 'd1_hero', 'r2_hero', 'd2_hero', 'r3_hero', 'd3_hero', 'r4_hero', 'd4_hero', 'r5_hero', 'd5_hero']


In [64]:
df_wout_miss = subst_missed(mode[0], df)
X_train = df_wout_miss.as_matrix()

In [65]:
X_train.shape

(97230, 102)

In [18]:
for C in C_values:
    LR(X_train, y_train, C)

C_regularization: 1e-05
cross-validation score: 0.6951771056798932
Time elapsed:0:00:02.968290
C_regularization: 0.0001
cross-validation score: 0.7112865148335556
Time elapsed:0:00:03.308681
C_regularization: 0.001
cross-validation score: 0.7162096360086553
Time elapsed:0:00:05.706351
C_regularization: 0.01
cross-validation score: 0.7163757959125769
Time elapsed:0:00:07.242099
C_regularization: 0.1
cross-validation score: 0.7163509299301707
Time elapsed:0:00:07.965520
C_regularization: 1.0
cross-validation score: 0.7163472703068166
Time elapsed:0:00:08.061428
C_regularization: 10.0
cross-validation score: 0.7163470734968606
Time elapsed:0:00:08.284315
C_regularization: 100.0
cross-validation score: 0.7163470756461708
Time elapsed:0:00:08.487071
C_regularization: 1000.0
cross-validation score: 0.7163470904820135
Time elapsed:0:00:09.817256
C_regularization: 10000.0
cross-validation score: 0.7163471010747567
Time elapsed:0:00:10.448629
C_regularization: 100000.0
cross-validation score: 0

После удаления категориальных признаков качество практически не изменилось, местами ухудшилось. Некоторых пресонажей выбирают чаще, так как на них легче играть или они сильнее, а значит побед на таких персонажах будет больше.

In [78]:
heroes = df_heroes.as_matrix()
N = len(np.unique(heroes))
N_max = max(np.unique(heroes))
print(f'{N} различных идентификаторов героев')


108 различных идентификаторов героев


In [79]:
X_pick = np.zeros((df_heroes.shape[0], N_max))

for i, match_id in enumerate(df.index):
    for p in range(0,5):
        X_pick[i, df.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, df.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

print(X_pick.shape)
heroes_df = pd.DataFrame(X_pick, index=df.index)
X_train_all = pd.concat([df_wout_miss, heroes_df], axis=1)

(97230, 112)


In [80]:
X_train = X_train_all.as_matrix()

In [81]:
X_train.shape

(97230, 203)

In [82]:
for C in C_values:
    LR(X_train, y_train, C)

C_regularization: 1e-05
cross-validation score: 0.7148913479970942
Time elapsed:0:00:06.590475
C_regularization: 0.0001
cross-validation score: 0.7428571510322293
Time elapsed:0:00:05.901119
C_regularization: 0.001
cross-validation score: 0.7516014700448687
Time elapsed:0:00:09.858119
C_regularization: 0.01
cross-validation score: 0.7519047305979785
Time elapsed:0:00:13.274294
C_regularization: 0.1
cross-validation score: 0.7518617237159408
Time elapsed:0:00:14.189926
C_regularization: 1.0
cross-validation score: 0.7518560551551906
Time elapsed:0:00:14.695352
C_regularization: 10.0
cross-validation score: 0.7518554325401261
Time elapsed:0:00:15.235051
C_regularization: 100.0
cross-validation score: 0.7518553435406548
Time elapsed:0:00:15.136264
C_regularization: 1000.0
cross-validation score: 0.7518553308360634
Time elapsed:0:00:15.121190
C_regularization: 10000.0
cross-validation score: 0.7518553265919964
Time elapsed:0:00:15.336189
C_regularization: 100000.0
cross-validation score: 0

In [125]:
def LR_best(X_train, y_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    regr = LogisticRegression(C = 0.1,  random_state=241, n_jobs=-1)
    regr.fit(X_train, y_train)
    y_pred_test = regr.predict_proba(X_test)
    y_pred_train = regr.predict_proba(X_train)
    return y_pred_test, y_pred_train


In [126]:
def preprocessing_df(df):
    cols_num = len(df.columns)
#     cols_to_drop = list(df.columns.values[(cols_num - 6):])
#     df = df.drop(columns = cols_to_drop)
    cols_category = ['lobby_type']
    for p in range(0,5):
        cols_category.append('r%d_hero' % (p+1))
        cols_category.append('d%d_hero' % (p+1))
    df_heroes = df.loc[:,cols_category[1:]]
    df_1 = df.drop(columns = cols_category)
    df_wout_miss = subst_missed(mode[0], df_1)
    heroes = df_heroes.as_matrix()
    N = len(np.unique(heroes))
    N_max = max(np.unique(heroes))
    print(N_max)
    X_pick = np.zeros((df_heroes.shape[0], N_max))

    for i, match_id in enumerate(df.index):
        for p in range(0,5):
            X_pick[i, df.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, df.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
    heroes_df = pd.DataFrame(X_pick, index=df.index)
    print(X_pick.shape)
    X = pd.concat([df_wout_miss, heroes_df], axis=1)
    X = X.as_matrix()
    print(X.shape)
    return X

In [127]:
df_test = pd.read_csv('./features_test.csv', index_col='match_id')
X_test = preprocessing_df(df_test)


112
(17177, 112)
(17177, 203)


In [128]:
y_pred_test, y_pred_train = LR_best(X_train, y_train, X_test)

In [137]:
X_index = pd.DataFrame(df_test.index)
y_pred = pd.DataFrame(y_pred_test, columns=['radiant_lose', 'radiant_win'])
result = pd.concat([X_index, y_pred['radiant_win']], axis=1)
res_test = result.set_index('match_id')

In [139]:
res_test.head()

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
6,0.824757
7,0.758259
10,0.18716
13,0.859122
16,0.237957


In [140]:
res_test.to_csv('submission.csv')