#Part 1: Градиентный бустинг "в лоб"

## 1. Чтение данных, удаление признаков связанных с итогами матча

In [4]:
import pandas
features = pandas.read_csv('./features.csv', index_col='match_id')

print features.shape
features.head(15)

(97230, 108)


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16,2449,0,4,1974,3,63
5,1430284186,1,11,5,1961,1461,19,0,1,6,...,4,4,0,-43,1453,0,512,2038,0,63
8,1430293701,1,8,3,967,1136,7,1,0,8,...,6,3,0,10,1968,0,1536,1983,12,63
9,1430299335,7,35,5,2117,1252,16,0,0,6,...,3,4,0,-15,4079,1,1540,0,63,0
11,1430308974,1,17,5,1527,906,10,0,1,7,...,3,4,0,26,3071,0,0,1572,0,63
12,1430316105,7,15,5,1651,1060,14,0,1,10,...,3,2,0,16,2384,0,0,2038,0,63


In [5]:
features.drop(['duration', 'tower_status_radiant', 'tower_status_dire',
               'barracks_status_radiant', 'barracks_status_dire'],
              axis = 1, inplace = True)

## 2. Проверка выборки на наличие пропущенных данных.

In [6]:
filled_count = features.count()
filled_count = filled_count[filled_count != features.shape[0]]
print filled_count

first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
first_blood_player2            53243
radiant_bottle_time            81539
radiant_courier_time           96538
radiant_flying_courier_time    69751
radiant_first_ward_time        95394
dire_bottle_time               81087
dire_courier_time              96554
dire_flying_courier_time       71132
dire_first_ward_time           95404
dtype: int64


1. Признаки "first_blood_time", "first_blood_team", "first_blood_player1" - имеют одинаковое количество пропущенных данных. Логично предположить что данных нет в том случае, если "первая кровь" не случилось в течении 5 минут после старта игры.
2. Признак "first_blood_player2" так же относится к характеристикам события "первая кровь", но имеет большее количество пропущенных данных. Связано это наверное с тем что событие "первая кровь" не всегда включало второго игрока который помог это реализовать (первое убийство было совершено каким либо игроком без помощи его партнеров).
3. Признаки "radiant/dire_bottle_time", "radiant/dire_courier_time", "radiant/dire_flying_courier_time", "radiant/dire_first_ward_time" отсутствуют в части данных по причине того что те или иные предметы не были куплены за первые 5 минут одной из команд.

## 3. Замена пропущенных данных

In [7]:
features = features.fillna(0)
filled_count = features.count()
filled_count = filled_count[filled_count != features.shape[0]]
print filled_count

Series([], dtype: int64)


## 4. Какой стобец содержит целевую переменну?

In [8]:
print "radiant_win"

radiant_win


In [9]:
import numpy as np

target = np.array(features['radiant_win'])
features.drop('radiant_win', axis = 1, inplace = True)

In [10]:
features_gbm = features.as_matrix()
print features_gbm.shape
print target

(97230, 102)
[1 1 0 ..., 0 0 1]


##5. Обучение модели.

In [11]:
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
import time

kfold = KFold(n = features.shape[0], n_folds=5, shuffle=True)

ntrees = [10, 20, 30, 40]

features_gbm = features.as_matrix()

result_gbm = []

for ntree in ntrees:
    scores = []
    times  = []
    for train_index, test_index in kfold:
        features_train, features_test = features_gbm[train_index], features_gbm[test_index]
        target_train, target_test = target[train_index], target[test_index]
        
        start_time = time.time()
        clf = GradientBoostingClassifier(n_estimators=ntree, verbose = False)
        clf.fit(features_train, target_train)
        
        time_took = (time.time() - start_time)/60
        preds = clf.predict_proba(features_test)
        score = roc_auc_score(target_test, preds[:,1])
        
        scores.append(score)
        times.append(time_took)
        print "score - {0}, time - {1}".format(score, time_took)
    
    result_gbm.append({'ntree' : ntree, 'score' : np.mean(scores), 'time' : np.sum(times), 'mean_fit_time' : np.mean(times)})
    
print result_gbm

score - 0.663702906621, time - 0.224034599463
score - 0.661589690375, time - 0.218166700999
score - 0.66474918126, time - 0.2173655351
score - 0.667548474413, time - 0.295690866311
score - 0.662815561219, time - 0.277170666059
score - 0.681896781077, time - 0.476887313525
score - 0.682989455676, time - 0.455286749204
score - 0.680289008934, time - 0.448336815834
score - 0.685259504701, time - 0.419909799099
score - 0.677487510999, time - 0.45917503039
score - 0.688698387721, time - 0.662212685744
score - 0.691611635993, time - 0.653836234411
score - 0.689371715976, time - 0.642866635323
score - 0.69116240857, time - 0.653055516879
score - 0.685463870226, time - 0.668648417791
score - 0.693110718645, time - 0.867354186376
score - 0.696059935907, time - 0.804435066382
score - 0.693676387087, time - 0.772638968627
score - 0.696013071129, time - 0.810420831045
score - 0.690849736091, time - 0.882886219025
[{'score': 0.66408116277768658, 'mean_fit_time': 0.24648567358652751, 'time': 1.23242

#Part 2: Logistic Regression

## 1. Оценка качества логистической регрессии "в лоб".

In [12]:
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import time

features_lr = features.as_matrix()

kfold = KFold(n = features.shape[0], n_folds=5, shuffle=True)

C_s = [0.01, 0.1, 1, 10]
result_lr_1 = []

for C in C_s:
    scores = []
    times  = []
    for train_index, test_index in kfold:
        features_train, features_test = features_lr[train_index], features_lr[test_index]
        target_train, target_test = target[train_index], target[test_index]
        
        scaler = StandardScaler()
        
        features_train = scaler.fit_transform(features_train) 
        features_test  = scaler.transform(features_test)
        
        start_time = time.time()
        clf = LogisticRegression(C = C)
        clf.fit(features_train, target_train)
        time_took = (time.time() - start_time)/60
        
        preds = clf.predict_proba(features_test)
        score = roc_auc_score(target_test, preds[:,1])
        scores.append(score)
        times.append(time_took)
        print "score - {0}, time - {1}".format(score, time_took)
    
    result_lr_1.append({'l2_penalty' : C, 'score' : np.mean(scores), 'time' : np.sum(times), 'mean_fit_time' : np.mean(times)})
    
print result_lr_1

score - 0.715891979091, time - 0.04811668396
score - 0.724322570211, time - 0.0446375648181
score - 0.717251245067, time - 0.046207801501
score - 0.713066503779, time - 0.0481929659843
score - 0.711300606041, time - 0.0445393323898
score - 0.715806735465, time - 0.0507471521695
score - 0.724340033775, time - 0.0493570804596
score - 0.717231010372, time - 0.0531502008438
score - 0.713101165603, time - 0.0558500846227
score - 0.711228647528, time - 0.0630105177561
score - 0.71579513928, time - 0.0644244154294
score - 0.72434070097, time - 0.0513360500336
score - 0.71722725477, time - 0.0589195847511
score - 0.713101451452, time - 0.0502578655879
score - 0.7112186961, time - 0.047961684068
score - 0.715793814304, time - 0.0522996822993
score - 0.724341574677, time - 0.0430809696515
score - 0.71722664561, time - 0.0602185487747
score - 0.713101790236, time - 0.0507186333338
score - 0.711218325371, time - 0.0527316331863
[{'l2_penalty': 0.01, 'score': 0.71636658083796279, 'mean_fit_time': 0

## 2. Оценка качества логистической регресси без категориальных признаков - "почти в лоб".

In [13]:
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import time

features_lr = features.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero',
                             'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1).as_matrix()

kfold = KFold(n = features.shape[0], n_folds=5, shuffle=True)

C_s = [0.01, 0.1, 1, 10]
result_lr_2 = []

for C in C_s:
    scores = []
    times  = []
    for train_index, test_index in kfold:
        features_train, features_test = features_lr[train_index], features_lr[test_index]
        target_train, target_test = target[train_index], target[test_index]
        
        scaler = StandardScaler()
        features_train = scaler.fit_transform(features_train) 
        features_test  = scaler.transform(features_test)
        
        start_time = time.time()
        clf = LogisticRegression(C = C)
        clf.fit(features_train, target_train)
        time_took = (time.time() - start_time)/60
        
        preds = clf.predict_proba(features_test)
        score = roc_auc_score(target_test, preds[:,1])
        scores.append(score)
        times.append(time_took)
        print "score - {0}, time - {1}".format(score, time_took)
    
    result_lr_2.append({'l2_penalty' : C, 'score' : np.mean(scores), 'time' : np.sum(times), 'mean_fit_time' : np.mean(times)})
    
print result_lr_2

score - 0.713508225353, time - 0.0427851835887
score - 0.710538725417, time - 0.0426883180936
score - 0.713867742462, time - 0.0428750197093
score - 0.721033528209, time - 0.0460892995199
score - 0.723995609568, time - 0.0410315990448
score - 0.713394000003, time - 0.0458480517069
score - 0.710606476998, time - 0.0460638165474
score - 0.713825041683, time - 0.0476210355759
score - 0.721062729093, time - 0.042737086614
score - 0.723971774332, time - 0.0531951506933
score - 0.713383775688, time - 0.0437853336334
score - 0.710614043783, time - 0.0417170643806
score - 0.713822092968, time - 0.0442290147146
score - 0.721062321171, time - 0.0419942498207
score - 0.723966574784, time - 0.0489133318265
score - 0.713384114557, time - 0.0500036001205
score - 0.710614928075, time - 0.0409282326698
score - 0.713821431228, time - 0.0440425197283
score - 0.72106298868, time - 0.0417982300123
score - 0.723966569484, time - 0.0449955662092
[{'l2_penalty': 0.01, 'score': 0.71658876620178458, 'mean_fit_


## 3. Подсчет количества уникальных идентификаторов.

In [14]:
hero_features = ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero',
                             'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']

for feat in hero_features:
    print "Unique herois in {0} equal to {1}. MaxId - {2}, MinId - {3}".format(feat, features[feat].value_counts().shape, max(features[feat]), min(features[feat]))

Unique herois in r1_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in r2_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in r3_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in r4_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in r5_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in d1_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in d2_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in d3_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in d4_hero equal to (108,). MaxId - 112, MinId - 1
Unique herois in d5_hero equal to (108,). MaxId - 112, MinId - 1


## 4. Добавление категориальных признаков

In [15]:
import numpy as np

# N — количество различных героев в выборке
features_pick = np.zeros((features.shape[0], 112))

for i, match_id in enumerate(features.index):
    for p in xrange(5):
        features_pick[i, features.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        features_pick[i, features.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

## 5. Построение логистической регресси на основании новых данных.

In [16]:
import scipy.sparse
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import time
import pandas as pd

features_lr = features.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero',
                             'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1).as_matrix()
# Some fix to not scal features_pick
kfold = KFold(n = features_lr.shape[0], n_folds=5, shuffle=True)

C_s = [0.01, 0.1, 1, 10]
result_lr_3 = []

for C in C_s:
    scores = []
    times  = []
    for train_index, test_index in kfold:
        features_train, features_test = features_lr[train_index], features_lr[test_index]
        features_pick_train, features_pick_test = features_pick[train_index], features_pick[test_index]
        target_train, target_test = target[train_index], target[test_index]
        
        scaler = StandardScaler()
        features_train = scaler.fit_transform(features_train) 
        features_test  = scaler.transform(features_test)
        
        features_train = np.hstack([features_train, features_pick_train])
        features_test = np.hstack([features_test, features_pick_test])

        start_time = time.time()
        clf = LogisticRegression(C = C)
        clf.fit(features_train, target_train)
        time_took = (time.time() - start_time)/60
        
        preds = clf.predict_proba(features_test)
        score = roc_auc_score(target_test, preds[:,1])
        scores.append(score)
        times.append(time_took)
        print "score - {0}, time - {1}".format(score, time_took)
    
    result_lr_3.append({'l2_penalty' : C, 'score' : np.mean(scores), 'time' : np.sum(times), 'mean_fit_time' : np.mean(times)})
    
print result_lr_3

score - 0.749818444827, time - 0.0750632683436
score - 0.756459817724, time - 0.0658071160316
score - 0.750458193583, time - 0.0663942654928
score - 0.752478724356, time - 0.0616011023521
score - 0.749343151829, time - 0.0703068852425
score - 0.750122158016, time - 0.0889361182849
score - 0.756593761662, time - 0.0919057488441
score - 0.750642399036, time - 0.103681766987
score - 0.752369230506, time - 0.102071619034
score - 0.749865678632, time - 0.100662330786
score - 0.750125251563, time - 0.107108517488
score - 0.756571759801, time - 0.102887733777
score - 0.750629945993, time - 0.101262732347
score - 0.752317216425, time - 0.0963414510091
score - 0.749881501705, time - 0.0943530837695
score - 0.750127137355, time - 0.096683382988
score - 0.756569604625, time - 0.0987997810046
score - 0.750624977488, time - 0.115884880225
score - 0.752310601459, time - 0.115093433857
score - 0.749882942092, time - 0.111285499732
[{'l2_penalty': 0.01, 'score': 0.75171166646389243, 'mean_fit_time': 0

## 6. Построение прогноза лучшей модели (логистической регрессии на тестовую выборку)

In [17]:
# From scratch
l2_penalty = 0.1

import pandas
features_test = pandas.read_csv('./features_test.csv', index_col='match_id')
features_test = features_test.fillna(0)

In [18]:
import numpy as np

# N — количество различных героев в выборке
features_test_pick = np.zeros((features_test.shape[0], 112))

for i, match_id in enumerate(features_test.index):
    for p in xrange(5):
        features_test_pick[i, features_test.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        features_test_pick[i, features_test.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [19]:
features_test = features_test.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero',
                             'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1).as_matrix()
features_train_pick = features_pick

In [20]:
# So at this stage we have:
# features_test      -> test data
# features_test_pick -> test pick data
# =====================================
# features_train -> train data
# features_train_pick -> train_pick_data
features_train = features_lr

In [21]:
print "Train shape - {0}".format(features_train.shape)
print "Test shape - {0}".format(features_test.shape)

print "Train shape pick - {0}".format(features_train_pick.shape)
print "Test shape pick - {0}".format(features_test_pick.shape)

Train shape - (97230, 91)
Test shape - (17177, 91)
Train shape pick - (97230, 112)
Test shape pick - (17177, 112)


In [22]:
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train) 
features_test  = scaler.transform(features_test)
        
features_train = np.hstack([features_train, features_train_pick])
features_test = np.hstack([features_test, features_test_pick])

In [23]:
clf = LogisticRegression(C = l2_penalty)
clf.fit(features_train, target)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [24]:
prediction = clf.predict_proba(features_test)[:,1]

In [25]:
print prediction

[ 0.82393808  0.7529729   0.18805993 ...,  0.23449531  0.62500024
  0.4260529 ]


In [26]:
print max(prediction)

0.996371319615


In [27]:
print min(prediction)

0.00842943425679


In [28]:
test_ids

NameError: name 'test_ids' is not defined

In [None]:
result = pd.DataFrame({'match_id' : test_ids, 'radiant_win' : prediction})

In [None]:
result.to_csv("subm.csv", index=False)

Качество модели на тестовой выборке 0.75529

## Xgboost + new Features

In [29]:
import xgboost as xgb

In [30]:
dtrain = xgb.DMatrix(features_train, label=target)

In [31]:
dtest = xgb.DMatrix(features_test)


In [33]:
param = {'bst:max_depth':7, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
num_round = 100
bst = xgb.cv(plst, dtrain, num_round, nfold = 4)

NameError: name 'plst' is not defined