In [1]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
import datetime
import xgboost
import catboost as cb
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score, train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import data_helpers as dh
import data_iter1 as di
import submit_report as rep
import model_utils as mu

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# загрузка данных
events  = pd.read_csv('event_data_train.csv')
submissions = pd.read_csv('submissions_data_train.csv')

# разделение данных для обучения на train и test
_ = dh.split_events_submissions(events, submissions, test_size=0.3)
events_train_orig, events_test_orig, submissions_train_orig, submissions_test_orig = _

# подготовка данных
X_train, y_train = di.get_x_y(events_train_orig, submissions_train_orig)
X_test, y_test = di.get_x_y(events_test_orig, submissions_test_orig)

#загрузка данных для предсказаний
events_pred  = pd.read_csv('events_data_test.csv')
submissions_pred = pd.read_csv('submission_data_test.csv')
X_pred , _ = di.get_x_y(events_pred, submissions_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['date'] = pd.to_datetime(data.timestamp, unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['day'] = data.date.dt.date


In [3]:
#Подготовка данных для тренировки модели
X = X_train.append(X_test)
y = y_train.append(y_test)
user_min_time = events.groupby('user_id', as_index=False).agg({'timestamp':'min'})
X['user_id'] = X.index
X = X.merge(user_min_time[['user_id','timestamp']], how='outer')
X['date'] = pd.to_datetime(X['timestamp'], unit = 's')
X['day'] = X['date'].dt.date
X['weekday'] = X['date'].dt.dayofweek
#X = X.drop('user_id', axis=1)
X = X.drop('timestamp', axis=1)
X['year'] = X['date'].dt.year
X['month'] = X['date'].dt.month
X['hour'] = X['date'].dt.hour
X = X.drop('day', axis=1)
X = X.drop('date', axis=1)
X.shape

Defaulting to column, but this will raise an ambiguity error in a future version
  


(19234, 11)

In [4]:
#подготовка данных для предсказаний
user_min_time_pred = events_pred.groupby('user_id', as_index=False).agg({'timestamp':'min'})
X_pred['user_id'] = X_pred.index
X_pred = X_pred.merge(user_min_time_pred[['user_id','timestamp']], how='outer')
X_pred['date'] = pd.to_datetime(X_pred['timestamp'], unit = 's')
X_pred['day'] = X_pred['date'].dt.date
X_pred['weekday'] = X_pred['date'].dt.dayofweek
X_pred = X_pred.drop('timestamp', axis=1)
X_pred['year'] = X_pred['date'].dt.year
X_pred['month'] = X_pred['date'].dt.month
X_pred['hour'] = X_pred['date'].dt.hour
X_pred = X_pred.drop('day', axis=1)
X_pred = X_pred.drop('date', axis=1)
X_pred.shape

Defaulting to column, but this will raise an ambiguity error in a future version
  after removing the cwd from sys.path.


(6184, 11)

In [5]:
#данные для OneHotEncoder
year = [2015, 2016, 2017, 2018, 2019]
month = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
weekday = [0, 1, 2, 3, 4, 5, 6]
hour = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]

In [6]:
for i in year:
    X['year='+str(i)] = (X['year']==i).astype('float')
    X_pred['year='+str(i)] = (X_pred['year']==i).astype('float')
for i in month:
    X['month='+str(i)] = (X['month']==i).astype('float')
    X_pred['month='+str(i)] = (X_pred['month']==i).astype('float')
for i in weekday:
    X['weekday='+str(i)] = (X['weekday']==i).astype('float')
    X_pred['weekday='+str(i)] = (X_pred['weekday']==i).astype('float')
for i in hour:
    X['hour='+str(i)] = (X['hour']==i).astype('float')
    X_pred['hour='+str(i)] = (X_pred['hour']==i).astype('float')
X.head()

Unnamed: 0,correct,wrong,discovered,passed,started_attempt,viewed,user_id,weekday,year,month,...,hour=14,hour=15,hour=16,hour=17,hour=18,hour=19,hour=20,hour=21,hour=22,hour=23
0,0.0,0.0,1,0,0,1,1,4,2016,9,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,0.0,9,9,2,9,2,2,2017,12,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,4.0,15,15,4,20,3,0,2015,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,2.0,1,1,0,1,5,4,2016,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1,1,0,1,7,2,2018,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#data = events[events['action']=='passed']
#ohe = OneHotEncoder(sparse=False)
#new_ohe_features = ohe.fit_transform(events['step_id'].unique().reshape(-1,1))
#tmp = pd.DataFrame(new_ohe_features, columns=['step='+str(i) for i in range(new_ohe_features.shape[1])])
#data = pd.concat([data, tmp], axis=1)
#data.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
X = X.drop('year',axis=1)
X = X.drop('month',axis=1)
X = X.drop('weekday',axis=1)
X = X.drop('hour',axis=1)
X_pred = X_pred.drop('year',axis=1)
X_pred = X_pred.drop('month',axis=1)
X_pred = X_pred.drop('weekday',axis=1)
X_pred = X_pred.drop('hour',axis=1)
X = X.set_index('user_id')
X_pred = X_pred.set_index('user_id')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2, 
                            min_samples_leaf=10, min_samples_split=10, 
                            class_weight='balanced')
rf.fit(X_train, y_train)
pred_proba = rf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)
# должны получить roc 0.92  +- 0.02

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=10,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

roc на test 0.9224385342440852


In [9]:
# важность фич
fimp = mu.get_feature_importances_df(rf.feature_importances_, X_train.columns)
fimp.head(15)

Unnamed: 0,weight
correct,0.288726
passed,0.157939
discovered,0.149019
viewed,0.132077
wrong,0.116802
started_attempt,0.086507
month=6,0.012327
year=2015,0.003739
year=2017,0.003316
year=2016,0.003204


In [10]:
# значение к метрике на кроссвалидации коррелирует к метрике на степике
rfcv = RandomForestClassifier(**rf.get_params())
cv_scores = cross_val_score(rfcv, X_train, y_train, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)
print ('mean score', mean_cv_scores)

mean score 0.9282900525460018


In [11]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
pred_proba = log_reg.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

roc на test 0.9170485920432623


In [12]:
neigh = KNeighborsClassifier(n_neighbors=37)
neigh.fit(X_train, y_train)
pred_proba = neigh.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=37, p=2,
           weights='uniform')

roc на test 0.9144596209980518


In [13]:
y_train_int = y_train.map(lambda x: 1 if (x==True) else 0)
y_test_int = y_test.map(lambda x: 1 if (x==True) else 0)
cb_clf = cb.CatBoostClassifier()
cb_clf.fit(X_train, y_train_int)
pred_proba = cb_clf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)

Learning rate set to 0.035658
0:	learn: 0.6440985	total: 160ms	remaining: 2m 40s
1:	learn: 0.6092130	total: 234ms	remaining: 1m 56s
2:	learn: 0.5784963	total: 300ms	remaining: 1m 39s
3:	learn: 0.5499355	total: 362ms	remaining: 1m 30s
4:	learn: 0.5268833	total: 431ms	remaining: 1m 25s
5:	learn: 0.4983255	total: 496ms	remaining: 1m 22s
6:	learn: 0.4711596	total: 563ms	remaining: 1m 19s
7:	learn: 0.4496121	total: 630ms	remaining: 1m 18s
8:	learn: 0.4376088	total: 695ms	remaining: 1m 16s
9:	learn: 0.4248111	total: 757ms	remaining: 1m 14s
10:	learn: 0.4153851	total: 824ms	remaining: 1m 14s
11:	learn: 0.4079270	total: 865ms	remaining: 1m 11s
12:	learn: 0.3976732	total: 927ms	remaining: 1m 10s
13:	learn: 0.3894449	total: 1.01s	remaining: 1m 10s
14:	learn: 0.3830097	total: 1.07s	remaining: 1m 10s
15:	learn: 0.3753496	total: 1.14s	remaining: 1m 9s
16:	learn: 0.3678521	total: 1.2s	remaining: 1m 9s
17:	learn: 0.3618150	total: 1.27s	remaining: 1m 9s
18:	learn: 0.3569087	total: 1.33s	remaining: 1m 

161:	learn: 0.2626875	total: 11.4s	remaining: 58.8s
162:	learn: 0.2626521	total: 11.4s	remaining: 58.7s
163:	learn: 0.2626198	total: 11.5s	remaining: 58.5s
164:	learn: 0.2625959	total: 11.5s	remaining: 58.3s
165:	learn: 0.2625611	total: 11.6s	remaining: 58.1s
166:	learn: 0.2625205	total: 11.6s	remaining: 58s
167:	learn: 0.2624002	total: 11.7s	remaining: 57.8s
168:	learn: 0.2622534	total: 11.7s	remaining: 57.7s
169:	learn: 0.2622039	total: 11.8s	remaining: 57.6s
170:	learn: 0.2621121	total: 11.9s	remaining: 57.5s
171:	learn: 0.2620240	total: 11.9s	remaining: 57.3s
172:	learn: 0.2619559	total: 12s	remaining: 57.3s
173:	learn: 0.2618973	total: 12.1s	remaining: 57.3s
174:	learn: 0.2618045	total: 12.1s	remaining: 57.1s
175:	learn: 0.2617955	total: 12.2s	remaining: 56.9s
176:	learn: 0.2617545	total: 12.2s	remaining: 56.8s
177:	learn: 0.2617047	total: 12.3s	remaining: 56.7s
178:	learn: 0.2616401	total: 12.3s	remaining: 56.6s
179:	learn: 0.2615146	total: 12.4s	remaining: 56.6s
180:	learn: 0.26

320:	learn: 0.2531826	total: 20.7s	remaining: 43.8s
321:	learn: 0.2531318	total: 20.7s	remaining: 43.7s
322:	learn: 0.2530918	total: 20.8s	remaining: 43.6s
323:	learn: 0.2530035	total: 20.8s	remaining: 43.5s
324:	learn: 0.2529438	total: 20.9s	remaining: 43.4s
325:	learn: 0.2529175	total: 21s	remaining: 43.3s
326:	learn: 0.2528352	total: 21s	remaining: 43.2s
327:	learn: 0.2528093	total: 21.1s	remaining: 43.2s
328:	learn: 0.2527326	total: 21.1s	remaining: 43.1s
329:	learn: 0.2527225	total: 21.2s	remaining: 43s
330:	learn: 0.2526917	total: 21.2s	remaining: 42.9s
331:	learn: 0.2526130	total: 21.3s	remaining: 42.9s
332:	learn: 0.2525783	total: 21.4s	remaining: 42.8s
333:	learn: 0.2525287	total: 21.4s	remaining: 42.7s
334:	learn: 0.2524898	total: 21.5s	remaining: 42.6s
335:	learn: 0.2524372	total: 21.5s	remaining: 42.6s
336:	learn: 0.2523569	total: 21.6s	remaining: 42.5s
337:	learn: 0.2523199	total: 21.6s	remaining: 42.4s
338:	learn: 0.2522832	total: 21.7s	remaining: 42.3s
339:	learn: 0.2522

479:	learn: 0.2467777	total: 29.5s	remaining: 31.9s
480:	learn: 0.2467656	total: 29.5s	remaining: 31.9s
481:	learn: 0.2467542	total: 29.6s	remaining: 31.8s
482:	learn: 0.2467307	total: 29.6s	remaining: 31.7s
483:	learn: 0.2467214	total: 29.7s	remaining: 31.7s
484:	learn: 0.2466432	total: 29.7s	remaining: 31.6s
485:	learn: 0.2466201	total: 29.8s	remaining: 31.5s
486:	learn: 0.2465554	total: 29.9s	remaining: 31.5s
487:	learn: 0.2465391	total: 29.9s	remaining: 31.4s
488:	learn: 0.2465173	total: 30s	remaining: 31.3s
489:	learn: 0.2464666	total: 30s	remaining: 31.2s
490:	learn: 0.2464047	total: 30.1s	remaining: 31.2s
491:	learn: 0.2464007	total: 30.1s	remaining: 31.1s
492:	learn: 0.2463191	total: 30.2s	remaining: 31.1s
493:	learn: 0.2462924	total: 30.3s	remaining: 31s
494:	learn: 0.2462871	total: 30.3s	remaining: 30.9s
495:	learn: 0.2462620	total: 30.4s	remaining: 30.9s
496:	learn: 0.2462492	total: 30.5s	remaining: 30.8s
497:	learn: 0.2461764	total: 30.5s	remaining: 30.8s
498:	learn: 0.2461

638:	learn: 0.2423289	total: 42.8s	remaining: 24.2s
639:	learn: 0.2422773	total: 42.9s	remaining: 24.1s
640:	learn: 0.2422565	total: 43s	remaining: 24.1s
641:	learn: 0.2421904	total: 43.1s	remaining: 24s
642:	learn: 0.2420911	total: 43.2s	remaining: 24s
643:	learn: 0.2420861	total: 43.3s	remaining: 23.9s
644:	learn: 0.2420686	total: 43.4s	remaining: 23.9s
645:	learn: 0.2419909	total: 43.5s	remaining: 23.8s
646:	learn: 0.2419847	total: 43.6s	remaining: 23.8s
647:	learn: 0.2419816	total: 43.7s	remaining: 23.7s
648:	learn: 0.2419804	total: 43.8s	remaining: 23.7s
649:	learn: 0.2419783	total: 43.9s	remaining: 23.7s
650:	learn: 0.2419698	total: 44s	remaining: 23.6s
651:	learn: 0.2419656	total: 44.2s	remaining: 23.6s
652:	learn: 0.2419618	total: 44.3s	remaining: 23.5s
653:	learn: 0.2419559	total: 44.3s	remaining: 23.5s
654:	learn: 0.2419360	total: 44.4s	remaining: 23.4s
655:	learn: 0.2419153	total: 44.5s	remaining: 23.4s
656:	learn: 0.2418867	total: 44.6s	remaining: 23.3s
657:	learn: 0.241867

797:	learn: 0.2394187	total: 57.8s	remaining: 14.6s
798:	learn: 0.2394113	total: 57.9s	remaining: 14.6s
799:	learn: 0.2393412	total: 58s	remaining: 14.5s
800:	learn: 0.2393377	total: 58.1s	remaining: 14.4s
801:	learn: 0.2393040	total: 58.2s	remaining: 14.4s
802:	learn: 0.2392807	total: 58.3s	remaining: 14.3s
803:	learn: 0.2392748	total: 58.4s	remaining: 14.2s
804:	learn: 0.2392363	total: 58.5s	remaining: 14.2s
805:	learn: 0.2392286	total: 58.6s	remaining: 14.1s
806:	learn: 0.2391589	total: 58.7s	remaining: 14s
807:	learn: 0.2391546	total: 58.8s	remaining: 14s
808:	learn: 0.2391499	total: 58.9s	remaining: 13.9s
809:	learn: 0.2391088	total: 58.9s	remaining: 13.8s
810:	learn: 0.2390615	total: 59s	remaining: 13.8s
811:	learn: 0.2390550	total: 59.1s	remaining: 13.7s
812:	learn: 0.2390530	total: 59.2s	remaining: 13.6s
813:	learn: 0.2390278	total: 59.2s	remaining: 13.5s
814:	learn: 0.2389770	total: 59.4s	remaining: 13.5s
815:	learn: 0.2388939	total: 59.5s	remaining: 13.4s
816:	learn: 0.238893

956:	learn: 0.2368902	total: 1m 9s	remaining: 3.13s
957:	learn: 0.2368894	total: 1m 9s	remaining: 3.06s
958:	learn: 0.2368878	total: 1m 9s	remaining: 2.98s
959:	learn: 0.2368555	total: 1m 9s	remaining: 2.91s
960:	learn: 0.2368541	total: 1m 9s	remaining: 2.84s
961:	learn: 0.2368435	total: 1m 9s	remaining: 2.76s
962:	learn: 0.2368398	total: 1m 10s	remaining: 2.69s
963:	learn: 0.2368376	total: 1m 10s	remaining: 2.62s
964:	learn: 0.2368340	total: 1m 10s	remaining: 2.54s
965:	learn: 0.2368133	total: 1m 10s	remaining: 2.47s
966:	learn: 0.2368124	total: 1m 10s	remaining: 2.4s
967:	learn: 0.2367575	total: 1m 10s	remaining: 2.33s
968:	learn: 0.2367504	total: 1m 10s	remaining: 2.25s
969:	learn: 0.2367480	total: 1m 10s	remaining: 2.18s
970:	learn: 0.2367359	total: 1m 10s	remaining: 2.11s
971:	learn: 0.2366643	total: 1m 10s	remaining: 2.04s
972:	learn: 0.2366600	total: 1m 10s	remaining: 1.96s
973:	learn: 0.2366487	total: 1m 10s	remaining: 1.89s
974:	learn: 0.2366415	total: 1m 10s	remaining: 1.82s


<catboost.core.CatBoostClassifier at 0x1a7370b8>

roc на test 0.9251986995423082


In [14]:
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)
pred_proba = xgb.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

roc на test 0.9255613125257565


In [15]:
pred_proba_cb = cb_clf.predict_proba(X_test)
pred_proba_xgb = xgb.predict_proba(X_test)
pred_proba_rf = rf.predict_proba(X_test)
pred_proba_knn = neigh.predict_proba(X_test)
pred_proba_lr = log_reg.predict_proba(X_test)
pred_proba_mean = pred_proba_cb[:, 1]*0.1 + pred_proba_rf[:, 1]*0.3 + pred_proba_knn[:, 1]*0.1 \
+ pred_proba_lr[:, 1]*0.1 + pred_proba_xgb[:, 1]*0.4
roc_score = roc_auc_score(y_test, pred_proba_mean)
print('roc на test', roc_score)

roc на test 0.9259668727818506


In [16]:
#best RandomForest
rf = RandomForestClassifier(random_state=0)
parametrs_rf = {'max_features' : [9, 10],
            'n_estimators': [15, 20, 50, 100],
            'max_depth': [4, 5, 6],
            'min_samples_leaf': [1, 2, 5, 10],
            'min_samples_split': [10, 15, 20]}
grid_search_cv_clf = GridSearchCV(estimator=rf, param_grid=parametrs_rf, cv=5, n_jobs=-1, verbose=1, scoring='roc_auc')
grid_search_cv_clf.fit(X_train, y_train)
best_rf = grid_search_cv_clf.best_estimator_
pred_proba_rf = best_rf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba_rf[:, 1])
print('roc на test', roc_score)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   59.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  4.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': [9, 10], 'n_estimators': [15, 20, 50, 100], 'max_depth': [4, 5, 6], 'min_samples_leaf': [1, 2, 5, 10], 'min_samples_split': [10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

roc на test 0.9215281948250084


In [17]:
#best XGBoost
xgb_model = xgboost.XGBClassifier()
test_params = {
  "n_estimators" : [95, 100],
 "learning_rate"    : [0.01, 0.02, 0.03, 0.04],
 "max_depth"        : [3, 4, 5],
 "min_child_weight" : [ 1, 2, 4]
}
model = GridSearchCV(estimator = xgb_model,param_grid = test_params, n_jobs=-1, verbose=10, cv=5)
model.fit(X_train,y_train)
best_xgb = model.best_estimator_
pred_proba_xgb = best_xgb.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba_xgb[:, 1])
print('roc на test', roc_score)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  4

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [95, 100], 'learning_rate': [0.01, 0.02, 0.03, 0.04], 'max_depth': [3, 4, 5], 'min_child_weight': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

roc на test 0.9254793439084349


In [18]:
#best kNN
knn_model = KNeighborsClassifier()
test_params = {
  'n_neighbors' : [5, 10, 20, 30, 37, 40, 50, 60, 75, 90, 100]
}
model = GridSearchCV(estimator = knn_model,param_grid = test_params, n_jobs=-1, verbose=10, cv=5)
model.fit(X_train,y_train)
best_knn = model.best_estimator_
pred_proba_knn = best_knn.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba_knn[:, 1])
print('roc на test', roc_score)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [5, 10, 20, 30, 37, 40, 50, 60, 75, 90, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

roc на test 0.9144499846293972


In [20]:
#best Logistic Regression
lr_model = LogisticRegression()
test_params = {
  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] 
}
model = GridSearchCV(estimator = lr_model,param_grid = test_params, n_jobs=-1, verbose=10, cv=5)
model.fit(X_train,y_train)
best_lr = model.best_estimator_
pred_proba_lr = best_knn.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba_lr[:, 1])
print('roc на test', roc_score)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1190s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  24 out of  35 | elapsed:    1.4s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  28 out of  35 | elapsed:    1.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    1.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

roc на test 0.9144499846293972


In [21]:
pred_proba_cb = cb_clf.predict_proba(X_test)
pred_proba_mean = pred_proba_cb[:, 1]*0.4 + pred_proba_rf[:, 1]*0.1 + pred_proba_xgb[:, 1]*0.3 \
+ pred_proba_knn[:, 1]*0.1 + pred_proba_lr[:, 1]*0.1
roc_score = roc_auc_score(y_test, pred_proba_mean)
print('roc на test', roc_score)

roc на test 0.9253696558602921


In [22]:
SUBMIT_NUM = 15
pred_proba_cb = cb_clf.predict_proba(X_pred)
pred_proba_rf = best_rf.predict_proba(X_pred)
pred_proba_xgb = xgb.predict_proba(X_pred)
pred_proba_lr = best_lr.predict_proba(X_pred)
pred_proba_knn = best_knn.predict_proba(X_pred)

pred_proba = pred_proba_cb[:, 1]*0.4 + pred_proba_rf[:, 1]*0.1 + pred_proba_xgb[:, 1]*0.3 \
+ pred_proba_knn[:, 1]*0.1 + pred_proba_lr[:, 1]*0.1
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

Прогноз сохранен в файл  predict_2019-05-27_submit_15.csv
Распределение "вероятностей" модели


(0.00219, 0.103]    3452
(0.103, 0.202]       684
(0.202, 0.302]       674
(0.302, 0.401]       187
(0.401, 0.501]       195
(0.501, 0.601]       224
(0.601, 0.7]         142
(0.7, 0.8]            67
(0.8, 0.899]         121
(0.899, 0.999]       438
dtype: int64

In [25]:
pred_proba_cb_test = cb_clf.predict_proba(X_test)
pred_proba_rf_test = best_rf.predict_proba(X_test)
pred_proba_xgb_test = xgb.predict_proba(X_test)
pred_proba_lr_test = best_lr.predict_proba(X_test)
pred_proba_knn_test = best_knn.predict_proba(X_test)

In [8]:
X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit_transform(X_test)
X_pred_scaled = StandardScaler().fit_transform(X_pred)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [20]:
mlp = MLPClassifier(hidden_layer_sizes=(45,), activation='relu', max_iter=500)
mlp.fit(X_train_scaled,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(46,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [21]:
pred_proba_mlp = mlp.predict_proba(X_test_scaled)
roc_score = roc_auc_score(y_test, pred_proba_mlp[:, 1])
print('roc на test', roc_score)

roc на test 0.8901011736617538


In [None]:
mlp