In [1]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
import datetime
import xgboost
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score, train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.externals import joblib

import data_helpers as dh
import data_iter1 as di
import submit_report as rep
import model_utils as mu

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# загрузка данных
events  = pd.read_csv('event_data_train.csv')
submissions = pd.read_csv('submissions_data_train.csv')

# разделение данных для обучения на train и test
_ = dh.split_events_submissions(events, submissions, test_size=0.3)
events_train_orig, events_test_orig, submissions_train_orig, submissions_test_orig = _

# подготовка данных
X_train, y_train = di.get_x_y(events_train_orig, submissions_train_orig)
X_test, y_test = di.get_x_y(events_test_orig, submissions_test_orig)

#загрузка данных для предсказаний
events_pred  = pd.read_csv('events_data_test.csv')
submissions_pred = pd.read_csv('submission_data_test.csv')
X_pred , _ = di.get_x_y(events_pred, submissions_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['date'] = pd.to_datetime(data.timestamp, unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['day'] = data.date.dt.date


In [3]:
#Подготовка данных для тренировки модели
X = X_train.append(X_test)
y = y_train.append(y_test)
user_min_time = events.groupby('user_id', as_index=False).agg({'timestamp':'min'})
X['user_id'] = X.index
X = X.merge(user_min_time[['user_id','timestamp']], how='outer')
X['date'] = pd.to_datetime(X['timestamp'], unit = 's')
X['day'] = X['date'].dt.date
X['weekday'] = X['date'].dt.dayofweek
#X = X.drop('user_id', axis=1)
X = X.drop('timestamp', axis=1)
X['year'] = X['date'].dt.year
X['month'] = X['date'].dt.month
X['hour'] = X['date'].dt.hour
X = X.drop('day', axis=1)
X = X.drop('date', axis=1)
X.shape

Defaulting to column, but this will raise an ambiguity error in a future version
  


(19234, 11)

In [4]:
#подготовка данных для предсказаний
user_min_time_pred = events_pred.groupby('user_id', as_index=False).agg({'timestamp':'min'})
X_pred['user_id'] = X_pred.index
X_pred = X_pred.merge(user_min_time_pred[['user_id','timestamp']], how='outer')
X_pred['date'] = pd.to_datetime(X_pred['timestamp'], unit = 's')
X_pred['day'] = X_pred['date'].dt.date
X_pred['weekday'] = X_pred['date'].dt.dayofweek
X_pred = X_pred.drop('timestamp', axis=1)
X_pred['year'] = X_pred['date'].dt.year
X_pred['month'] = X_pred['date'].dt.month
X_pred['hour'] = X_pred['date'].dt.hour
X_pred = X_pred.drop('day', axis=1)
X_pred = X_pred.drop('date', axis=1)
X_pred.shape

Defaulting to column, but this will raise an ambiguity error in a future version
  after removing the cwd from sys.path.


(6184, 11)

In [22]:
#данные для OneHotEncoder
year = [2015, 2016, 2017, 2018, 2019]
month = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
weekday = [0, 1, 2, 3, 4, 5, 6]
hour = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]

In [19]:
for i in year:
    X['year='+str(i)] = (X['year']==i).astype('float')
    X_pred['year='+str(i)] = (X_pred['year']==i).astype('float')
X.head()

Unnamed: 0,correct,wrong,discovered,passed,started_attempt,viewed,user_id,weekday,year,month,hour,year=2015,year=2016,year=2017,year=2018,year=2019
0,4.0,4.0,15,15,4,20,3,0,2015,6,8,1.0,0.0,0.0,0.0,0.0
1,2.0,2.0,1,1,0,1,5,4,2016,6,9,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1,1,0,1,7,2,2018,3,12,0.0,0.0,0.0,1.0,0.0
3,9.0,21.0,109,84,37,154,8,3,2016,12,14,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,3,3,0,4,9,5,2017,1,11,0.0,0.0,1.0,0.0,0.0


In [23]:
for i in month:
    X['month='+str(i)] = (X['month']==i).astype('float')
    X_pred['month='+str(i)] = (X_pred['month']==i).astype('float')
for i in weekday:
    X['weekday='+str(i)] = (X['weekday']==i).astype('float')
    X_pred['weekday='+str(i)] = (X_pred['weekday']==i).astype('float')
for i in hour:
    X['hour='+str(i)] = (X['hour']==i).astype('float')
    X_pred['hour='+str(i)] = (X_pred['hour']==i).astype('float')
X.head()

Unnamed: 0,correct,wrong,discovered,passed,started_attempt,viewed,user_id,weekday,year,month,...,hour=14,hour=15,hour=16,hour=17,hour=18,hour=19,hour=20,hour=21,hour=22,hour=23
0,4.0,4.0,15,15,4,20,3,0,2015,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,2.0,1,1,0,1,5,4,2016,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1,1,0,1,7,2,2018,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.0,21.0,109,84,37,154,8,3,2016,12,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,3,3,0,4,9,5,2017,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
X = X.set_index('user_id')
X_pred = X_pred.set_index('user_id')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [32]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2, 
                            min_samples_leaf=10, min_samples_split=10, 
                            class_weight='balanced')
rf.fit(X_train, y_train)
pred_proba = rf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)
# должны получить roc 0.92  +- 0.02

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=10,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

roc на test 0.9202084723781737


In [33]:
# важность фич
fimp = mu.get_feature_importances_df(rf.feature_importances_, X_train.columns)
fimp.head(15)


Unnamed: 0,weight
correct,0.297533
discovered,0.167586
passed,0.151968
wrong,0.12514
started_attempt,0.090019
viewed,0.078516
hour,0.012533
month,0.012355
month=6,0.008242
weekday,0.007087


In [34]:
# значение к метрике на кроссвалидации коррелирует к метрике на степике
rfcv = RandomForestClassifier(**rf.get_params())
cv_scores = cross_val_score(rfcv, X_train, y_train, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)
print ('mean score', mean_cv_scores)

mean score 0.9267657501377305


In [36]:
SUBMIT_NUM = 9
pred_proba = rf.predict_proba(X_pred)[:, 1]
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

Прогноз сохранен в файл  predict_2019-05-22_submit_9.csv
Распределение "вероятностей" модели


(-0.000228, 0.1]    3221
(0.1, 0.2]           223
(0.2, 0.3]           369
(0.3, 0.399]         572
(0.399, 0.499]       348
(0.499, 0.599]       234
(0.599, 0.698]       292
(0.698, 0.798]       334
(0.798, 0.897]       130
(0.897, 0.997]       461
dtype: int64

In [37]:
rf = RandomForestClassifier(random_state=0)
parametrs_rf = {'criterion': ['gini', 'entropy'],
            'max_features' : [9, 10],
            'n_estimators': [15, 20, 50],
            'max_depth': [4, 5, 6],
            'min_samples_leaf': [1, 2, 3],
            'min_samples_split': [15, 20]}
grid_search_cv_clf = GridSearchCV(estimator=rf, param_grid=parametrs_rf, cv=5, n_jobs=-1, verbose=1, scoring='roc_auc')
grid_search_cv_clf.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  7.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_features': [9, 10], 'n_estimators': [15, 20, 50], 'max_depth': [4, 5, 6], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [38]:
best_rf = grid_search_cv_clf.best_estimator_
best_rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [39]:
pred_proba = best_rf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)

roc на test 0.9174993543382453


In [58]:
# значение к метрике на кроссвалидации коррелирует к метрике на степике

rfcv = RandomForestClassifier(**best_rf.get_params())
cv_scores = cross_val_score(rfcv, X_train, y_train, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)
print ('mean score', mean_cv_scores)

mean score 0.9266923491547825


In [59]:
SUBMIT_NUM = 10

pred_proba = best_rf.predict_proba(X_pred)[:, 1]
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

Прогноз сохранен в файл  predict_2019-05-22_submit_7.csv
Распределение "вероятностей" модели


(-0.000881, 0.1]    3419
(0.1, 0.2]           478
(0.2, 0.3]           878
(0.3, 0.4]           126
(0.4, 0.5]           280
(0.5, 0.6]           303
(0.6, 0.7]            69
(0.7, 0.8]            34
(0.8, 0.9]           149
(0.9, 1.0]           448
dtype: int64

In [61]:
X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit_transform(X_test)
X_pred_scaled = StandardScaler().fit_transform(X_pred)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [68]:
xgb_model = xgboost.XGBClassifier()
test_params = {
  "n_estimators" : [90, 100, 110],
 "learning_rate"    : [0.02, 0.04, 0.05],
 "max_depth"        : [3, 4, 5],
 "min_child_weight" : [ 1, 2, 4]
}
model = GridSearchCV(estimator = xgb_model,param_grid = test_params, n_jobs=-1, verbose=10, cv=5)

In [69]:
model.fit(X_train,y_train)
print (model.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   56.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [90, 100, 110], 'learning_rate': [0.02, 0.04, 0.05], 'max_depth': [3, 4, 5], 'min_child_weight': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

{'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100}


In [71]:
best_xgb = model.best_estimator_
pred_proba = best_xgb.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)

roc на test 0.9320736692623023


In [72]:
best_xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=5, min_child_weight=4, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [73]:
SUBMIT_NUM = 8

pred_proba = best_xgb.predict_proba(X_pred)[:, 1]
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

Прогноз сохранен в файл  predict_2019-05-22_submit_8.csv
Распределение "вероятностей" модели


(0.00345, 0.104]    3690
(0.104, 0.203]      1048
(0.203, 0.302]       295
(0.302, 0.401]       244
(0.401, 0.5]         189
(0.5, 0.599]          77
(0.599, 0.698]        29
(0.698, 0.797]        93
(0.797, 0.897]        81
(0.897, 0.996]       438
dtype: int64

user_id
1        2016
2        2017
5        2016
7        2018
9        2017
14       2015
18       2015
20       2015
22       2017
26       2016
28       2015
30       2016
31       2017
32       2017
33       2015
36       2018
38       2015
44       2017
46       2015
50       2015
52       2015
53       2018
54       2017
57       2018
59       2016
60       2017
64       2017
65       2015
67       2017
68       2015
         ... 
26696    2018
26698    2018
26699    2018
26701    2018
26706    2018
26710    2018
26713    2018
26720    2018
26721    2018
26724    2018
26729    2018
26730    2018
26735    2018
26736    2018
26738    2018
26743    2018
26744    2018
26745    2018
26748    2018
26758    2018
26768    2018
26770    2018
26775    2018
26780    2018
26785    2019
26791    2018
26795    2018
26796    2018
26799    2018
26800    2018
Name: year, Length: 25418, dtype: int64

Index(['correct', 'wrong', 'discovered', 'passed', 'started_attempt', 'viewed',
       'weekday', 'year', 'month', 'hour', 'year0', 'year1', 'year2', 'year3',
       'month0', 'month1', 'month2', 'month3', 'month4', 'month5', 'month6',
       'month7', 'month8', 'month9', 'month10', 'month11', 'hour0', 'hour1',
       'hour2', 'hour3', 'hour4', 'hour5', 'hour6', 'hour7', 'hour8', 'hour9',
       'hour10', 'hour11', 'hour12', 'hour13', 'hour14', 'hour15', 'hour16',
       'hour17', 'hour18', 'hour19', 'hour20', 'hour21', 'hour22', 'hour23',
       'weekday0', 'weekday1', 'weekday2', 'weekday3', 'weekday4', 'weekday5',
       'weekday6', 'correct', 'passed', 'discovered', 'wrong', 'viewed',
       'started_attempt', 'hour', 'month', 'weekday', 'year'],
      dtype='object')