In [1]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
import datetime
import xgboost
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score, train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.externals import joblib

import data_helpers as dh
import data_iter1 as di
import submit_report as rep
import model_utils as mu

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# загрузка данных
events  = pd.read_csv('event_data_train.csv')
submissions = pd.read_csv('submissions_data_train.csv')

# разделение данных для обучения на train и test
_ = dh.split_events_submissions(events, submissions, test_size=0.3)
events_train_orig, events_test_orig, submissions_train_orig, submissions_test_orig = _

# подготовка данных
X_train, y_train = di.get_x_y(events_train_orig, submissions_train_orig)
X_test, y_test = di.get_x_y(events_test_orig, submissions_test_orig)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['date'] = pd.to_datetime(data.timestamp, unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  data['day'] = data.date.dt.date


In [5]:
X = X_train.append(X_test)
y = y_train.append(y_test)
user_min_time = events.groupby('user_id', as_index=False).agg({'timestamp':'min'})
user_min_time.shape

(19234, 2)

In [6]:
X['user_id'] = X.index
X = X.merge(user_min_time[['user_id','timestamp']], how='outer')
X['date'] = pd.to_datetime(X['timestamp'], unit = 's')
X['day'] = X['date'].dt.date
X['weekday'] = X['date'].dt.dayofweek
X = X.drop('user_id', axis=1)
X = X.drop('timestamp', axis=1)
X['year'] = X['date'].dt.year
X['month'] = X['date'].dt.month
X['hour'] = X['date'].dt.hour
X.head()

Defaulting to column, but this will raise an ambiguity error in a future version
  


Unnamed: 0,correct,wrong,discovered,passed,started_attempt,viewed,day,date,weekday,year,month,hour
0,0.0,0.0,1,0,0,1,2016-09-02,2016-09-02 14:44:24,4,2016,9,14
1,2.0,0.0,9,9,2,9,2017-12-27,2017-12-27 14:02:44,2,2017,12,14
2,4.0,4.0,15,15,4,20,2015-06-15,2015-06-15 08:54:36,0,2015,6,8
3,2.0,2.0,1,1,0,1,2016-06-17,2016-06-17 09:46:49,4,2016,6,9
4,9.0,21.0,109,84,37,154,2016-12-01,2016-12-01 14:43:47,3,2016,12,14


In [7]:
ohe = OneHotEncoder(sparse=False)
new_features = ohe.fit_transform(X['year'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['year' + str(i) for i in range(new_features.shape[1])])
X = pd.concat([X, tmp], axis=1)
new_features = ohe.fit_transform(X['month'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['month' + str(i) for i in range(new_features.shape[1])])
X = pd.concat([X, tmp], axis=1)
new_features = ohe.fit_transform(X['hour'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['hour' + str(i) for i in range(new_features.shape[1])])
X = pd.concat([X, tmp], axis=1)
new_features = ohe.fit_transform(X['weekday'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['weekday' + str(i) for i in range(new_features.shape[1])])
X = pd.concat([X, tmp], axis=1)
X.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,correct,wrong,discovered,passed,started_attempt,viewed,day,date,weekday,year,...,hour21,hour22,hour23,weekday0,weekday1,weekday2,weekday3,weekday4,weekday5,weekday6
0,0.0,0.0,1,0,0,1,2016-09-02,2016-09-02 14:44:24,4,2016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2.0,0.0,9,9,2,9,2017-12-27,2017-12-27 14:02:44,2,2017,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,4.0,4.0,15,15,4,20,2015-06-15,2015-06-15 08:54:36,0,2015,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,2.0,1,1,0,1,2016-06-17,2016-06-17 09:46:49,4,2016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,9.0,21.0,109,84,37,154,2016-12-01,2016-12-01 14:43:47,3,2016,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
X = X.drop('day', axis=1)
X = X.drop('date', axis=1)
X = X.drop('weekday', axis=1)
X = X.drop('month', axis=1)
X = X.drop('year', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [9]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2, 
                            min_samples_leaf=10, min_samples_split=10, 
                            class_weight='balanced')
rf.fit(X_train, y_train)
pred_proba = rf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)
# должны получить roc 0.92  +- 0.02

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=10,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

roc на test 0.9298772125116344


In [10]:
# важность фич
fimp = mu.get_feature_importances_df(rf.feature_importances_, X_train.columns)
fimp.head(15)

Unnamed: 0,weight
correct,0.307779
passed,0.158638
discovered,0.128893
viewed,0.128383
wrong,0.101947
started_attempt,0.093377
hour,0.013533
month5,0.011646
year0,0.003602
year3,0.003575


In [11]:
# значение к метрике на кроссвалидации коррелирует к метрике на степике

rfcv = RandomForestClassifier(**rf.get_params())

cv_scores = cross_val_score(rfcv, X_train, y_train, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)
print ('mean score', mean_cv_scores)

mean score 0.9255371019308377


In [22]:
SUBMIT_NUM = 6

events_pred  = pd.read_csv('events_data_test.csv')
submissions_pred = pd.read_csv('submission_data_test.csv')
X_pred , _ = di.get_x_y(events_pred, submissions_pred)

In [23]:
user_min_time_pred = events_pred.groupby('user_id', as_index=False).agg({'timestamp':'min'})
X_pred['user_id'] = X_pred.index
X_pred = X_pred.merge(user_min_time_pred[['user_id','timestamp']], how='outer')
X_pred['date'] = pd.to_datetime(X_pred['timestamp'], unit = 's')
X_pred['day'] = X_pred['date'].dt.date
X_pred['weekday'] = X_pred['date'].dt.dayofweek
X_pred = X_pred.drop('user_id', axis=1)
X_pred = X_pred.drop('timestamp', axis=1)
X_pred['year'] = X_pred['date'].dt.year
X_pred['month'] = X_pred['date'].dt.month
X_pred['hour'] = X_pred['date'].dt.hour
X_pred.head()

Defaulting to column, but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,correct,wrong,discovered,passed,started_attempt,viewed,day,date,weekday,year,month,hour
0,0.0,0.0,1,1,0,1,2018-06-18,2018-06-18 14:21:47,0,2018,6,14
1,0.0,0.0,1,1,0,1,2019-01-07,2019-01-07 19:30:07,0,2019,1,19
2,0.0,0.0,2,2,0,6,2018-08-18,2018-08-18 09:49:16,5,2018,8,9
3,1.0,0.0,11,9,4,14,2018-09-19,2018-09-19 09:14:34,2,2018,9,9
4,29.0,36.0,70,70,35,105,2018-07-31,2018-07-31 10:52:34,1,2018,7,10


In [24]:
ohe = OneHotEncoder(sparse=False)
new_features = ohe.fit_transform(X_pred['year'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['year' + str(i) for i in range(new_features.shape[1])])
X_pred = pd.concat([X_pred, tmp], axis=1)
new_features = ohe.fit_transform(X_pred['month'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['month' + str(i) for i in range(new_features.shape[1])])
X_pred = pd.concat([X_pred, tmp], axis=1)
new_features = ohe.fit_transform(X_pred['hour'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['hour' + str(i) for i in range(new_features.shape[1])])
X_pred = pd.concat([X_pred, tmp], axis=1)
new_features = ohe.fit_transform(X_pred['weekday'].values.reshape(-1,1))
tmp = pd.DataFrame(new_features, columns=['weekday' + str(i) for i in range(new_features.shape[1])])
X_pred = pd.concat([X_pred, tmp], axis=1)
X_pred.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,correct,wrong,discovered,passed,started_attempt,viewed,day,date,weekday,year,...,hour21,hour22,hour23,weekday0,weekday1,weekday2,weekday3,weekday4,weekday5,weekday6
0,0.0,0.0,1,1,0,1,2018-06-18,2018-06-18 14:21:47,0,2018,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1,1,0,1,2019-01-07,2019-01-07 19:30:07,0,2019,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,2,2,0,6,2018-08-18,2018-08-18 09:49:16,5,2018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,11,9,4,14,2018-09-19,2018-09-19 09:14:34,2,2018,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,29.0,36.0,70,70,35,105,2018-07-31,2018-07-31 10:52:34,1,2018,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [25]:
X_pred = X_pred.drop('day', axis=1)
X_pred = X_pred.drop('date', axis=1)
X_pred = X_pred.drop('weekday', axis=1)
X_pred = X_pred.drop('month', axis=1)
X_pred = X_pred.drop('year', axis=1)
X_pred.shape

(6184, 49)

In [26]:
pred_proba = rf.predict_proba(X_pred)[:, 1]
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

ValueError: Number of features of the model must match the input. Model n_features is 54 and input n_features is 49 

In [88]:
rf = RandomForestClassifier(random_state=0)
parametrs_rf = {'criterion': ['gini', 'entropy'],
            'max_features' : [1, 2, 3, 4, 5, 6, 7],
            'n_estimators': [18, 19, 20, 21],
            'max_depth': [7, 8, 9, 10],
            'min_samples_leaf': [1, 2, 3, 4],
            'min_samples_split': [6, 7, 8]}
grid_search_cv_clf = GridSearchCV(estimator=rf, param_grid=parametrs_rf, cv=5, n_jobs=-1, verbose=1)
grid_search_cv_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 2688 candidates, totalling 13440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 11242 tasks      |

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_features': [1, 2, 3, 4, 5, 6, 7], 'n_estimators': [18, 19, 20, 21], 'max_depth': [7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4], 'min_samples_split': [6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [89]:
grid_search_cv_clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=7, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=21, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [90]:
best_rf = grid_search_cv_clf.best_estimator_
pred_proba = best_rf.predict_proba(X_test_scaled)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)

roc на test 0.9330574833464914


In [83]:
# значение к метрике на кроссвалидации коррелирует к метрике на степике

rfcv = RandomForestClassifier(**best_rf.get_params())
cv_scores = cross_val_score(rfcv, X_train_scaled, y_train, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)
print ('mean score', mean_cv_scores)

mean score 0.9233465900694563


In [85]:
SUBMIT_NUM = 5

events_pred  = pd.read_csv('events_data_test.csv')
submissions_pred = pd.read_csv('submission_data_test.csv')
X_pred , _ = di.get_x_y(events_pred, submissions_pred)
X_pred_scaled = StandardScaler().fit_transform(X_pred)
pred_proba = best_rf.predict_proba(X_pred_scaled)[:, 1]
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Прогноз сохранен в файл  ./reports/predict_2019-05-21_submit_5.csv
Распределение "вероятностей" модели


(-0.001, 0.1]    3362
(0.1, 0.2]        401
(0.2, 0.3]        735
(0.3, 0.4]        391
(0.4, 0.5]        208
(0.5, 0.6]        169
(0.6, 0.7]        187
(0.7, 0.8]        144
(0.8, 0.9]        124
(0.9, 1.0]        463
dtype: int64

In [44]:
X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [66]:
xgb_model = xgboost.XGBClassifier()
test_params = {
  "n_estimators" : [125, 130, 135],
 "learning_rate"    : [0.02, 0.04, 0.05],
 "max_depth"        : [3, 4],
 "min_child_weight" : [ 1, 2, 4]
}
model = GridSearchCV(estimator = xgb_model,param_grid = test_params, n_jobs=-1, verbose=10, cv=5)

In [67]:
model.fit(X_train_scaled,y_train)
print (model.best_params_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [125, 130, 135], 'learning_rate': [0.02, 0.04, 0.05], 'max_depth': [3, 4], 'min_child_weight': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

{'learning_rate': 0.04, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 130}


In [68]:
best_xgb = model.best_estimator_
pred_proba = best_xgb.predict_proba(X_test_scaled)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)

roc на test 0.9327440493248386


In [54]:
best_xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [91]:
X_train.head()

Unnamed: 0_level_0,correct,wrong,discovered,passed,started_attempt,viewed,day
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.0,1,0,0,1,1
2,2.0,0.0,9,9,2,9,1
3,4.0,4.0,15,15,4,20,1
5,2.0,2.0,1,1,0,1,1
7,0.0,0.0,1,1,0,1,1
