# Supervised Learning Using Ensemble Methods

In [107]:
import pandas as pd
import pickle
import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib

import lightgbm as lgb

from preprocessor import make_preprocessor

In [4]:
mailout_train_clean = pd.read_csv('../data/Udacity_MAILOUT_052018_TRAIN_clean.csv', sep=';')
mailout_train_clean.shape

(42962, 383)

In [6]:
lnr = mailout_train_clean.LNR
y = mailout_train_clean.RESPONSE
X = mailout_train_clean.drop(['RESPONSE', 'LNR'], axis=1)

In [None]:
# Load feature info
feat_info = pd.read_csv('../features.csv')
feat_info.set_index('attribute', inplace=True)

In [13]:
# Make the preprocessor
numerical_columns = feat_info[feat_info.type == 'numeric'].index.drop(['GEBURTSJAHR','KBA13_ANZAHL_PKW'])
categorical_columns = X.columns.drop(numerical_columns)
preprocessor = make_preprocessor(numerical_columns, categorical_columns)

X_processed = preprocessor.fit_transform(X)

## Gradient Boost

In [102]:
clf = GradientBoostingClassifier()
clf.get_params()

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [103]:
param_grid = {'learning_rate': [.001],
              'max_depth': [5],
              'random_state': [42]}

start_time = time.time()

grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', cv=5)
grid.fit(X_processed, y)

elapsed_time = (time.time() - start_time) / 60
print('Elapsed computation time: {:.3f} mins'.format(elapsed_time))

Elapsed computation time: 14.471 mins


In [104]:
print(grid.best_score_)
print(grid.best_estimator_)

0.7652356030850433
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.001, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


In [105]:
pickle.dump(grid.best_estimator_, open('gb_model.pkl', 'wb'))

## AdaBoost

In [95]:
clf = AdaBoostClassifier()
clf.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [96]:
param_grid = {'learning_rate': [0.1],
              'n_estimators': [50],
              'random_state': [42]}

start_time = time.time()

grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', cv=5)
grid.fit(X_processed, y)

elapsed_time = (time.time() - start_time) / 60
print('Elapsed computation time: {:.3f} mins'.format(elapsed_time))

Elapsed computation time: 1.498 mins


In [97]:
print(grid.best_score_)
print(grid.best_estimator_)

0.7623794140401363
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.1, n_estimators=50, random_state=42)


In [98]:
pickle.dump(grid.best_estimator_, open('ada_model.pkl', 'wb'))

## LightGBM

In [74]:
lgb_clf = lgb.LGBMClassifier(objective='binary', metric='auc')
lgb_clf.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'metric': 'auc'}

In [99]:
param_grid = {'learning_rate': [0.01],
              'num_leaves':[31],
              'random_state': [42]}

start_time = time.time()

grid = GridSearchCV(estimator=lgb_clf, param_grid=param_grid, scoring='roc_auc', cv=5)
grid.fit(X_processed, y)

elapsed_time = (time.time() - start_time) / 60
print('Elapsed computation time: {:.3f} mins'.format(elapsed_time))

Elapsed computation time: 0.276 mins


In [100]:
print(grid.best_score_)
print(grid.best_estimator_)

0.7637889601314927
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.01, max_depth=-1,
        metric='auc', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
        objective='binary', random_state=42, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)


In [101]:
pickle.dump(grid.best_estimator_, open('lgbm_model.pkl', 'wb'))

## Predict

In [129]:
def make_submission(model, test, lnr):
    preds = model.predict_proba(test)
    submission = pd.DataFrame({'LNR':lnr, 'RESPONSE':preds[:,0]})
    return submission

In [113]:
mailout_test_clean = pd.read_csv('../data/Udacity_MAILOUT_052018_TEST_clean.csv', sep=';')
print(mailout_test_clean.shape)
lnr = mailout_test_clean.LNR
mailout_test_clean.drop(['LNR'], axis=1, inplace=True)

In [117]:
# Preprocess
test_processed = preprocessor.fit_transform(mailout_test_clean)

In [127]:
# Gradient Boost
gb_model = joblib.load('gb_model.pkl')
preds = gb_model.predict_proba(test_processed)
submission = pd.DataFrame({'LNR':lnr, 'RESPONSE':preds[:,0]})
submission.to_csv('gb_preds.csv', index=False)
submission.head()

Unnamed: 0,LNR,RESPONSE
0,1754,0.985616
1,1770,0.985616
2,1465,0.988564
3,1470,0.988564
4,1478,0.988564


In [130]:
# AdaBoost
ada_model = joblib.load('ada_model.pkl')
submission = make_submission(ada_model, test_processed, lnr)
submission.to_csv('ada_preds.csv', index=False)
submission.head()

Unnamed: 0,LNR,RESPONSE
0,1754,0.676132
1,1770,0.67117
2,1465,0.756437
3,1470,0.755163
4,1478,0.739913


In [131]:
# LightGBM
lgbm = joblib.load('lgbm_model.pkl')
submission = make_submission(lgbm, test_processed, lnr)
submission.to_csv('lgbm_preds.csv', index=False)
submission.head()

Unnamed: 0,LNR,RESPONSE
0,1754,0.974802
1,1770,0.975796
2,1465,0.993654
3,1470,0.993654
4,1478,0.993654
