In [1]:
# Data manipulation
import pandas as pd
import numpy as np

import gc
# Modeling
import lightgbm as lgb

# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.size'] = 18
%matplotlib inline

In [2]:
# Number of objective evaluations the optimizer is allowed (passed to fmin as `max_evals`)
MAX_EVALS = 20
# Number of cross-validation folds used by the lgb.cv calls in this notebook
N_FOLDS = 5
# CSV file that receives a header row plus one row per evaluated hyperparameter set
OUT_FILE = 'bayes_test.csv'

In [3]:
# Read the preprocessed table and keep only the rows that carry a TARGET value
original_data = (
    pd.read_csv('./preprocessed_data/all_data_v3.csv')
      .loc[lambda frame: frame['TARGET'].notnull()]
)
print(original_data.shape)

# The full labelled set is used for tuning. To iterate faster, a subsample can be
# drawn instead, e.g. original_data.sample(n = 16000, random_state = 42)
# (10000 rows for training, 6000 for testing).
features = original_data

# Drop the extra name so only `features` refers to the frame
del original_data
gc.collect()

(307511, 580)


0

In [5]:
# One-hot encode the categorical columns with pandas' default get_dummies settings
# (in the recorded run the column count grows from 580 to 703)
features = pd.get_dummies(features)
print(features.shape)

(307511, 703)


In [6]:
# Pull the target column out as a flat int32 vector
labels = features['TARGET'].astype(np.int32).values.reshape(-1)

# Neither the target nor the SK_ID_CURR identifier is kept as a model input
features = features.drop(columns = ['TARGET', 'SK_ID_CURR'])

# Hold out a fixed 6000 rows for testing; the remaining rows are used for training
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size = 6000,
    random_state = 42,
)

print('Train shape: ', train_features.shape)
print('Test shape: ', test_features.shape)

train_features.head()

Train shape:  (301511, 701)
Test shape:  (6000, 701)


Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
193071,22797.0,215865.0,202500.0,247500.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,1
302257,34596.0,675000.0,675000.0,180000.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0,0,0,1,0,0,0,0,0,0
263273,31531.5,485640.0,450000.0,202500.0,0.0,0.0,0.0,1.0,0.0,4.0,...,1,0,0,0,0,0,0,1,0,0
68522,16879.5,552555.0,477000.0,180000.0,0.0,0.0,0.0,1.0,0.0,3.0,...,0,0,0,1,0,0,0,0,0,0
6848,35577.0,665325.0,616500.0,139500.0,0.0,0.0,0.0,1.0,0.0,3.0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Wrap the training and held-out splits as LightGBM Datasets.
# `train_set` is the data the objective function cross-validates on; in the visible
# cells `test_set` is only created here and deleted later.
train_set = lgb.Dataset(train_features, label = train_labels)
test_set = lgb.Dataset(test_features, label = test_labels)

In [8]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hyperparameters):
    """Objective function for Gradient Boosting Machine hyperparameter optimization.

    Cross-validates one sampled configuration with ``lgb.cv`` on the module-level
    ``train_set`` (``N_FOLDS`` folds, AUC metric, early stopping) and appends one
    row describing the evaluation to ``OUT_FILE``. Increments the global
    ``ITERATION`` counter on every call.

    Parameters
    ----------
    hyperparameters : dict
        One sample from the search space. ``hyperparameters['boosting_type']`` must be
        a nested dict with a ``'boosting_type'`` entry and an optional ``'subsample'``
        entry (defaults to 1.0). The dict passed in is left unmodified.

    Returns
    -------
    dict
        ``loss`` (1 - mean CV AUC), ``hyperparameters`` (the flattened parameters that
        were evaluated, plus ``n_estimators``), ``iteration``, ``train_time`` (seconds
        spent inside ``lgb.cv``) and ``status``.
    """

    # Keep track of evals
    global ITERATION

    ITERATION += 1

    # Work on a shallow copy: the original implementation rewrote the caller's dict in
    # place, so evaluating the same sample twice failed on the flattened 'boosting_type'.
    hyperparameters = dict(hyperparameters)

    # Using early stopping to find number of trees trained
    hyperparameters.pop('n_estimators', None)

    # The search space nests the subsample ratio under the boosting type;
    # lift both to top-level keys as LightGBM expects.
    boosting_choice = hyperparameters['boosting_type']
    hyperparameters['boosting_type'] = boosting_choice['boosting_type']
    hyperparameters['subsample'] = boosting_choice.get('subsample', 1.0)

    # Make sure parameters that need to be integers are integers
    # (the quantized hyperopt distributions return floats such as 54.0)
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

    start = timer()

    # Perform n_folds cross validation
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10000, nfold = N_FOLDS,
                        verbose_eval = 100, early_stopping_rounds = 100, metrics = 'auc', seed = 50)

    run_time = timer() - start

    # Extract the best score.
    # NOTE(review): this relies on lgb.cv truncating its history at the best iteration
    # when early stopping triggers, so the last entry is the best one -- confirm for
    # the installed LightGBM version.
    auc_history = cv_results['auc-mean']
    best_score = auc_history[-1]

    # Loss must be minimized
    loss = 1 - best_score

    # Boosting rounds that returned the highest cv score
    n_estimators = len(auc_history)

    # Add the number of estimators to the hyperparameters
    hyperparameters['n_estimators'] = n_estimators

    # Append one row to the results file. The context manager closes the handle even if
    # writing fails, and newline = '' is what the csv module requires so that no blank
    # lines are inserted on platforms that translate line endings.
    with open(OUT_FILE, 'a', newline = '') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, run_time, best_score])

    # Dictionary with information for evaluation
    return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
            'train_time': run_time, 'status': STATUS_OK}

In [9]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

# Define the search space / domain.
# The boosting type is a nested choice so each option carries its own subsample
# setting: sampled for gbdt and dart, fixed at 1.0 for goss.
# The hyperopt labels ('gdbt_subsample', 'colsample_by_tree', ...) are the keys that
# show up in the dict returned by fmin, so they are kept exactly as they are.
boosting_type_options = [
    {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
    {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
    {'boosting_type': 'goss', 'subsample': 1.0},
]

space = dict(
    boosting_type = hp.choice('boosting_type', boosting_type_options),
    num_leaves = hp.quniform('num_leaves', 20, 150, 1),
    learning_rate = hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
    subsample_for_bin = hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    min_child_samples = hp.quniform('min_child_samples', 20, 500, 5),
    reg_alpha = hp.uniform('reg_alpha', 0.0, 1.0),
    reg_lambda = hp.uniform('reg_lambda', 0.0, 1.0),
    colsample_bytree = hp.uniform('colsample_by_tree', 0.6, 1.0),
    is_unbalance = hp.choice('is_unbalance', [True, False]),
)

In [10]:
from hyperopt import tpe

# Create the optimization algorithm: hyperopt's TPE suggestion function.
# Note that the fmin call further down passes `tpe.suggest` directly rather than
# this name; both refer to the same function.
tpe_algorithm = tpe.suggest

In [11]:
from hyperopt import Trials

# Record results
trials = Trials()

# Create (or truncate) the results file and write the column names.
# The context manager guarantees the handle is closed even if the write fails, and
# newline = '' is what the csv module requires for files handed to csv.writer.
headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
with open(OUT_FILE, 'w', newline = '') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

# Evaluation counter incremented by the objective function on every call
ITERATION = 0

In [12]:
from hyperopt import fmin

# Reset the evaluation counter right before the search; the objective increments it
# before use, so the first recorded iteration is 1
ITERATION = 0

In [13]:
%%time
# Run optimization
# `best` maps hyperopt labels to raw sampled values; hp.choice entries are reported
# as option indices (see the output below), so it is not a ready-to-use LightGBM
# parameter dict. Per-evaluation details are recorded in `trials` and in OUT_FILE.
# NOTE(review): no `rstate` is passed, so the sampled configurations presumably differ
# from run to run -- confirm whether reproducibility is needed.
best = fmin(fn = objective, space = space, algo = tpe.suggest, trials = trials,
            max_evals = MAX_EVALS)

[100]	cv_agg's auc: 0.775761 + 0.00418196
[200]	cv_agg's auc: 0.782736 + 0.0039623
[300]	cv_agg's auc: 0.783862 + 0.00405168
[400]	cv_agg's auc: 0.783599 + 0.00423582
[100]	cv_agg's auc: 0.511255 + 0.035303
[100]	cv_agg's auc: 0.490009 + 0.0272143
[100]	cv_agg's auc: 0.770454 + 0.00438771
[200]	cv_agg's auc: 0.776033 + 0.00426206
[300]	cv_agg's auc: 0.780462 + 0.00399788
[400]	cv_agg's auc: 0.782802 + 0.00471015
[500]	cv_agg's auc: 0.784995 + 0.00434118
[600]	cv_agg's auc: 0.785084 + 0.00394281
[100]	cv_agg's auc: 0.764318 + 0.00429067
[200]	cv_agg's auc: 0.769898 + 0.00458559
[300]	cv_agg's auc: 0.774728 + 0.00463329
[400]	cv_agg's auc: 0.778171 + 0.00439136
[500]	cv_agg's auc: 0.781706 + 0.0042223
[600]	cv_agg's auc: 0.78268 + 0.00421134
[700]	cv_agg's auc: 0.783662 + 0.00415979
[800]	cv_agg's auc: 0.784521 + 0.00389257
[900]	cv_agg's auc: 0.78493 + 0.00388384
[1000]	cv_agg's auc: 0.785345 + 0.00385552
[1100]	cv_agg's auc: 0.785433 + 0.00402358
[100]	cv_agg's auc: 0.49697 + 0.0557592

In [14]:
best

{'boosting_type': 0,
 'colsample_by_tree': 0.7157128140557183,
 'gdbt_subsample': 0.8704264554232746,
 'is_unbalance': 0,
 'learning_rate': 0.010895675526704639,
 'min_child_samples': 245.0,
 'num_leaves': 54.0,
 'reg_alpha': 0.7277077790414211,
 'reg_lambda': 0.12196456631794106,
 'subsample_for_bin': 140000.0}

In [15]:
import ast

def evaluate(results, name):
    """Evaluate the best recorded hyperparameters on the held-out test data.

    Parameters
    ----------
    results : pandas.DataFrame
        Search log with at least the columns ``hyperparameters`` (string form of a
        dict), ``score`` and ``iteration``. It is not modified.
    name : str
        Label of the search method, used only in the printed messages.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated hyperparameter set, ordered by descending ``score``,
        with one column per hyperparameter plus ``iteration`` and ``score``.

    Notes
    -----
    Fits an ``LGBMClassifier`` on the module-level ``train_features`` /
    ``train_labels`` and prints its ROC AUC on ``test_features`` / ``test_labels``.
    """

    new_results = results.copy()
    # String to dictionary (literal_eval only accepts Python literals)
    new_results['hyperparameters'] = new_results['hyperparameters'].map(ast.literal_eval)

    # Sort with best values on top; the fresh 0..n-1 index is what lets the
    # iteration/score columns line up with the hyperparameter frame built below
    new_results = new_results.sort_values('score', ascending = False).reset_index(drop = True)

    # Print out cross validation high score
    print('The highest cross validation score from {} was {:.5f} found on iteration {}.'.format(name, new_results.loc[0, 'score'], new_results.loc[0, 'iteration']))

    # Use best hyperparameters to create a model
    hyperparameters = new_results.loc[0, 'hyperparameters']
    model = lgb.LGBMClassifier(**hyperparameters)

    # Train and make predictions
    model.fit(train_features, train_labels)
    preds = model.predict_proba(test_features)[:, 1]

    print('ROC AUC from {} on test data = {:.5f}.'.format(name, roc_auc_score(test_labels, preds)))

    # Build the hyperparameter frame in one step. The previous row-by-row
    # DataFrame.append was quadratic and no longer exists in pandas >= 2.0.
    records = list(new_results['hyperparameters'])
    # Columns follow the first record's key order, then any keys seen only later
    columns = list(dict.fromkeys(key for record in records for key in record))
    hyp_df = pd.DataFrame(records, columns = columns)

    # Put the iteration and score in the hyperparameter dataframe
    hyp_df['iteration'] = new_results['iteration']
    hyp_df['score'] = new_results['score']

    # Release the fitted model before returning
    del model
    gc.collect()

    return hyp_df

In [16]:
results = pd.read_csv(OUT_FILE)
bayes_results = evaluate(results, name = 'Bayesian')
bayes_results

The highest cross validation score from Bayesian was 0.78812 found on iteration 8.
ROC AUC from Bayesian on test data = 0.78526.


Unnamed: 0,boosting_type,colsample_bytree,is_unbalance,learning_rate,min_child_samples,num_leaves,reg_alpha,reg_lambda,subsample_for_bin,subsample,metric,verbose,n_estimators,iteration,score
0,gbdt,0.715713,True,0.010896,245,54,0.727708,0.121965,140000,0.870426,auc,1,2002,8,0.788124
1,dart,0.75844,False,0.028614,270,54,0.140301,0.582904,240000,0.994368,auc,1,2404,9,0.787395
2,gbdt,0.925477,False,0.010638,150,145,0.419077,0.963646,240000,0.643987,auc,1,1195,12,0.787069
3,gbdt,0.630376,False,0.028166,435,124,0.289069,0.088868,100000,0.672921,auc,1,450,7,0.786662
4,gbdt,0.632092,False,0.03025,125,24,0.392897,0.030363,220000,0.530251,auc,1,1169,19,0.786357
5,dart,0.756598,False,0.030221,395,132,0.116008,0.601984,160000,0.766222,auc,1,1045,5,0.785539
6,gbdt,0.774411,False,0.026128,55,62,0.723469,0.786496,140000,0.532487,auc,1,732,20,0.785413
7,dart,0.658995,True,0.048186,405,143,0.064139,0.973434,240000,0.857503,auc,1,571,4,0.785317
8,dart,0.655189,False,0.030459,220,146,0.98331,0.458931,100000,0.763076,auc,1,1462,17,0.785033
9,gbdt,0.974536,True,0.034986,335,149,0.28088,0.809136,280000,0.879373,auc,1,320,18,0.784838


### Continue Optimization
Hyperopt can continue searching where a previous search left off if we pass in a `Trials` object that already has results. The algorithms used in Bayesian optimization are black-box optimizers that keep no internal state between calls: all they need is the previous results of objective function evaluations (the input values and loss), from which they can rebuild the surrogate function and select the next values to evaluate in the objective function. This means that any search can be continued as long as we have the history in a `Trials` object.

In [17]:
import json

# Order the per-evaluation result dicts from lowest to highest loss.
# Despite its name, `trials_dict` is a list of dicts.
trials_dict = sorted(trials.results, key = lambda x: x['loss'])

# Save the trial results
with open('trials.json', 'w') as f:
    f.write(json.dumps(trials_dict))

NameError: name 'trials_dict' is not defined

In [18]:
# Select the row with the highest cross-validation score.
# idxmax() returns an index *label*, so it is paired with label-based .loc; the
# previous positional .iloc only worked because the index happened to be 0..n-1.
best_bayes_params = bayes_results.loc[bayes_results['score'].idxmax(), :].copy()
best_bayes_params

boosting_type             gbdt
colsample_bytree      0.715713
is_unbalance              True
learning_rate        0.0108957
min_child_samples          245
num_leaves                  54
reg_alpha             0.727708
reg_lambda            0.121965
subsample_for_bin       140000
subsample             0.870426
metric                     auc
verbose                      1
n_estimators              2002
iteration                    8
score                 0.788124
Name: 0, dtype: object

In [19]:
# Turn the best row into a plain dict and strip the entries that are not settings to
# reuse: `iteration` and `score` are bookkeeping columns, and `n_estimators` is dropped
# as well (the later lgb.cv call sets its own round limit with early stopping).
# NOTE(review): the next cell in file order rebinds `hyperparameters` to a hand-written
# dict, which discards this result -- confirm which of the two is intended.
hyperparameters = dict(best_bayes_params)
del_keys = ['n_estimators','iteration','score']
for key in del_keys:
    del hyperparameters[key]
hyperparameters

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.7157128140557183,
 'is_unbalance': True,
 'learning_rate': 0.010895675526704639,
 'min_child_samples': 245,
 'num_leaves': 54,
 'reg_alpha': 0.7277077790414211,
 'reg_lambda': 0.12196456631794106,
 'subsample_for_bin': 140000,
 'subsample': 0.8704264554232746,
 'metric': 'auc',
 'verbose': 1}

In [10]:
# Hand-written parameter set. Rebinding `hyperparameters` here replaces the dict derived
# from the Bayesian search results when the cells run in file order.
# NOTE(review): 'number_boosting_rounds' does not look like a LightGBM parameter name or
#   alias (compare num_boost_round / n_estimators) -- confirm it is not silently ignored.
# NOTE(review): 'early_stopping_rounds' is also passed explicitly to lgb.cv below, and this
#   dict is later unpacked into LGBMClassifier, which is fit without an eval set -- confirm.
# NOTE(review): 'is_unbalance' and 'scale_pos_weight' are both present; presumably only one
#   of the two should be set -- verify against the LightGBM parameter docs.
hyperparameters =   {'boosting_type': 'gbdt',
                     'objective': 'binary',
                     'number_boosting_rounds': 5000,
                     'early_stopping_rounds': 100,
                     'max_bin': 300,
                     'max_depth': -1,
                     'num_leaves': 35,
                     'learning_rate': 0.1,
                     'subsample': 1,
                     'min_child_samples': 50,
                     'subsample_freq': 1,
                     'min_gain_to_split': 0.5,
                     'scale_pos_weight': 1,
                     'colsample_bytree': 0.2,
                     'is_unbalance': True,
                     'reg_alpha': 0.0,
                     'reg_lambda': 100,
                     'metric': 'auc',
                     'verbose': 1}

In [20]:
# Drop the Datasets built on the tuning split so their memory can be reclaimed
# before the data is loaded again
del train_set, test_set
gc.collect()

122

In [4]:
# Reload the combined table and split it on the presence of a label: rows with a
# TARGET value form the training set, rows without one form the set to predict.
original_data = pd.read_csv('./preprocessed_data/all_data_v3.csv')
is_labelled = original_data['TARGET'].notnull()
train = original_data[is_labelled]
test = original_data[~is_labelled]

del original_data
gc.collect()

# Keep the test ids for the submission file and the train labels for fitting.
# Note: this rebinds `train_labels`, which earlier held the labels of the tuning split.
test_ids = test['SK_ID_CURR']
train_labels = train['TARGET'].astype(np.int32).values.reshape(-1)

# One-hot encode each part separately, then keep only the columns present in both
train = pd.get_dummies(train)
test = pd.get_dummies(test)
train, test = train.align(test, join = 'inner', axis = 1)

# The identifier and the target are not model inputs
non_feature_columns = ['SK_ID_CURR', 'TARGET']
train = train.drop(columns = non_feature_columns)
test = test.drop(columns = non_feature_columns)

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)


Training shape:  (307511, 699)
Testing shape:  (48744, 699)


In [11]:
# Rebuild the LightGBM Dataset on the full labelled data
train_set = lgb.Dataset(train, label = train_labels, categorical_feature = 'auto')

# Cross validation with n_folds and early stopping
cv_results = lgb.cv(
    hyperparameters,
    train_set,
    num_boost_round = 10000,
    early_stopping_rounds = 100,
    metrics = 'auc',
    nfold = N_FOLDS,
)

# Report the last recorded round: its mean/std AUC and how many rounds were kept
auc_means = cv_results['auc-mean']
auc_stds = cv_results['auc-stdv']
print('The cross validation score on the full dataset for Bayesian optimization = {:.5f} with std: {:.5f}.'.format(
    auc_means[-1], auc_stds[-1]))
print('Number of estimators = {}.'.format(len(auc_means)))



The cross validation score on the full dataset for Bayesian optimization = 0.78757 with std: 0.00282.
Number of estimators = 272.


In [12]:
# Fit the final classifier on the full training data and write the submission file.
# NOTE(review): the variant that passed the round count found by cross validation is
# commented out, so the number of trees is whatever `hyperparameters` / the library
# defaults give -- confirm this is intended.
# model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), **hyperparameters)
model = lgb.LGBMClassifier( **hyperparameters)
model.fit(train, train_labels)

# Probability of the positive class for every test row
preds = model.predict_proba(test)[:, 1]

# One row per test id with its predicted probability
submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': preds})
submission.to_csv('./model_performance/bo_v3.csv', index = False)

In [23]:
# Empty two-column frame (presumably meant to collect out-of-fold predictions
# next to their targets -- this cell is a scratch experiment)
oof_columns = ['pred', 'Target']
out_of_fold_pd = pd.DataFrame(columns = oof_columns)
out_of_fold_pd.head()

Unnamed: 0,pred,Target


In [24]:
# Toy predictions and matching targets used to try out appending to the frame
pred = [1, 0, 1]
target = [1, 0, 1]
tmp = pd.DataFrame(dict(pred = pred, Target = target))

In [25]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# The result is only displayed here -- it is not assigned back to `out_of_fold_pd`.
pd.concat([out_of_fold_pd, tmp])

Unnamed: 0,pred,Target
0,1,1
1,0,0
2,1,1
