In [1]:
from helper import *

import pandas as pd
import numpy as np
import warnings
import time
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score



In [2]:
application_train = pd.read_csv('./preprocessed_data/train_all_data_raw.csv')
# application_train = application_train.head(1000)
# test_features = pd.read_csv('./preprocessed_data/testall_data_raw.csv')

In [3]:
X = application_train.drop('TARGET', axis=1)
y = application_train.TARGET

del application_train
gc.collect()

7

In [4]:
def bayes_parameter_opt_lgb(X, y, init_round=50, opt_round=50, n_folds=5, random_seed=6, n_estimators=10000):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgb_eval(learning_rate, colsample_bytree, subsample, num_leaves, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {'application':'binary','num_iterations': n_estimators, 'early_stopping_round':100, 'metric':'auc'}
        params['learning_rate'] = max(learning_rate, 0)
        params['colsample_bytree'] = max(learning_rate, 0)
        params['subsample'] = max(learning_rate, 0)
        params["num_leaves"] = int(round(num_leaves))
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, metrics=['auc'])
        return max(cv_result['auc-mean'])
    # range colsample_bytree
    lgbBO = BayesianOptimization(lgb_eval, {
                                            'learning_rate': (10**-5, 1),
                                            'colsample_bytree': (10**-5, 1),
                                            'subsample': (10**-5, 1),
                                            'num_leaves': (16, 45),
                                            'max_depth': (3, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 5),
                                            'min_split_gain': (0.00001, 0.1),
                                            'min_child_weight': (1, 100)}, random_state=random_seed)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    # output optimization process
    lgbBO.points_to_csv("bayes_opt_result.csv")
    # return best parameters
    return lgbBO.res['max']['max_params']

In [5]:
%%time
opt_params = bayes_parameter_opt_lgb(X, y)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   lambda_l1 |   lambda_l2 |   learning_rate |   max_depth |   min_child_weight |   min_split_gain |   num_leaves |   subsample | 
    1 | 03m04s | [35m   0.76328[0m | [32m            0.8282[0m | [32m     0.3884[0m | [32m     0.0969[0m | [32m         0.5782[0m | [32m     8.3482[0m | [32m            4.3177[0m | [32m          0.0397[0m | [32m     38.1778[0m | [32m     0.8301[0m | 
    2 | 02m52s | [35m   0.76383[0m | [32m            0.2764[0m | [32m     3.3498[0m | [32m     3.0642[0m | [32m         0.7845[0m | [32m     4.9886[0m | [32m           41.8341[0m | [32m          0.0049[0m | [32m     24.2319[0m | [32m     0.9159[0m | 
    3 | 03m21s |    0.76133 |             0.1257 |      0.3015 |

   34 | 03m13s |    0.75928 |             0.0670 |      3.7042 |      0.0938 |          0.9238 |      7.0211 |            33.7677 |           0.0305 |      17.9204 |      0.0550 | 
   35 | 03m22s |    0.76509 |             0.7792 |      2.8752 |      3.3514 |          0.6622 |      6.5880 |            80.2925 |           0.0985 |      30.8753 |      0.9561 | 
   36 | 02m41s |    0.78040 |             0.4008 |      2.4426 |      1.3528 |          0.2590 |      7.2968 |            44.9243 |           0.0943 |      25.8976 |      0.1468 | 
   37 | 02m34s |    0.77768 |             0.2458 |      3.2220 |      0.0715 |          0.2768 |      8.6147 |             7.2096 |           0.0764 |      35.1403 |      0.9578 | 
   38 | 02m49s |    0.76608 |             0.5238 |      4.7415 |      0.4857 |          0.6355 |      5.1073 |            32.5074 |           0.0623 |      40.4884 |      0.3391 | 
   39 | 03m09s |    0.76575 |             0.2550 |      4.0122 |      2.0942 |          0.7244 

[200]	cv_agg's auc: 0.783811 + 0.00230801
[400]	cv_agg's auc: 0.786507 + 0.00221002
   57 | 05m15s |    0.78661 |             0.4998 |      0.1429 |      4.5065 |          0.1012 |      8.7983 |            99.7042 |           0.0530 |      16.5819 |      0.6571 | 
[200]	cv_agg's auc: 0.77731 + 0.00190808
[400]	cv_agg's auc: 0.783549 + 0.00188747
[600]	cv_agg's auc: 0.78576 + 0.00199565
[800]	cv_agg's auc: 0.786633 + 0.00209559
[1000]	cv_agg's auc: 0.787162 + 0.00221096
[1200]	cv_agg's auc: 0.78752 + 0.00241424
[1400]	cv_agg's auc: 0.78759 + 0.00245343
   58 | 07m27s |    0.78773 |             0.0807 |      4.7562 |      0.2497 |          0.0860 |      3.4425 |            31.3072 |           0.0837 |      28.1661 |      0.1924 | 
[200]	cv_agg's auc: 0.757784 + 0.00242703
[400]	cv_agg's auc: 0.77082 + 0.0024105
[600]	cv_agg's auc: 0.77812 + 0.00238213
[800]	cv_agg's auc: 0.781504 + 0.00233614
[1000]	cv_agg's auc: 0.78375 + 0.00241876
[1200]	cv_agg's auc: 0.784714 + 0.00241349
[1400]	cv_a

   79 | 03m10s |    0.78508 |             0.3880 |      0.1775 |      4.8512 |          0.1447 |      8.8005 |            64.2238 |           0.0023 |      18.3232 |      0.8111 | 
[200]	cv_agg's auc: 0.741051 + 0.00269231
[400]	cv_agg's auc: 0.756857 + 0.00239046
[600]	cv_agg's auc: 0.764916 + 0.00226105
[800]	cv_agg's auc: 0.772024 + 0.0022461
[1000]	cv_agg's auc: 0.77534 + 0.00223354
[1200]	cv_agg's auc: 0.778053 + 0.00222707
[1400]	cv_agg's auc: 0.780446 + 0.00229636
[1600]	cv_agg's auc: 0.781774 + 0.00230921
[1800]	cv_agg's auc: 0.783073 + 0.00225015
[2000]	cv_agg's auc: 0.784189 + 0.00228068
[2200]	cv_agg's auc: 0.784981 + 0.00228777
[2400]	cv_agg's auc: 0.785472 + 0.00226167
[2600]	cv_agg's auc: 0.785925 + 0.00228152
[2800]	cv_agg's auc: 0.786358 + 0.00227284
[3000]	cv_agg's auc: 0.786651 + 0.00231161
[3200]	cv_agg's auc: 0.786775 + 0.00234596
[3400]	cv_agg's auc: 0.786956 + 0.00237802
[3600]	cv_agg's auc: 0.787093 + 0.00237971
[3800]	cv_agg's auc: 0.787207 + 0.00238522
   80 | 

In [6]:
print(opt_params)

{'max_depth': 6.4673630153546817, 'num_leaves': 21.243914156492238, 'colsample_bytree': 0.78600799875337513, 'lambda_l2': 1.2380597629498169, 'learning_rate': 0.040501877181723323, 'min_child_weight': 39.385925114066708, 'min_split_gain': 0.082481163551225242, 'subsample': 0.20802296463542469, 'lambda_l1': 4.8717791158021182}


In [14]:
def lgb_model(features, test_features, encoding = 'ohe', n_folds = 5):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the TARGET column.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
            n_folds (int, default = 5): number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = False, random_state = 50)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', 
                                   class_weight = 'balanced', 
                                   max_depth = 6.4673630153546817,
                                   num_leaves = 21, 
                                   colsample_bytree = 0.78600799875337513, 
                                   lambda_l2 = 1.2380597629498169, 
                                   learning_rate = 0.040501877181723323, 
                                   min_child_weight = 39.385925114066708, 
                                   min_split_gain = 0.082481163551225242, 
                                   subsample = 0.20802296463542469, 
                                   lambda_l1 = 4.8717791158021182, 
                                   n_jobs = -1, 
                                   random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = 100, verbose = 200)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [10]:
del X,y
gc.collect()

2782

In [11]:
app_train = pd.read_csv('./preprocessed_data/train_all_data_raw.csv')

app_test = pd.read_csv('./preprocessed_data/testall_data_raw.csv')

In [None]:
submission, fi, metrics = lgb_model(app_train, app_test)