In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
df_merge = reduce_mem_usage(pd.read_csv('./output/final_merge.csv'))
target = df_merge.logerror
features = df_merge.drop(['logerror'], axis = 1)

Memory usage of dataframe is 56.36 MB
Memory usage after optimization is: 23.22 MB
Decreased by 58.8%


In [9]:
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
categorical_features = ['airconditioningtypeid', 'hashottuborspa', 'heatingorsystemtypeid', 
                       'pooltypeid2', 'propertylandusetypeid', 'fips', 'regionidcounty', 
                       'buildingqualitytypeid_fill', 'regionidcity_fill', 'year', 
                       'regionidneighborhood_fill', 'taxdelinquencyflag']

In [14]:
def bayes_parameter_opt_lgb(X, y, init_round=3, opt_round=20, n_folds=3, 
                            random_seed=6, n_estimators=1000, learning_rate=0.05):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y,categorical_feature=categorical_features,free_raw_data=False)
    # parameters
    def lgb_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_lambda, reg_alpha, min_split_gain, 
                 min_child_weight, min_child_sample, max_bin, subsample_freq):
        params = {'objective':'regression','boosting_type': 'gbdt','nthread': 4, 'verbose': -1,\
                  'num_boost_round': n_estimators, 'learning_rate':learning_rate}
        params['subsample_freq']=int(round(subsample_freq))
        params['min_child_sample']=int(round(min_child_sample))
        params['max_bin']=int(round(max_bin))
        params["num_leaves"] = int(round(num_leaves))
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['reg_lambda'] = max(reg_lambda, 0)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, 
                           verbose_eval=25, metrics=['rmse'],early_stopping_rounds=50)
        return -1.0 * np.min(cv_result['rmse-mean'])
    # range 
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (1000, 3000),
                                            'colsample_bytree': (0.1, 0.9),
                                            'subsample': (0.1, 0.9),
                                            'max_depth': (-1, 12),
                                            'reg_lambda': (0.1, 3),
                                            'reg_alpha': (0.1, 3),
                                            'min_child_sample':(20,200),
                                            'max_bin':(180,600),
                                            'subsample_freq':(1,20),
                                            'min_split_gain': (0.1, 0.9),
                                            'min_child_weight': (3, 30)})
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

In [15]:
import warnings
warnings.filterwarnings('ignore')
opt_params = bayes_parameter_opt_lgb(features, target, init_round=3, opt_round=20, 
                                     n_folds=3, random_seed=6, n_estimators=1000, learning_rate=0.05)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample | subsam... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
[25]	cv_agg's rmse: 0.16526 + 0.00350248
[50]	cv_agg's rmse: 0.165132 + 0.0035038
[75]	cv_agg's rmse: 0.165139 + 0.0034733
[100]	cv_agg's rmse: 0.165091 + 0.0034467
[125]	cv_agg's rmse: 0.165079 + 0.00343575
[150]	cv_agg's rmse: 0.165067 + 0.00343286
[175]	cv_agg's rmse: 0.165031 + 0.00339901
[200]	cv_agg's rmse: 0.165021 + 0.00340916
[225]	cv_agg's rmse: 0.165026 + 0.00337994
[250]	cv_agg's rmse: 0.164995 + 0.00336527
[275]	cv_agg's rmse: 0.165006 + 0.00336743
[300]	cv_agg's rmse: 0.16501 + 0.00338104
| [0m 1       [0m | [0m-0.165   [0m | [0m 0.8126  [0m | [0m 285.1   [0m | [0m 3.841   [0m | [0m 159.5   [0m | [0m 15.25   [0m | [0m 0.3401  [0m | [0m 2.899e+0

[100]	cv_agg's rmse: 0.164717 + 0.00345559
[125]	cv_agg's rmse: 0.164698 + 0.00344566
[150]	cv_agg's rmse: 0.164706 + 0.00343656
[175]	cv_agg's rmse: 0.164712 + 0.00343362
| [95m 15      [0m | [95m-0.1647  [0m | [95m 0.4531  [0m | [95m 594.9   [0m | [95m 4.668   [0m | [95m 22.96   [0m | [95m 29.02   [0m | [95m 0.2573  [0m | [95m 1.042e+0[0m | [95m 1.876   [0m | [95m 1.906   [0m | [95m 0.8305  [0m | [95m 8.248   [0m |
[25]	cv_agg's rmse: 0.16558 + 0.00348501
[50]	cv_agg's rmse: 0.165657 + 0.00340345
| [0m 16      [0m | [0m-0.1656  [0m | [0m 0.8876  [0m | [0m 591.7   [0m | [0m-0.3577  [0m | [0m 22.02   [0m | [0m 3.574   [0m | [0m 0.2043  [0m | [0m 1.04e+03[0m | [0m 2.468   [0m | [0m 1.212   [0m | [0m 0.1405  [0m | [0m 18.9    [0m |
[25]	cv_agg's rmse: 0.165262 + 0.0035049
[50]	cv_agg's rmse: 0.165057 + 0.00350085
[75]	cv_agg's rmse: 0.164909 + 0.00347471
[100]	cv_agg's rmse: 0.164822 + 0.00347112
[125]	cv_agg's rmse: 0.164776 + 0.003465

In [None]:
|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample | subsam... |
|  2        | -0.1647   |  0.666    |  522.7    |  9.436    |  96.55    |  3.022    |  0.2541   |  2.41e+03 |  2.797    |  1.351    |  0.8636   |  8.035    |
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 2410,
    "learning_rate": 0.05,
    "colsample_bytree": 0.666,
    "reg_lambda": 1.351,
    'reg_alpha': 2.797,
    "metric": "rmse",
    'max_bins': 523,
    'max_depth': 10,
    'min_child_sample': 96,
    'min_child_weight': 3,
    'min_split_gain':0.2541,
    'subsample':0.8636,
    'subsample_freq': 8,
     
}