In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
df_merge = reduce_mem_usage(pd.read_csv('./output/final_merge.csv'))
target = df_merge.logerror
features = df_merge.drop(['logerror'], axis = 1)

Memory usage of dataframe is 93.50 MB
Memory usage after optimization is: 40.99 MB
Decreased by 56.2%


In [4]:
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
categorical_features = ['airconditioningtypeid', 'hashottuborspa', 'heatingorsystemtypeid', 
                       'pooltypeid2', 'propertylandusetypeid', 'fips', 'regionidcounty', 
                       'buildingqualitytypeid_fill', 'regionidcity_fill', 'year', 
                       'regionidneighborhood_fill', 'taxdelinquencyflag']

In [None]:
params = {"objective": "regression", "boosting": "gbdt", "num_leaves": 512, 
              "learning_rate": 0.0021, 'bagging_fraction': 0.85, "reg_lambda": 0.1, 
              'reg_alpha':0.1, "metric": "rmse", 'max_depth': -1, 'min_child_weight': 30,
              'verbose': -1, 'min_split_gain':0.1, 'subsample_freq':1, 'sub_feature':  0.5}

In [10]:
def bayes_parameter_opt_lgb(X, y, init_round=3, opt_round=20, n_folds=3, 
                            random_seed=6, n_estimators=1000, learning_rate=0.05):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y,categorical_feature=categorical_features,free_raw_data=False)
    # parameters
    def lgb_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_lambda, reg_alpha, min_split_gain, 
                 min_child_weight, min_child_sample, max_bin, subsample_freq, learning_rate):
        params = {'objective':'regression','boosting_type': 'gbdt','nthread': 4, 'verbose': -1,\
                  'num_boost_round': n_estimators, 'learning_rate':learning_rate}
        params['subsample_freq']=int(round(subsample_freq))
        params['min_child_sample']=int(round(min_child_sample))
        params['max_bin']=int(round(max_bin))
        params["num_leaves"] = int(round(num_leaves))
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['reg_lambda'] = max(reg_lambda, 0)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        params['verbose'] = -1
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, 
                           verbose_eval=25, metrics=['rmse'],early_stopping_rounds=50)
        return -1.0 * np.min(cv_result['rmse-mean'])
    # range 
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (100, 4000),
                                            'colsample_bytree': (0.1, 0.9),
                                            'subsample': (0.1, 0.9),
                                            'max_depth': (-1, 12),
                                            'reg_lambda': (0.1, 3),
                                            'reg_alpha': (0.1, 3),
                                            'min_child_sample':(20,200),
                                            'max_bin':(180,600),
                                            'subsample_freq':(0,20),
                                            'min_split_gain': (0.1, 0.9),
                                            'min_child_weight': (3, 30),
                                            'learning_rate': (0.00001, 0.5)})
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

In [11]:
import warnings
warnings.filterwarnings('ignore')
opt_params = bayes_parameter_opt_lgb(features, target, init_round=3, opt_round=30, 
                                     n_folds=3, random_seed=6, n_estimators=1000, learning_rate=0.05)

|   iter    |  target   | colsam... | learni... |  max_bin  | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample | subsam... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[25]	cv_agg's rmse: 0.165234 + 0.00347736
[50]	cv_agg's rmse: 0.16513 + 0.00350636
[75]	cv_agg's rmse: 0.165109 + 0.00348301
[100]	cv_agg's rmse: 0.165112 + 0.00348126
| [0m 1       [0m | [0m-0.1651  [0m | [0m 0.3372  [0m | [0m 0.25    [0m | [0m 476.2   [0m | [0m 1.093   [0m | [0m 197.8   [0m | [0m 17.12   [0m | [0m 0.6954  [0m | [0m 2.448e+0[0m | [0m 1.779   [0m | [0m 2.706   [0m | [0m 0.8453  [0m | [0m 7.109   [0m |
[25]	cv_agg's rmse: 0.165678 + 0.00350436
[50]	cv_agg's rmse: 0.165678 + 0.00350436
| [0m 2       [0m | [0m-0.1655  [0m | [0m 0.2848  [0m | [0m 0.4094  [0m | [0m 499.4   [0m | [0m 3.36    [0m | [0m

KeyboardInterrupt: 

In [None]:
|   iter    |  target   | colsam... | learni... |  max_bin  | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample | subsam... |
|  9        | -0.1647   |  0.3937   |  0.1731   |  183.7    |  6.981    |  196.0    |  3.968    |  0.3119   |  1.72e+03 |  1.253    |  0.7226   |  0.7965   |  0.2399   |

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1720,
    "learning_rate": 0.05,
    "colsample_bytree": 0.3937,
    "reg_lambda": 0.7226 ,
    'reg_alpha': 1.253,
    "metric": "rmse",
    'max_bins': 184,
    'max_depth': 7,
    'min_child_sample': 196,
    'min_child_weight': 4,
    'min_split_gain':0.3119,
    'subsample': 0.7965,
    'subsample_freq': 8,
    'learning_rate': 0.1731
}