In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
# use the leaking data plus as a test
df_train = pd.read_csv('../../Large_output/train_leaking_plus.csv')

In [4]:
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [5]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [6]:
df_train = reduce_mem_usage(df_train)
train_engineer = features_engineering(df_train)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)
target = np.log1p(train_engineer["meter_reading"])
features = train_engineer[['building_id', 'meter','site_id','primary_use', 
                          'square_feet','air_temperature','cloud_coverage',
                          'dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

Memory usage of dataframe is 3736.60 MB
Memory usage after optimization is: 1313.65 MB
Decreased by 64.8%


In [6]:
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization

In [7]:
categorical_features = ["building_id", "site_id", "meter", "primary_use",  "weekend",'is_holiday']

In [11]:
# setup the function for bayes opt
def bayes_parameter_opt_lgb(X, y, init_round=3, opt_round=7, n_folds=3, random_seed=6, n_estimators=1000, learning_rate=0.05):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y,categorical_feature=categorical_features,free_raw_data=False)
    # parameters
    def lgb_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_lambda, reg_alpha, min_split_gain, min_child_weight, 
                min_child_sample, max_bin, subsample_freq):
        params = {'objective':'regression','boosting_type': 'gbdt','nthread': 4, 'verbose': -1,\
                  'num_boost_round': n_estimators, 'learning_rate':learning_rate}
        params['subsample_freq']=int(round(subsample_freq))
        params['min_child_sample']=int(round(min_child_sample))
        params['max_bin']=int(round(max_bin))
        params["num_leaves"] = int(round(num_leaves))
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['reg_lambda'] = max(reg_lambda, 0)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, verbose_eval=25, metrics=['rmse'],early_stopping_rounds=50)
        return -1.0 * np.min(cv_result['rmse-mean'])
    # range 
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (1000, 3000),
                                            'colsample_bytree': (0.1, 0.9),
                                            'subsample': (0.1, 0.9),
                                            'max_depth': (-1, 12),
                                            'reg_lambda': (0.1, 3),
                                            'reg_alpha': (0.1, 3),
                                            'min_child_sample':(20,200),
                                            'max_bin':(180,600),
                                            'subsample_freq':(1,20),
                                            'min_split_gain': (0.1, 0.9),
                                            'min_child_weight': (3, 30)})
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

opt_params = bayes_parameter_opt_lgb(features, target, init_round=3, opt_round=7, n_folds=3, random_seed=6, n_estimators=1000, learning_rate=0.05)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample | subsam... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
[25]	cv_agg's rmse: 1.09657 + 0.000226948
[50]	cv_agg's rmse: 0.799107 + 0.000235839
[75]	cv_agg's rmse: 0.715756 + 0.000252245
[100]	cv_agg's rmse: 0.67395 + 0.000159532
[125]	cv_agg's rmse: 0.658381 + 0.000157569
[150]	cv_agg's rmse: 0.648811 + 0.00014468
[175]	cv_agg's rmse: 0.641947 + 9.12801e-05
[200]	cv_agg's rmse: 0.635277 + 8.97073e-05
[225]	cv_agg's rmse: 0.628499 + 6.78464e-05
[250]	cv_agg's rmse: 0.624807 + 2.29541e-05
[275]	cv_agg's rmse: 0.621711 + 8.18962e-05
[300]	cv_agg's rmse: 0.618956 + 0.000366487
[325]	cv_agg's rmse: 0.617304 + 0.000338922
[350]	cv_agg's rmse: 0.616136 + 0.000241523
[375]	cv_agg's rmse: 0.615207 + 0.000189091
[400]	cv_agg's rmse: 0.614529 



[25]	cv_agg's rmse: 1.08394 + 0.0034858
[50]	cv_agg's rmse: 0.83325 + 0.0034035
[75]	cv_agg's rmse: 0.738826 + 0.00203062
[100]	cv_agg's rmse: 0.702647 + 0.00097106
[125]	cv_agg's rmse: 0.688808 + 0.000637009
[150]	cv_agg's rmse: 0.680707 + 0.000601476
[175]	cv_agg's rmse: 0.676684 + 0.000748957
[200]	cv_agg's rmse: 0.672182 + 0.000649037
[225]	cv_agg's rmse: 0.668656 + 0.000401819
[250]	cv_agg's rmse: 0.665841 + 0.000461809
[275]	cv_agg's rmse: 0.663172 + 0.00032243
[300]	cv_agg's rmse: 0.66105 + 0.000322913
[325]	cv_agg's rmse: 0.659422 + 0.000259199
[350]	cv_agg's rmse: 0.65758 + 0.000183106
[375]	cv_agg's rmse: 0.655793 + 0.000147847
[400]	cv_agg's rmse: 0.654786 + 0.000162355
[425]	cv_agg's rmse: 0.653381 + 0.000162061
[450]	cv_agg's rmse: 0.651992 + 0.000151641
[475]	cv_agg's rmse: 0.650722 + 0.00010466
[500]	cv_agg's rmse: 0.649562 + 0.00021921
[525]	cv_agg's rmse: 0.648343 + 0.00036458
[550]	cv_agg's rmse: 0.647389 + 0.000293007
[575]	cv_agg's rmse: 0.646284 + 0.000175302
[600]



[25]	cv_agg's rmse: 1.60752 + 0.00168434
[50]	cv_agg's rmse: 1.45713 + 0.00136349
[75]	cv_agg's rmse: 1.33872 + 0.00154226
[100]	cv_agg's rmse: 1.25597 + 0.000998296
[125]	cv_agg's rmse: 1.21197 + 0.00125744
[150]	cv_agg's rmse: 1.16351 + 0.00246804
[175]	cv_agg's rmse: 1.12552 + 0.00215358
[200]	cv_agg's rmse: 1.10745 + 0.00226326
[225]	cv_agg's rmse: 1.09214 + 0.00106603
[250]	cv_agg's rmse: 1.07428 + 0.000725348
[275]	cv_agg's rmse: 1.0635 + 0.00185776
[300]	cv_agg's rmse: 1.04659 + 0.000484628
[325]	cv_agg's rmse: 1.03776 + 0.00217182
[350]	cv_agg's rmse: 1.03155 + 0.00215185
[375]	cv_agg's rmse: 1.01826 + 0.00203752
[400]	cv_agg's rmse: 1.01068 + 0.00236527
[425]	cv_agg's rmse: 1.00241 + 0.00315624
[450]	cv_agg's rmse: 1.00052 + 0.00312533
[475]	cv_agg's rmse: 0.99436 + 0.00305775
[500]	cv_agg's rmse: 0.988793 + 0.00230185
[525]	cv_agg's rmse: 0.979413 + 0.00413985
[550]	cv_agg's rmse: 0.975127 + 0.003704
[575]	cv_agg's rmse: 0.968732 + 0.00335111
[600]	cv_agg's rmse: 0.964258 + 0



[25]	cv_agg's rmse: 1.52009 + 0.000272445
[50]	cv_agg's rmse: 1.34825 + 0.000243431
[75]	cv_agg's rmse: 1.21728 + 0.00047436
[100]	cv_agg's rmse: 1.11445 + 0.000822596
[125]	cv_agg's rmse: 1.07251 + 0.000905082
[150]	cv_agg's rmse: 1.02855 + 0.00115529
[175]	cv_agg's rmse: 0.997978 + 0.00107508
[200]	cv_agg's rmse: 0.982865 + 0.00101366
[225]	cv_agg's rmse: 0.969778 + 0.00101394
[250]	cv_agg's rmse: 0.957384 + 0.000939188
[275]	cv_agg's rmse: 0.947669 + 0.000869286
[300]	cv_agg's rmse: 0.937349 + 0.000854704
[325]	cv_agg's rmse: 0.931469 + 0.000910284
[350]	cv_agg's rmse: 0.927577 + 0.000920866
[375]	cv_agg's rmse: 0.920913 + 0.000898511
[400]	cv_agg's rmse: 0.915141 + 0.00111698
[425]	cv_agg's rmse: 0.911031 + 0.000844258
[450]	cv_agg's rmse: 0.909756 + 0.000836171
[475]	cv_agg's rmse: 0.906968 + 0.000820373
[500]	cv_agg's rmse: 0.904705 + 0.000796658
[525]	cv_agg's rmse: 0.901127 + 0.000630176
[550]	cv_agg's rmse: 0.899626 + 0.000705784
[575]	cv_agg's rmse: 0.89785 + 0.000876186
[600



[25]	cv_agg's rmse: 0.875095 + 0.000378841
[50]	cv_agg's rmse: 0.693115 + 0.000646731
[75]	cv_agg's rmse: 0.656049 + 0.000549452
[100]	cv_agg's rmse: 0.639551 + 0.000228452
[125]	cv_agg's rmse: 0.628167 + 0.000242477
[150]	cv_agg's rmse: 0.619985 + 4.12991e-05
[175]	cv_agg's rmse: 0.614858 + 0.000311238
[200]	cv_agg's rmse: 0.611383 + 0.000256834
[225]	cv_agg's rmse: 0.608921 + 0.000271018
[250]	cv_agg's rmse: 0.606994 + 0.000254185
[275]	cv_agg's rmse: 0.604972 + 0.000413716
[300]	cv_agg's rmse: 0.603425 + 0.000501505
[325]	cv_agg's rmse: 0.60209 + 0.00047211
[350]	cv_agg's rmse: 0.600838 + 0.000439086
[375]	cv_agg's rmse: 0.599596 + 0.00046043
[400]	cv_agg's rmse: 0.598687 + 0.000509458
[425]	cv_agg's rmse: 0.597334 + 0.000681378
[450]	cv_agg's rmse: 0.596339 + 0.000835793
[475]	cv_agg's rmse: 0.59584 + 0.000970967
[500]	cv_agg's rmse: 0.59574 + 0.00104871
[525]	cv_agg's rmse: 0.59557 + 0.00108031
[550]	cv_agg's rmse: 0.595382 + 0.00104491
[575]	cv_agg's rmse: 0.595321 + 0.000996751




[25]	cv_agg's rmse: 1.08185 + 0.000594016
[50]	cv_agg's rmse: 0.823172 + 0.00288788
[75]	cv_agg's rmse: 0.72673 + 0.00279074
[100]	cv_agg's rmse: 0.687816 + 0.00106829
[125]	cv_agg's rmse: 0.670922 + 0.000891925
[150]	cv_agg's rmse: 0.661754 + 0.000681221
[175]	cv_agg's rmse: 0.655498 + 0.000653556
[200]	cv_agg's rmse: 0.649932 + 0.000390164
[225]	cv_agg's rmse: 0.645186 + 0.000475177
[250]	cv_agg's rmse: 0.641501 + 0.000570042
[275]	cv_agg's rmse: 0.638086 + 0.000476834
[300]	cv_agg's rmse: 0.635326 + 0.000751794
[325]	cv_agg's rmse: 0.632965 + 0.000699091
[350]	cv_agg's rmse: 0.630459 + 0.000470387
[375]	cv_agg's rmse: 0.628387 + 0.00041941
[400]	cv_agg's rmse: 0.626864 + 0.000447909
[425]	cv_agg's rmse: 0.625048 + 0.000488878
[450]	cv_agg's rmse: 0.623421 + 0.000456777
[475]	cv_agg's rmse: 0.621812 + 0.000573358
[500]	cv_agg's rmse: 0.620416 + 0.000624399
[525]	cv_agg's rmse: 0.618967 + 0.000449477
[550]	cv_agg's rmse: 0.617848 + 0.000456179
[575]	cv_agg's rmse: 0.616359 + 0.0004285



[25]	cv_agg's rmse: 0.923034 + 0.000308308
[50]	cv_agg's rmse: 0.734256 + 0.000498757
[75]	cv_agg's rmse: 0.686558 + 0.000531407
[100]	cv_agg's rmse: 0.665501 + 0.000456955
[125]	cv_agg's rmse: 0.652047 + 0.000322114
[150]	cv_agg's rmse: 0.643082 + 0.000344206
[175]	cv_agg's rmse: 0.637773 + 0.000367545
[200]	cv_agg's rmse: 0.633944 + 0.000502954
[225]	cv_agg's rmse: 0.631392 + 0.000304115
[250]	cv_agg's rmse: 0.629058 + 0.00050757
[275]	cv_agg's rmse: 0.626741 + 0.000716503
[300]	cv_agg's rmse: 0.624889 + 0.000713295
[325]	cv_agg's rmse: 0.623369 + 0.000804687
[350]	cv_agg's rmse: 0.621866 + 0.00090982
[375]	cv_agg's rmse: 0.620706 + 0.000818385
[400]	cv_agg's rmse: 0.61991 + 0.000780897
[425]	cv_agg's rmse: 0.61864 + 0.000614207
[450]	cv_agg's rmse: 0.617751 + 0.000406446
[475]	cv_agg's rmse: 0.61682 + 0.000392485
[500]	cv_agg's rmse: 0.615777 + 0.000415704
[525]	cv_agg's rmse: 0.614912 + 0.000439484
[550]	cv_agg's rmse: 0.614144 + 0.000483132
[575]	cv_agg's rmse: 0.613373 + 0.000515



[25]	cv_agg's rmse: 1.52016 + 0.000230021
[50]	cv_agg's rmse: 1.34831 + 0.000243513
[75]	cv_agg's rmse: 1.21737 + 0.000334685
[100]	cv_agg's rmse: 1.1144 + 0.000671473
[125]	cv_agg's rmse: 1.07239 + 0.000791715
[150]	cv_agg's rmse: 1.02837 + 0.000850426
[175]	cv_agg's rmse: 0.997757 + 0.000790776
[200]	cv_agg's rmse: 0.982632 + 0.000719083
[225]	cv_agg's rmse: 0.969471 + 0.000609933
[250]	cv_agg's rmse: 0.957071 + 0.000569716
[275]	cv_agg's rmse: 0.947248 + 0.000574891
[300]	cv_agg's rmse: 0.937023 + 0.000534257
[325]	cv_agg's rmse: 0.931132 + 0.000627355
[350]	cv_agg's rmse: 0.927221 + 0.000637691
[375]	cv_agg's rmse: 0.920652 + 0.000600282
[400]	cv_agg's rmse: 0.914901 + 0.000766052
[425]	cv_agg's rmse: 0.91078 + 0.000522233
[450]	cv_agg's rmse: 0.909515 + 0.000526609
[475]	cv_agg's rmse: 0.906702 + 0.000530056
[500]	cv_agg's rmse: 0.904394 + 0.00055288
[525]	cv_agg's rmse: 0.900856 + 0.000376226
[550]	cv_agg's rmse: 0.899362 + 0.00040695
[575]	cv_agg's rmse: 0.897611 + 0.000568439
[

In [None]:
|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample | subsam... |
|  7        | -0.5946   |  0.9      |  180.0    | -1.0      |  20.0     |  30.0     |  0.9      |  2.551e+0 |  0.1      |  0.1      |  0.9      |  20.0     |

In [8]:
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 2551,
    "learning_rate": 0.05,
    "colsample_bytree": 0.9,
    "reg_lambda": 0.1,
    'reg_alpha':0.1,
    "metric": "rmse",
    'max_bins':180,
    'max_depth':-1,
    'min_child_sample':20,
    'min_child_weight':30,
    'min_split_gain':0.9,
    'subsample':0.9,
    'subsample_freq':20,
     
}