In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
df_train= pd.read_csv('../../Large_output/train_clean_merge.csv')

In [4]:
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [5]:
df_train = reduce_mem_usage(df_train,use_float16=True)

Memory usage of dataframe is 2638.86 MB
Memory usage after optimization is: 733.78 MB
Decreased by 72.2%


In [6]:
train_engineer = features_engineering(df_train)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)

In [7]:
target = np.log1p(df_train["meter_reading"])
features = df_train[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

In [10]:
# xgb bayesian: 
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
dtrain = xgb.DMatrix(features, label=target)

In [None]:
# run on gpu

In [17]:
def xgb_evaluate(max_depth, subsample,gamma, colsample_bytree, max_leaves,max_bin, min_child_weight,colsample_bylevel,\
                reg_alpha,reg_lambda, random_seed=6):
    params = {'eval_metric': 'rmse',\
              'objective': 'reg:linear',\
              'booster':'gbtree',\
              'max_depth': int(max_depth),\
              'subsample': subsample,\
              'eta': 0.05,\
              'tree_method':'gpu_hist',\
              'gamma': gamma,\
              'colsample_bytree': colsample_bytree,\
              'max_leaves': int(max_leaves),\
              'max_bin':int(max_bin),\
              'min_child_weight':min_child_weight,\
              'colsample_bylevel':colsample_bylevel,\
              'reg_alpha':reg_alpha,\
              'reg_lambda':reg_lambda,
              'n_gpus': 2}
    cv_result = xgb.cv(params, dtrain, num_boost_round=1000, nfold=3,seed=random_seed, stratified=False, verbose_eval=100,early_stopping_rounds=50)    
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [19]:
xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (0, 12), \
                                             'gamma': (0, 1),\
                                             'subsample': (0.1, 0.9),\
                                            'max_leaves': (1000, 2000),\
                                            'colsample_bytree': (0.1, 0.9),\
                                            'reg_lambda': (0.1, 2),\
                                            'reg_alpha': (0.1, 2),\
                                            'max_bin':(180,500),\
                                            'colsample_bylevel':(0.1,0.9),\
                                            'min_child_weight': (3, 20)})
# Use the expected improvement acquisition function to handle negative numbers
xgb_bo.maximize(init_points=3, n_iter=10, acq='ei')

|   iter    |  target   | colsam... | colsam... |   gamma   |  max_bin  | max_depth | max_le... | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
[0]	train-rmse:4.12314+0.0003706	test-rmse:4.12314+0.000676327
[100]	train-rmse:1.39528+0.00200797	test-rmse:1.39561+0.00249345
[200]	train-rmse:1.26164+0.00378422	test-rmse:1.26224+0.00386222
[300]	train-rmse:1.19906+0.00115681	test-rmse:1.19987+0.000617367
[400]	train-rmse:1.1564+0.00218372	test-rmse:1.1574+0.00200922
[500]	train-rmse:1.12336+0.00814927	test-rmse:1.12458+0.0075568
[600]	train-rmse:1.09848+0.00801517	test-rmse:1.09987+0.00742283
[700]	train-rmse:1.08005+0.00944957	test-rmse:1.08159+0.00898835
[800]	train-rmse:1.06308+0.00765239	test-rmse:1.0648+0.00711621
[900]	train-rmse:1.047+0.00934657	test-rmse:1.04888+0.00886026
[999]	train-rmse:1.03257+0.00904562	test-rmse:1.03462+0.0086239
|

[0]	train-rmse:4.12191+0.000350515	test-rmse:4.12191+0.00066408
[100]	train-rmse:1.27258+0.00496985	test-rmse:1.2735+0.00481093
[200]	train-rmse:1.15728+0.00521147	test-rmse:1.15893+0.0052561
[300]	train-rmse:1.09291+0.00361559	test-rmse:1.09518+0.00316121
[400]	train-rmse:1.05661+0.00574123	test-rmse:1.05941+0.00539948
[500]	train-rmse:1.02624+0.00727025	test-rmse:1.02947+0.00700512
[600]	train-rmse:1.00122+0.00704845	test-rmse:1.00492+0.00709764
[700]	train-rmse:0.977006+0.00545869	test-rmse:0.981163+0.0054098
[800]	train-rmse:0.957564+0.00632136	test-rmse:0.962168+0.00645766
[900]	train-rmse:0.939693+0.0052722	test-rmse:0.944736+0.00542688
[999]	train-rmse:0.925171+0.00535174	test-rmse:0.930638+0.00544016
| [0m 7       [0m | [0m-0.9306  [0m | [0m 0.1     [0m | [0m 0.9     [0m | [0m 1.0     [0m | [0m 180.0   [0m | [0m 12.0    [0m | [0m 1.281e+0[0m | [0m 3.0     [0m | [0m 2.0     [0m | [0m 2.0     [0m | [0m 0.1     [0m |
[0]	train-rmse:4.12649+0.000316229	test-

[0]	train-rmse:4.09881+0.000269857	test-rmse:4.09882+0.000740935
[100]	train-rmse:0.86566+0.0011386	test-rmse:0.868008+0.00160907
[200]	train-rmse:0.78165+0.000440528	test-rmse:0.785763+0.00117285
[300]	train-rmse:0.740664+0.0017635	test-rmse:0.746295+0.00260847
[400]	train-rmse:0.71583+0.0011753	test-rmse:0.722903+0.00173396
[500]	train-rmse:0.69637+0.00183903	test-rmse:0.704926+0.0020122
[600]	train-rmse:0.68202+0.00143813	test-rmse:0.692061+0.00153493
[700]	train-rmse:0.670476+0.00125857	test-rmse:0.682002+0.00123233
[800]	train-rmse:0.660016+0.00117765	test-rmse:0.673081+0.00120619
[900]	train-rmse:0.651486+0.00100987	test-rmse:0.666026+0.000918536
[999]	train-rmse:0.643945+0.00103792	test-rmse:0.659999+0.00097396
| [0m 13      [0m | [0m-0.66    [0m | [0m 0.9     [0m | [0m 0.9     [0m | [0m 0.0     [0m | [0m 371.5   [0m | [0m 0.0     [0m | [0m 1.315e+0[0m | [0m 20.0    [0m | [0m 0.1     [0m | [0m 0.1     [0m | [0m 0.9     [0m |


In [None]:
|   iter    |  target   | colsam... | colsam... |   gamma   |  max_bin  | max_depth | max_le... | min_ch... | reg_alpha | reg_la... | subsample |
|  11       | -0.6449   |  0.9      |  0.9      |  1.0      |  500.0    |  12.0     |  2e+03    |  3.0      |  0.1      |  0.1      |  0.9      |

In [None]:
params = {'eval_metric': 'rmse',\
              'objective': 'reg:squarederror',\
              'booster':'gbtree',\
              'nthread' : 4,\
              'eta' : 0.05,\
              'max_leaves': 2000,\
              'max_depth' : 12,\
              'subsample' : 0.9,\
              'colsample_bytree' : 0.9,\
              'colsample_bylevel' : 0.9,\
             'gamma':1.0,\
             'max_bin':500,\
             'min_child_weight':3.0,\
             'reg_alpha':0.1,\
             'reg_lambda':0.1}