In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import gc

In [2]:
df_train= pd.read_csv('../../Large_output/train_clean_merge.csv')

In [3]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [5]:
# reduce memory usage
df_train = reduce_mem_usage(df_train,use_float16=True)

Memory usage of dataframe is 2638.86 MB
Memory usage after optimization is: 733.78 MB
Decreased by 72.2%


In [6]:
# train feature engineering and change the site_0 unit
train_engineer = features_engineering(df_train)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)

In [7]:
# get the target and features
target = np.log1p(df_train["meter_reading"])
features = df_train[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

In [9]:
import catboost as cgb

In [10]:
# define the function for optimization and setup the parameter range
def cat_hyp(depth, bagging_temperature):
    params = {'iterations': 1000, 'learning_rate': 0.05, 'eval_metric': 'RMSE', 'loss_function': 'RMSE', 
             'verbose': False, 'metric_period': 100, 'task_type': 'GPU'}
    params['depth'] = int(round(depth))
    params['bagging_temperature'] = bagging_temperature
    cat_feat = ["building_id", "site_id", "meter", 
                        "primary_use",  "weekend",'is_holiday']
    cv_dataset = cgb.Pool(data=features, label = target, cat_features = cat_feat)
    scores = cgb.cv(cv_dataset, params, fold_count = 3)
    return -1.0*np.min(scores['test-RMSE-mean'])

In [11]:
# setup the parameter opt range
pds = {'depth': (1,12), 'bagging_temperature': (3,10)}

In [12]:
from bayes_opt import BayesianOptimization

In [13]:
# range opt
optimizer = BayesianOptimization(cat_hyp, pds, random_state = 42)
optimizer.maximize(init_points = 3, n_iter=9)

|   iter    |  target   | baggin... |   depth   |
-------------------------------------------------
| [0m 1       [0m | [0m-0.8063  [0m | [0m 5.622   [0m | [0m 11.46   [0m |
| [0m 2       [0m | [0m-0.9055  [0m | [0m 8.124   [0m | [0m 7.585   [0m |
| [0m 3       [0m | [0m-1.024   [0m | [0m 4.092   [0m | [0m 2.716   [0m |
| [0m 4       [0m | [0m-0.8536  [0m | [0m 10.0    [0m | [0m 12.0    [0m |
| [0m 5       [0m | [0m-0.8243  [0m | [0m 3.0     [0m | [0m 12.0    [0m |
| [95m 6       [0m | [95m-0.8     [0m | [95m 6.155   [0m | [95m 12.0    [0m |
| [95m 7       [0m | [95m-0.7945  [0m | [95m 5.479   [0m | [95m 12.0    [0m |
| [0m 8       [0m | [0m-1.461   [0m | [0m 10.0    [0m | [0m 1.0     [0m |
| [0m 9       [0m | [0m-0.8403  [0m | [0m 3.0     [0m | [0m 7.246   [0m |
| [0m 10      [0m | [0m-0.7985  [0m | [0m 4.769   [0m | [0m 12.0    [0m |
| [0m 11      [0m | [0m-0.7947  [0m | [0m 5.312   [0m | [0m 12.0 

In [None]:
# best is 5.335  12.0