# Gradient Boosted Tree Based Model using CatBoost
Author: Riley Denn  
Team Fermata Energy

In [2]:
import numpy as np
import pandas as pd
import json
import os
import ipyparallel as ipp
from catboost import CatBoostRegressor

In [3]:
with open('../../config.json', 'r') as config_file:
    config = json.load(config_file)

DRIVE_PATH = config['drive_path']
EXTERNAL_DATA_PATH = DRIVE_PATH + "/[EXTERNAL] breakthrough_tech_ai_f24/data"
PROCESSED_DATA_PATH = DRIVE_PATH + "/processed_data"
PROCESSED_WEATHER_LOAD = PROCESSED_DATA_PATH + "/processed_weather_load_w_timestamp"

In [4]:
with open('../data/subset20_data.json', 'r') as test_train_file:
    test_train_ids = json.load(test_train_file)

# train_ids = test_train_ids['train_bldg_ids']
# test_ids = test_train_ids['test_bldg_ids']

train_ids = [int(bldg_id.replace('.csv', '')) for bldg_id in test_train_ids['train_bldg_ids']]
test_ids = [int(bldg_id.replace('.csv', '')) for bldg_id in test_train_ids['test_bldg_ids']]

In [5]:
df_metadata = pd.read_csv(PROCESSED_DATA_PATH + "/subset20.csv")
df_metadata.head()

# use pandas categorical to undo one hot encoding

Unnamed: 0,bldg_id,in.state,in.cluster_id,in.vintage,in.sqft,in.building_america_climate_zone_Cold,in.building_america_climate_zone_Hot-Dry,in.building_america_climate_zone_Hot-Humid,in.building_america_climate_zone_Marine,in.building_america_climate_zone_Mixed-Dry,...,in.comstock_building_type_SecondarySchool,in.comstock_building_type_SmallHotel,in.comstock_building_type_SmallOffice,in.comstock_building_type_Warehouse,in.comstock_building_type_group_Education,in.comstock_building_type_group_Food Service,in.comstock_building_type_group_Lodging,in.comstock_building_type_group_Mercantile,in.comstock_building_type_group_Office,in.comstock_building_type_group_Warehouse and Storage
0,105885,10,42.0,3,750000.0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,305819,40,74.0,2,150000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,305934,40,75.0,4,350000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,317044,40,75.0,3,350000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,32,1,53.0,6,37500.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [6]:
df_metadata.columns

Index(['bldg_id', 'in.state', 'in.cluster_id', 'in.vintage', 'in.sqft',
       'in.building_america_climate_zone_Cold',
       'in.building_america_climate_zone_Hot-Dry',
       'in.building_america_climate_zone_Hot-Humid',
       'in.building_america_climate_zone_Marine',
       'in.building_america_climate_zone_Mixed-Dry',
       'in.building_america_climate_zone_Mixed-Humid',
       'in.building_america_climate_zone_Subarctic',
       'in.building_america_climate_zone_Very Cold', 'in.iso_rto_region_CAISO',
       'in.iso_rto_region_ERCOT', 'in.iso_rto_region_MISO',
       'in.iso_rto_region_NEISO', 'in.iso_rto_region_NYISO',
       'in.iso_rto_region_PJM', 'in.iso_rto_region_SPP',
       'in.iso_rto_region_none', 'in.heating_fuel_DistrictHeating',
       'in.heating_fuel_Electricity', 'in.heating_fuel_FuelOil',
       'in.heating_fuel_NaturalGas', 'in.heating_fuel_Propane',
       'in.interior_lighting_generation_gen1_t12_incandescent',
       'in.interior_lighting_generation_gen2_t

In [7]:
sample_bldg_id = df_metadata['bldg_id'][1]
sample_df = pd.read_csv(f"{PROCESSED_WEATHER_LOAD}/{sample_bldg_id}.csv")
sample_df.head()

Unnamed: 0,timestamp,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,minute,hour,day,month,is_weekday,is_holiday,max_load_hourly,min_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id
0,2018-01-01 01:00:00,112.398425,2.984615,75.216154,37.372308,0,1,1,1,1,1,195.978207,112.398425,2.984615,2.719231,305819
1,2018-01-01 01:15:00,195.978207,2.896154,75.650769,37.213077,15,1,1,1,1,1,195.978207,112.398425,2.984615,2.719231,305819
2,2018-01-01 01:30:00,193.125037,2.807692,76.085385,37.053846,30,1,1,1,1,1,195.978207,112.398425,2.984615,2.719231,305819
3,2018-01-01 01:45:00,190.015538,2.719231,76.52,36.894615,45,1,1,1,1,1,195.978207,112.398425,2.984615,2.719231,305819
4,2018-01-01 02:00:00,186.667871,2.630769,76.954615,36.735385,0,2,1,1,1,1,194.222126,186.667871,2.630769,2.140385,305819


In [8]:
sample_df.columns

Index(['timestamp', 'out.electricity.total.energy_consumption',
       'Dry Bulb Temperature [°C]', 'Relative Humidity [%]', 'heat_index',
       'minute', 'hour', 'day', 'month', 'is_weekday', 'is_holiday',
       'max_load_hourly', 'min_load_hourly', 'max_temp_hourly',
       'min_temp_hourly', 'bldg_id'],
      dtype='object')

In [11]:
sample_bldg_id = df_metadata['bldg_id'][1]
sample_df = pd.read_csv(f"{PROCESSED_WEATHER_LOAD}/{sample_bldg_id}.csv")
sample_metadata = df_metadata[df_metadata['bldg_id'] == sample_bldg_id]
sample_df = sample_df.merge(sample_metadata, on='bldg_id', how='left')
sample_df.set_index('timestamp', inplace=True)

if sample_df.isnull().any().any():
    print(f"Columns with missing values for building {sample_bldg_id}: {sample_df.columns[sample_df.isnull().any()]}") 
sample_df.head()


Unnamed: 0_level_0,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,minute,hour,day,month,is_weekday,is_holiday,...,in.comstock_building_type_SecondarySchool,in.comstock_building_type_SmallHotel,in.comstock_building_type_SmallOffice,in.comstock_building_type_Warehouse,in.comstock_building_type_group_Education,in.comstock_building_type_group_Food Service,in.comstock_building_type_group_Lodging,in.comstock_building_type_group_Mercantile,in.comstock_building_type_group_Office,in.comstock_building_type_group_Warehouse and Storage
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01 01:00:00,112.398425,2.984615,75.216154,37.372308,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
2018-01-01 01:15:00,195.978207,2.896154,75.650769,37.213077,15,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
2018-01-01 01:30:00,193.125037,2.807692,76.085385,37.053846,30,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
2018-01-01 01:45:00,190.015538,2.719231,76.52,36.894615,45,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
2018-01-01 02:00:00,186.667871,2.630769,76.954615,36.735385,0,2,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0


In [None]:
model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6)

# Define your feature columns (excluding target column and bldg_id)
X = sample_df.drop(columns=['out.electricity.total.energy_consumption', 'bldg_id'])
y = sample_df['out.electricity.total.energy_consumption']

# Fit the model to this building's data
model.fit(X, y)

# Save the model if needed (for later use with other buildings)
model.save_model(f'catboost_model_building_{building_id}.cbm')

In [110]:
# def prep_features_label(bldg_id, df_metadata, data_path):
#     import pandas as pd
#     print(data_path)
#     PROCESSED_WEATHER_LOAD = data_path
#     df_bldg = pd.read_csv(f"{PROCESSED_WEATHER_LOAD}/{bldg_id}.csv")

#     df_bldg['timestamp'] = pd.to_datetime(df_bldg['timestamp'])
#     df_bldg['month'] = df_bldg['timestamp'].dt.month
#     df_bldg['day'] = df_bldg['timestamp'].dt.day
#     df_bldg['hour'] = df_bldg['timestamp'].dt.hour
#     df_bldg['minute'] = df_bldg['timestamp'].dt.minute

#     df_bldg.set_index('timestamp', inplace=True)

#     bldg_metadata = df_metadata[df_metadata['bldg_id'] == bldg_id]
#     df_bldg = df_bldg.merge(bldg_metadata, on='bldg_id', how='left')

#     X = df_bldg.drop(columns=['out.electricity.total.energy_consumption', 'bldg_id'])
#     y = df_bldg['out.electricity.total.energy_consumption']

#     return X, y, bldg_id


# def train_in_chunks(df_metadata, train_bldg_ids):
#     model = None


#     # with cluster[:].map_async(prep_features_label, train_bldg_ids, df_metadata) as async_result:
#     # results = async_result.get()
    
    
#     with ipp.Cluster(n=4).start_and_connect_sync() as rc:
#         e_all = rc[:]
#         #ar = e_all.map_async(prep_features_label, train_bldg_ids, df_metadata, PROCESSED_WEATHER_LOAD)
#         ar = e_all.map_async(prep_features_label, train_bldg_ids, [df_metadata] * len(train_bldg_ids), [PROCESSED_WEATHER_LOAD] * len(train_bldg_ids))
#         results = ar.get()
    
#     # Use the results for further processing
#     for result in results:
#         X, y, bldg_id = result
        
#         # Train model or process data...
        
#         # Initialize model if it's the first building
#         if model is None:
#             model = CatBoostRegressor(thread_count=-1)
#             model.fit(X, y, verbose=100)
#         else:
#             # After the first model, check if feature consistency is maintained
#             if set(X.columns) != set(model.feature_names_):
#                 print(f"Feature mismatch for building {bldg_id}")
#                 # Handle feature mismatch, e.g., reorder or reprocess features

#             model.fit(X, y, init_model=model, verbose=100)

#     # Stop the cluster
#     rc.stop()
    
    # return model


# def train_in_chunks(df_metadata, train_bldg_ids):
#     model = None

#     with concurrent.futures.ProcessPoolExecutor() as executor:
#         futures = []
#         # Submit tasks for each building
#         for bldg_id in train_bldg_ids:
#             futures.append(executor.submit(prep_features_label, bldg_id, df_metadata))

#         for future in concurrent.futures.as_completed(futures):
#             X, y, bldg_id = future.result()

#             # Initialize model if it's the first building
#             if model is None:
#                 model = CatBoostRegressor()
#                 model.fit(X, y, verbose=100)
#             else:
#                 # # After the first model, check if feature consistency is maintained
#                 # if set(X.columns) != set(model.feature_names_):
#                 #     print(f"Feature mismatch for building {bldg_id}")
#                 #     # Handle feature mismatch, e.g., reorder or reprocess features

#                 model.fit(X, y, init_model=model, verbose=100)

#     return model

def train_in_chunks(df_metadata, train_bldg_ids):

    '''model = None
    for id in buildings:
        new_model = CatboostRegressor()
        if model is None:
            new_model.fit(...)
        else:
            new_model.fit(..., init_model=model)
        model = new_model''' # Try this
    
    model = None

    # Loop through each building and process
    for bldg_id in train_bldg_ids:
        df_bldg = pd.read_csv(f"{PROCESSED_WEATHER_LOAD}/{bldg_id}.csv")

        df_bldg['timestamp'] = pd.to_datetime(df_bldg['timestamp'])

        df_bldg['month'] = df_bldg['timestamp'].dt.month
        df_bldg['day'] = df_bldg['timestamp'].dt.day
        df_bldg['hour'] = df_bldg['timestamp'].dt.hour
        df_bldg['minute'] = df_bldg['timestamp'].dt.minute

        df_bldg.set_index('timestamp', inplace=True)

        bldg_metadata = df_metadata[df_metadata['bldg_id'] == bldg_id]
        df_bldg = df_bldg.merge(bldg_metadata, on='bldg_id', how='left')

        # Prepare features (X) and label (y)
        X = df_bldg.drop(columns=['out.electricity.total.energy_consumption', 'bldg_id'])
        y = df_bldg['out.electricity.total.energy_consumption']

        if model is None:
            model = CatBoostRegressor(iterations=100, thread_count=-1)
            model.fit(X, y, verbose=100)
        else:
            # After the first model, check if feature consistency is maintained
            if set(X.columns) != set(model.feature_names_):
                print(f"Feature mismatch for building {bldg_id}")
                # Handle feature mismatch, e.g., reorder or reprocess features
            
            model.fit(X, y, init_model=model, verbose=100) # can try getting rid of init_model ?

    return model


In [112]:
# can use different models for different types of buildings

model = train_in_chunks(df_metadata, train_ids[:100])

model.save_model('catboost_model_subset_20.cbm')

Learning rate set to 0.46689
0:	learn: 1.5224869	total: 2.92ms	remaining: 289ms
99:	learn: 0.1293735	total: 259ms	remaining: 0us
Learning rate set to 0.422987
0:	learn: 31.5979759	total: 3.37ms	remaining: 333ms
99:	learn: 1.4480021	total: 263ms	remaining: 0us
Learning rate set to 0.422987
0:	learn: 1.3185012	total: 2.81ms	remaining: 279ms
99:	learn: 0.4771251	total: 261ms	remaining: 0us
Learning rate set to 0.422987
0:	learn: 0.2756078	total: 2.37ms	remaining: 235ms
99:	learn: 0.1317135	total: 256ms	remaining: 0us
Learning rate set to 0.422987
0:	learn: 1.2004166	total: 2.54ms	remaining: 251ms
99:	learn: 0.5888917	total: 255ms	remaining: 0us
Learning rate set to 0.422987
0:	learn: 3.5186381	total: 2.39ms	remaining: 236ms
99:	learn: 2.2791750	total: 256ms	remaining: 0us
Learning rate set to 0.422987
0:	learn: 18.3313444	total: 2.79ms	remaining: 276ms
99:	learn: 8.2945713	total: 249ms	remaining: 0us
Learning rate set to 0.422987
0:	learn: 6.2023246	total: 2.68ms	remaining: 266ms
99:	lear

OSError: [Errno 89] Operation canceled

In [64]:
missing_bldg_ids = set(train_ids) - set(df_metadata['bldg_id'])
print(f"Missing building IDs in metadata: {missing_bldg_ids}")

Missing building IDs in metadata: set()


In [84]:
import ipyparallel as ipp

rc = ipp.Cluster(n=4).start_and_connect_sync()

# Check the status of the cluster
print("Cluster is connected: ", rc.ids)

Starting 4 engines with <class 'ipyparallel.cluster.launcher.LocalEngineSetLauncher'>


  0%|          | 0/4 [00:00<?, ?engine/s]

Cluster is connected:  [0, 1, 2, 3]


In [93]:
with ipp.Cluster(n=4) as rc:
    e_all = rc[:]
    ar = e_all.apply_sync(prep_features_label, train_ids, df_metadata)  # Apply function synchronously
    ar.wait_interactive()
    results = ar.get()  # Get the results once computation is complete

# Output the results
print(results)

Starting 4 engines with <class 'ipyparallel.cluster.launcher.LocalEngineSetLauncher'>


  0%|          | 0/4 [00:00<?, ?engine/s]

Stopping engine(s): 1731542593
engine set stopped 1731542593: {'engines': {'0': {'exit_code': 0, 'pid': 81681, 'identifier': '0'}, '1': {'exit_code': 0, 'pid': 81682, 'identifier': '1'}, '2': {'exit_code': 0, 'pid': 81683, 'identifier': '2'}, '3': {'exit_code': 0, 'pid': 81684, 'identifier': '3'}}, 'exit_code': 0}
Stopping controller
Controller stopped: {'exit_code': 0, 'pid': 81671, 'identifier': 'ipcontroller-1731542592-2jyy-79661'}


CompositeError: one or more exceptions raised in: prep_features_label
[0:apply]NameError: name 'PROCESSED_WEATHER_LOAD' is not defined
[1:apply]NameError: name 'PROCESSED_WEATHER_LOAD' is not defined
[2:apply]NameError: name 'PROCESSED_WEATHER_LOAD' is not defined
[3:apply]NameError: name 'PROCESSED_WEATHER_LOAD' is not defined

In [2]:
# Code from Cindy

import torch
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU")

Using CPU


In [None]:
# Code from Cindy - template for using Pool

from catboost import CatBoostClassifier, Pool
def batch_data_generator(data, labels, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i+batch_size], labels[i:i+batch_size]
model = CatBoostClassifier(iterations=1000, task_type="GPU")
X, y = ...
batch_size = 10000
for X_batch, y_batch in batch_data_generator(X, y, batch_size):
    batch_pool = Pool(data=X_batch, label=y_batch)
    model.fit(batch_pool, init_model=model, use_best_model=False)