# Persistence Benchmark Model
This benchmark model uses the load from 24 hours prior to predict the load at any given day/time.

Author: Riley Denn

In [10]:
import pandas as pd
import numpy as np
import json
import os
import random
from sklearn.model_selection import train_test_split

In [3]:
# Read the shared project configuration and derive the data directories used
# throughout this notebook from its `drive_path` entry.
with open('../../config.json', 'r') as config_file:
    config = json.loads(config_file.read())

DRIVE_PATH = config['drive_path']
EXTERNAL_DATA_PATH = f"{DRIVE_PATH}/[EXTERNAL] breakthrough_tech_ai_f24/data"
PROCESSED_DATA_PATH = f"{DRIVE_PATH}/processed_data"
PROCESSED_WEATHER_LOAD = f"{PROCESSED_DATA_PATH}/processed_weather_load_w_timestamp"

In [4]:
# Load the building metadata table and preview its first rows.
metadata_csv = f"{PROCESSED_DATA_PATH}/subset20.csv"
df_metadata = pd.read_csv(metadata_csv)
df_metadata.head()

Unnamed: 0,bldg_id,in.state,in.cluster_id,in.vintage,in.sqft,in.building_america_climate_zone_Cold,in.building_america_climate_zone_Hot-Dry,in.building_america_climate_zone_Hot-Humid,in.building_america_climate_zone_Marine,in.building_america_climate_zone_Mixed-Dry,...,in.comstock_building_type_SecondarySchool,in.comstock_building_type_SmallHotel,in.comstock_building_type_SmallOffice,in.comstock_building_type_Warehouse,in.comstock_building_type_group_Education,in.comstock_building_type_group_Food Service,in.comstock_building_type_group_Lodging,in.comstock_building_type_group_Mercantile,in.comstock_building_type_group_Office,in.comstock_building_type_group_Warehouse and Storage
0,105885,10,42.0,3,750000.0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,305819,40,74.0,2,150000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,305934,40,75.0,4,350000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,317044,40,75.0,3,350000.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,32,1,53.0,6,37500.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [5]:
# Display the (rows, columns) dimensions of the metadata table.
df_metadata.shape

(6401, 53)

In [6]:
# Load one building's weather/load series, index it by timestamp, and attach the
# persistence feature: the load observed 96 rows earlier. Rows containing any
# missing value (including the first 96, which have no lagged load) are dropped.
sample_weather_load_path = f"{PROCESSED_WEATHER_LOAD}/60127.csv"
sample_weather_load = (
    pd.read_csv(sample_weather_load_path)
    .set_index('timestamp')
    .assign(
        load_24h_ago=lambda frame: frame['out.electricity.total.energy_consumption'].shift(96)
    )
    .dropna()
)
sample_weather_load.head(96)

Unnamed: 0_level_0,Index,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,minute,hour,day,month,year,is_weekday,is_holiday,max_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id,load_24h_ago
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-01-02 01:00:00,96,4.834816,-11.700,81.910513,10.940,0,1,2,1,2018,1,0,4.834816,-11.7,-12.075,60127,4.835047
2018-01-02 01:15:00,97,4.123039,-11.825,81.894723,10.715,15,1,2,1,2018,1,0,4.834816,-11.7,-12.075,60127,4.123039
2018-01-02 01:30:00,98,3.411031,-11.950,81.878932,10.490,30,1,2,1,2018,1,0,4.834816,-11.7,-12.075,60127,3.411031
2018-01-02 01:45:00,99,3.279638,-12.075,81.863142,10.265,45,1,2,1,2018,1,0,4.834816,-11.7,-12.075,60127,3.279638
2018-01-02 02:00:00,100,3.148244,-12.200,81.847352,10.040,0,2,2,1,2018,1,0,3.148244,-12.2,-13.475,60127,3.148244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-02 23:45:00,187,9.484679,-8.900,74.513243,15.980,45,23,2,1,2018,1,0,11.234517,-8.9,-8.900,60127,9.515865
2018-01-03 00:00:00,188,8.398514,-8.900,74.513241,15.980,0,0,3,1,2018,1,0,8.398514,-8.9,-8.900,60127,8.429700
2018-01-03 00:15:00,189,7.312349,-8.900,74.513240,15.980,15,0,3,1,2018,1,0,8.398514,-8.9,-8.900,60127,7.343535
2018-01-03 00:30:00,190,6.226184,-8.900,74.513239,15.980,30,0,3,1,2018,1,0,8.398514,-8.9,-8.900,60127,6.257370


In [7]:
# Sanity check that the shift worked: for every row i >= 96, `load_24h_ago` at
# row i must equal the actual load at row i - 96.
LAG_STEPS = 96
actual_load = sample_weather_load['out.electricity.total.energy_consumption'].to_numpy()
lagged_load = sample_weather_load['load_24h_ago'].to_numpy()

# Compare the two aligned slices in one vectorised call. Unlike a per-row loop
# that sets the flag inside its body, `working` is always defined here, even
# when the frame has 96 rows or fewer (both slices are then empty -> True).
working = bool(np.array_equal(actual_load[:-LAG_STEPS], lagged_load[LAG_STEPS:]))

print(working)

True


In [22]:
def calculate_smape(actual, predicted):
    """Return the symmetric mean absolute percentage error (sMAPE) in percent.

    Each point contributes ``|predicted - actual| / ((|actual| + |predicted|) / 2)``
    and the result is the mean of those terms multiplied by 100.

    Points where both ``actual`` and ``predicted`` are zero would evaluate to
    0/0. They are scored as 0 (a perfect prediction) instead of producing NaN,
    so a single all-zero reading can no longer turn the whole score into NaN
    (NumPy arrays) or be silently left out of the mean (pandas Series).

    Parameters
    ----------
    actual : array-like supporting NumPy arithmetic (ndarray, pandas Series, scalar)
        Observed values.
    predicted : same kind and shape as ``actual``
        Forecast values.

    Returns
    -------
    float
        sMAPE as a percentage, between 0 and 200.
    """
    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    # The denominator is zero only when both inputs are zero, in which case the
    # numerator is zero as well. Adding the boolean mask turns those
    # denominators into 1 (0/1 = 0) and leaves every other entry untouched.
    both_zero = denominator == 0
    diff = numerator / (denominator + both_zero)
    smape = np.mean(diff) * 100  # Multiply by 100 to get percentage
    return smape

In [27]:
MAX_BUILDINGS = 5000  # upper bound on the number of buildings to sample
SAMPLE_SEED = 42      # fixed seed so the sampled subset is reproducible

# os.listdir returns entries in arbitrary order and may include non-data entries
# (hidden files, checkpoint folders). Keep only the per-building CSV files and
# sort them so that the seeded sample below is identical on every run.
building_ids = sorted(
    entry for entry in os.listdir(PROCESSED_WEATHER_LOAD) if entry.endswith(".csv")
)

if len(building_ids) < MAX_BUILDINGS:
    print(f"Only {len(building_ids)} building files available. Adjusting the subset size.")
    subset_size = len(building_ids)
else:
    subset_size = MAX_BUILDINGS

# A dedicated Random instance keeps the draw deterministic without touching the
# global random state used elsewhere.
subset = random.Random(SAMPLE_SEED).sample(building_ids, subset_size)

Only 1400 building files available. Adjusting the subset size.


Note: When data in ".../processed_data/processed_weather_load_w_timestamp" finishes processing, replace above block with:
building_ids = df_metadata['bldg_id'].tolist()

and in loop below, change:
path = f"{PROCESSED_WEATHER_LOAD}/{bldg}" 
to 
path = f"{PROCESSED_WEATHER_LOAD}/{bldg}.csv"

In [28]:
LOAD_COLUMN = 'out.electricity.total.energy_consumption'

smape_scores = []

# Score the persistence forecast (load 96 rows earlier) for every building file.
for bldg in building_ids:

    path = f"{PROCESSED_WEATHER_LOAD}/{bldg}"

    # Each file is read exactly once; missing files are reported and skipped.
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found for building ID {bldg}. Skipping...")
        continue

    df = df.set_index('timestamp')
    df['load_24h_ago'] = df[LOAD_COLUMN].shift(96)
    # Drops every row with a missing value, which includes the first 96 rows
    # that have no lagged load to compare against.
    df = df.dropna()

    smape = calculate_smape(actual=df[LOAD_COLUMN], predicted=df['load_24h_ago'])
    smape_scores.append(smape)

# Calculate the average SMAPE across all buildings
# (NaN when no building could be scored, instead of averaging an empty list).
average_smape = np.mean(smape_scores) if smape_scores else float('nan')
print(f'Average SMAPE across all buildings: {average_smape:.2f}%')

Average SMAPE across all buildings: 18.94%
