# Combine weather.csv and load.csv for all buildings

Creates a csv for each building with combined weather and load columns, as well as columns for heat index, minute, hour, day, month, year, is_weekend, is_holiday, max hourly load, max and min temperature, and building id. Outputs each csv to the shared team drive.

The output csvs are located in the drive folder 'Team-Fermata-Energy/processed_data/processed_weather_load_w_timestamp' and the name of each csv is the building id.

Author: Riley Denn

Code modified from teammate Victoria Worthington's utils_tester.ipynb

In [1]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
import metpy.calc as mpcalc
from metpy.units import units
import holidays

In [2]:
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

DRIVE_PATH = config['drive_path']
EXTERNAL_DATA_PATH = DRIVE_PATH + "/[EXTERNAL] breakthrough_tech_ai_f24/data"
PROCESSED_DATA_PATH = DRIVE_PATH + "/processed_data"
PROCESSED_WEATHER_LOAD = PROCESSED_DATA_PATH + "/processed_weather_load_w_timestamp"

In [20]:
def process_building_data(bldg, external_data_path, us_holidays=holidays.US(), save_csv=False, processed_data_path=''):
    """
    Process weather and load data for a single building.
    
    Parameters:
        bldg (str): Building identifier.
        external_data_path (str): Path to the external data files.
        us_holidays (holidays.HolidayBase): US holidays object for holiday identification.
        save_csv: bool for whether or not to save the processed dataframe to a csv
        processed_data_path (str): path to save processed data csv to
        
        Note: If save_csv=True, processed_data_path must be a valid path name.
        
    Returns:
        pd.DataFrame: Processed DataFrame for the given building.
    """
    weather_path = external_data_path + "/building_data/" + bldg + "/weather.csv"
    load_path = external_data_path + "/building_data/" + bldg + "/load.csv"

    # Read weather and load data for the building
    df_weather = pd.read_csv(weather_path)
    df_load = pd.read_csv(load_path)

    # Renaming 'date_time' to 'timestamp'
    df_weather.rename(columns={'date_time': 'timestamp'}, inplace=True)

    # Convert 'timestamp' to datetime
    df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])
    df_load['timestamp'] = pd.to_datetime(df_load['timestamp'])

    # Resample weather data to 15-minute intervals and interpolate missing values
    df_weather = df_weather.set_index('timestamp').resample('15min').asfreq().interpolate(method='linear').reset_index()

    # Add heat index to weather data
    df_weather['heat_index'] = mpcalc.heat_index(
        df_weather['Dry Bulb Temperature [°C]'].values * units.degC,
        df_weather['Relative Humidity [%]'].values * units.percent
    )

    # Merge weather and load data
    df_merged = pd.merge(df_load, df_weather, on='timestamp', how='inner')

    # Add time-based features
    df_merged['minute'] = df_merged['timestamp'].dt.minute
    df_merged['hour'] = df_merged['timestamp'].dt.hour
    df_merged['day'] = df_merged['timestamp'].dt.day
    df_merged['month'] = df_merged['timestamp'].dt.month
    df_merged['year'] = df_merged['timestamp'].dt.year

    # Add weekday/weekend binary indicator
    df_merged['is_weekday'] = (df_merged['timestamp'].dt.dayofweek < 5).astype(int)

    # Add US holidays binary indicator
    df_merged['is_holiday'] = (df_merged['timestamp'].dt.date.isin(us_holidays)).astype(int)

    # Calculate max hourly load and max and min temperature
    df_merged['max_load_hourly'] = df_merged.groupby(['hour', 'day', 'month', 'year'])['out.electricity.total.energy_consumption'].transform('max')
    df_merged['max_temp_hourly'] = df_merged.groupby(['hour', 'day', 'month', 'year'])['Dry Bulb Temperature [°C]'].transform('max')
    df_merged['min_temp_hourly'] = df_merged.groupby(['hour', 'day', 'month', 'year'])['Dry Bulb Temperature [°C]'].transform('min')

    df_merged['bldg_id'] = int(bldg)

    df_merged.index.name = 'Index'
    
    # Save the processed data (optional)
    if save_csv:
        output_path = f"{processed_data_path}/{bldg}.csv"
        df_merged.to_csv(output_path, index=True)

    # Return the processed DataFrame
    return df_merged


In [21]:
# List all CSV files in the folder
building_csvs = os.listdir(EXTERNAL_DATA_PATH+"/building_data")
us_holidays = holidays.US()

In [22]:
PROCESSED_WEATHER_LOAD

'/Users/rileydenn/Library/CloudStorage/GoogleDrive-rileydenn@gmail.com/.shortcut-targets-by-id/1FsOPywSgK_wZmrVrSTBVi4q8G3Mg_yMJ/Team-Fermata-Energy/processed_data/processed_weather_load_w_timestamp'

In [23]:
sample_df=process_building_data(
    building_csvs[0], EXTERNAL_DATA_PATH, us_holidays, 
    save_csv=True, processed_data_path=PROCESSED_WEATHER_LOAD)
sample_df.head()

Unnamed: 0_level_0,timestamp,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,minute,hour,day,month,year,is_weekday,is_holiday,max_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,2018-01-01 01:00:00,8.047018,7.8,79.05113,44.059403,0,1,1,1,2018,1,0,8.047018,7.8,6.4875,39386
1,2018-01-01 01:15:00,7.632998,7.3625,80.731683,43.272139,15,1,1,1,2018,1,0,8.047018,7.8,6.4875,39386
2,2018-01-01 01:30:00,7.382527,6.925,82.412236,42.484875,30,1,1,1,2018,1,0,8.047018,7.8,6.4875,39386
3,2018-01-01 01:45:00,7.253988,6.4875,84.092789,41.697611,45,1,1,1,2018,1,0,8.047018,7.8,6.4875,39386
4,2018-01-01 02:00:00,7.106161,6.05,85.773342,40.910347,0,2,1,1,2018,1,0,7.198038,6.05,5.9,39386


In [24]:
sample_df.dtypes

timestamp                                   datetime64[ns]
out.electricity.total.energy_consumption           float64
Dry Bulb Temperature [°C]                          float64
Relative Humidity [%]                              float64
heat_index                                         float64
minute                                               int32
hour                                                 int32
day                                                  int32
month                                                int32
year                                                 int32
is_weekday                                           int64
is_holiday                                           int64
max_load_hourly                                    float64
max_temp_hourly                                    float64
min_temp_hourly                                    float64
bldg_id                                              int64
dtype: object

In [None]:
# Loop through all buildings and process them
for bldg in building_csvs:
    processed_data = process_building_data(
        bldg, EXTERNAL_DATA_PATH, us_holidays, 
        save_csv=True, processed_data_path=PROCESSED_WEATHER_LOAD
    )