# Combine weather.csv and load.csv for all buildings

Creates a csv for each building with combined weather and load columns, as well as columns for heat index, minute, hour, day, month, is_weekday, is_holiday, max and min hourly load, max and min hourly temperature, and building id. Outputs each csv to the shared team drive.

The output csvs are located in the drive folder 'Team-Fermata-Energy/processed_data/processed_weather_load_w_timestamp' and the name of each csv is the building id.

Author: Riley Denn

Code modified from teammate Victoria Worthington's utils_tester.ipynb

In [2]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
import metpy.calc as mpcalc
from metpy.units import units
import holidays

In [6]:
# Load machine-specific settings. config.json is read from two directories
# above the working directory and must contain a "drive_path" entry.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of relying on the locale default.
with open('../../config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# All data locations are derived from the configured drive path.
DRIVE_PATH = config['drive_path']
EXTERNAL_DATA_PATH = DRIVE_PATH + "/[EXTERNAL] breakthrough_tech_ai_f24/data"
PROCESSED_DATA_PATH = DRIVE_PATH + "/processed_data"
PROCESSED_WEATHER_LOAD = PROCESSED_DATA_PATH + "/processed_weather_load_w_timestamp"

In [7]:
def process_building_data(bldg, external_data_path, us_holidays=None, save_csv=False, processed_data_path=''):
    """
    Process weather and load data for a single building.

    Reads the building's weather.csv and load.csv, upsamples the weather data
    to 15-minute intervals (linear interpolation), inner-joins it with the load
    data on 'timestamp', and adds heat index, calendar, holiday, hourly
    min/max, and building-id columns.

    Parameters:
        bldg (str): Building identifier. Must be convertible to int, since it
            is stored in the 'bldg_id' column as an integer.
        external_data_path (str): Path to the external data files. The
            building's files are read from
            '<external_data_path>/building_data/<bldg>/'.
        us_holidays (set of datetime.date): A set of US holiday dates for the
            relevant year(s). If None (default), US holidays are generated for
            every year that appears in the merged data.
        save_csv: bool for whether or not to save the processed dataframe to a csv
        processed_data_path (str): path of the directory to save the processed
            data csv to; the file is named '<bldg>.csv'.

        Note: If save_csv=True, processed_data_path must be a valid path name.

    Returns:
        pd.DataFrame: Processed DataFrame for the given building.

    Raises:
        ValueError: If save_csv is True but processed_data_path is empty.
    """
    # Fail fast: an empty output directory would otherwise make the f-string
    # below resolve to '/<bldg>.csv' at the filesystem root.
    if save_csv and not processed_data_path:
        raise ValueError("processed_data_path must be provided when save_csv=True")

    building_dir = f"{external_data_path}/building_data/{bldg}"
    weather_path = f"{building_dir}/weather.csv"
    load_path = f"{building_dir}/load.csv"

    # Read weather and load data for the building
    df_weather = pd.read_csv(weather_path)
    df_load = pd.read_csv(load_path)

    # The weather file names its time column 'date_time'; align it with the
    # load file's 'timestamp' so the two frames can be merged on it.
    df_weather = df_weather.rename(columns={'date_time': 'timestamp'})

    # Convert 'timestamp' to datetime
    df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])
    df_load['timestamp'] = pd.to_datetime(df_load['timestamp'])

    # Resample weather data to 15-minute intervals and interpolate missing values
    df_weather = (
        df_weather.set_index('timestamp')
        .resample('15min')
        .asfreq()
        .interpolate(method='linear')
        .reset_index()
    )

    # Add heat index to weather data.
    # NOTE(review): the sample output in this notebook shows a heat index of
    # ~47 for a dry-bulb temperature of ~8.9 °C, so this column appears to be
    # in °F while the temperature columns are in °C -- confirm before mixing
    # the two in downstream features.
    df_weather['heat_index'] = mpcalc.heat_index(
        df_weather['Dry Bulb Temperature [°C]'].values * units.degC,
        df_weather['Relative Humidity [%]'].values * units.percent
    )

    # Merge weather and load data; the inner join keeps only timestamps that
    # are present in both frames.
    df_merged = pd.merge(df_load, df_weather, on='timestamp', how='inner')

    timestamps = df_merged['timestamp']

    # Add time-based features
    df_merged['minute'] = timestamps.dt.minute
    df_merged['hour'] = timestamps.dt.hour
    df_merged['day'] = timestamps.dt.day
    df_merged['month'] = timestamps.dt.month

    # Add weekday/weekend binary indicator (Monday=0 ... Friday=4 -> 1)
    df_merged['is_weekday'] = (timestamps.dt.dayofweek < 5).astype(int)

    # Build the holiday set lazily so it always covers the years actually in
    # the data, rather than a single hard-coded year shared across calls.
    if us_holidays is None:
        years_in_data = [int(year) for year in timestamps.dt.year.dropna().unique()]
        us_holidays = set(holidays.US(years=years_in_data))

    # Add US holidays binary indicator
    df_merged['is_holiday'] = (timestamps.dt.date.isin(us_holidays)).astype(int)

    # Calculate max and min hourly load and temperature. Grouping on the
    # timestamp floored to the hour gives one group per actual clock hour, so
    # the same (hour, day, month) in different years is never pooled together.
    hour_start = timestamps.dt.floor('h')
    load_by_hour = df_merged.groupby(hour_start)['out.electricity.total.energy_consumption']
    temp_by_hour = df_merged.groupby(hour_start)['Dry Bulb Temperature [°C]']
    df_merged['max_load_hourly'] = load_by_hour.transform('max')
    df_merged['min_load_hourly'] = load_by_hour.transform('min')
    df_merged['max_temp_hourly'] = temp_by_hour.transform('max')
    df_merged['min_temp_hourly'] = temp_by_hour.transform('min')

    df_merged['bldg_id'] = int(bldg)

    # Save the processed data (optional)
    if save_csv:
        output_path = f"{processed_data_path}/{bldg}.csv"
        df_merged.to_csv(output_path, index=False)

    # Return the processed DataFrame
    return df_merged


In [8]:
# Every building lives in its own sub-directory of building_data, named after
# its id. Keep directories only, so stray files in that folder (e.g. hidden
# OS metadata files) are not mistaken for building ids.
with os.scandir(EXTERNAL_DATA_PATH + "/building_data") as entries:
    all_bldg_ids = {entry.name for entry in entries if entry.is_dir()}  # ex: {'12345', ...}

# File names (including the '.csv' extension) already present in the output folder.
already_processed = set(os.listdir(PROCESSED_WEATHER_LOAD))  # ex: {'12345.csv', ...}

# Building ids listed in subset20.csv, as strings so they compare with all_bldg_ids.
subset20 = pd.read_csv(PROCESSED_DATA_PATH + "/subset20.csv")
subset_ids = {str(bldg_id) for bldg_id in subset20['bldg_id']}  # ex: {'12345', ...}

In [9]:
# Dates of the 2018 US holidays, collected into a set for fast membership tests.
us_holidays = set(holidays.US(years=2018))

In [12]:
# Process one arbitrary building as a smoke test (this also writes its csv),
# then display the row for 1 AM on 25 Dec 2018 to eyeball the derived columns.
sample_bldg = next(iter(all_bldg_ids))
sample_df = process_building_data(
    sample_bldg,
    EXTERNAL_DATA_PATH,
    us_holidays,
    save_csv=True,
    processed_data_path=PROCESSED_WEATHER_LOAD,
)
is_christmas_1am = sample_df['timestamp'] == '2018-12-25 01:00:00'
sample_df[is_christmas_1am]

Unnamed: 0,timestamp,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,minute,hour,day,month,is_weekday,is_holiday,max_load_hourly,min_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id
34368,2018-12-25 01:00:00,18.625848,8.925,96.01581,47.084243,0,1,25,12,1,1,20.287458,18.625848,8.925,8.90625,23856


In [13]:
# Show the dtype of every column in the processed sample DataFrame.
sample_df.dtypes

timestamp                                   datetime64[ns]
out.electricity.total.energy_consumption           float64
Dry Bulb Temperature [°C]                          float64
Relative Humidity [%]                              float64
heat_index                                         float64
minute                                               int32
hour                                                 int32
day                                                  int32
month                                                int32
is_weekday                                           int64
is_holiday                                           int64
max_load_hourly                                    float64
min_load_hourly                                    float64
max_temp_hourly                                    float64
min_temp_hourly                                    float64
bldg_id                                              int64
dtype: object

In [None]:
# Ids of buildings that already have an output csv. The output files are named
# '<bldg_id>.csv', so the extension is stripped to make the names comparable
# with the bare ids in all_bldg_ids; without this the set difference below
# removes nothing and every building is reprocessed.
already_processed = {
    os.path.splitext(filename)[0]
    for filename in os.listdir(PROCESSED_WEATHER_LOAD)
    if filename.endswith(".csv")
}  # ex: {'12345', ...}

to_process = all_bldg_ids - already_processed

# Loop through all buildings that have yet to be processed and process them
for bldg in to_process:
    process_building_data(
        bldg, EXTERNAL_DATA_PATH, us_holidays,
        save_csv=True, processed_data_path=PROCESSED_WEATHER_LOAD
    )