In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor

In [None]:
# NOTE: md_path and PATH are absolute, machine-specific locations; adjust them
# before running this notebook on another machine.

# CSV holding the one-hot encoded building metadata.
md_path = (
    '/Users/veronica/Library/CloudStorage/GoogleDrive-veronicahangsan@gmail.com/'
    '.shortcut-targets-by-id/1FsOPywSgK_wZmrVrSTBVi4q8G3Mg_yMJ/'
    'Team-Fermata-Energy/processed_data/'
    'md_one_hot_encoded_subset20.csv'
)

# Directory passed to the per-building CSV loader (note the trailing slash).
PATH = (
    '/Users/veronica/Library/CloudStorage/GoogleDrive-veronicahangsan@gmail.com/'
    '.shortcut-targets-by-id/1FsOPywSgK_wZmrVrSTBVi4q8G3Mg_yMJ/'
    'Team-Fermata-Energy/processed_data/'
    'processed_weather_load_w_timestamp/'
)

# JSON file read for the "train_bldg_ids" / "test_bldg_ids" lists.
json_file = '../data/subset20_20_data.json'

In [None]:
# Parse the JSON file that lists the train/test building file names.
with open(json_file, mode='r') as json_handle:
    json_data = json.load(json_handle)

# Load the building metadata table.
md = pd.read_csv(md_path)

In [None]:
# Preview the first rows of the metadata. `head` has to be called: the bare
# attribute only displays the bound-method repr, not a preview of the data.
md.head()

In [None]:
# Show the metadata column names (the one-hot flag columns used for filtering
# further down are looked up by name).
md.columns

In [None]:
# Entries are treated as file names: everything from the first '.' onward is
# discarded. A missing key yields an empty list.
train_bldg_ids = [entry.partition('.')[0] for entry in json_data.get("train_bldg_ids", [])]
test_bldg_ids = [entry.partition('.')[0] for entry in json_data.get("test_bldg_ids", [])]

In [None]:
# Normalise the metadata building IDs in place: cast to str and trim
# surrounding whitespace (presumably so they compare equal to the string IDs
# taken from the JSON file — confirm).
md['bldg_id'] = md['bldg_id'].astype(str).str.strip()

In [None]:
# Select rows by their one-hot flags: Education building type and
# Mixed-Humid climate zone.
education_buildings = md[md['in.comstock_building_type_group_Education'] == 1]
mixed_humid_buildings = md[md['in.building_america_climate_zone_Mixed-Humid'] == 1]

# Backward-compatible alias: this selection was originally stored under the
# misleading name `hot_dry_buildings` although it filters on Mixed-Humid.
hot_dry_buildings = mixed_humid_buildings

# Keep the Education rows whose bldg_id also appears among the Mixed-Humid rows.
# This yields the same building IDs as an inner self-merge on bldg_id, without
# duplicating every metadata column with _x/_y suffixes.
edu_and_mixed_humid = education_buildings[
    education_buildings['bldg_id'].isin(mixed_humid_buildings['bldg_id'])
]
edu_and_mixed_humid_bldg_ids = edu_and_mixed_humid['bldg_id'].unique()

In [None]:
# Show the first ten matching IDs. The label names the climate zone that is
# actually filtered on (Mixed-Humid).
print(f"Buildings categorized as Education and in 'Mixed-Humid' zone: {edu_and_mixed_humid_bldg_ids[:10]}")

In [None]:
# Restrict the train/test ID lists to buildings that are both Education and
# Mixed-Humid; the results are plain Python sets.
valid_train_bldgs = set(train_bldg_ids) & set(edu_and_mixed_humid_bldg_ids)
valid_test_bldgs = set(test_bldg_ids) & set(edu_and_mixed_humid_bldg_ids)

print(f"Valid train buildings (Education + Mixed-Humid): {valid_train_bldgs}")
print(f"Valid test buildings (Education + Mixed-Humid): {valid_test_bldgs}")

In [None]:
# Report how many buildings survived the Education + Mixed-Humid filter in
# each split.
print(f"Number of valid training buildings: {len(valid_train_bldgs)}")
print(f"Number of valid testing buildings: {len(valid_test_bldgs)}")

In [None]:
def load_data_for_building(filename, directory):
    """
    Read ``<directory>/<filename>.csv`` into a DataFrame.

    Parameters:
        filename: File name without the ``.csv`` extension.
        directory: Directory that contains the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` if reading failed for any reason
        (the error is printed rather than raised).
    """
    try:
        loaded = pd.read_csv(f"{directory}/{filename}.csv")
    except Exception as load_error:
        # Best-effort loader: report the problem and let the caller skip it.
        print(f"Error loading file {filename}: {load_error}")
        return None
    else:
        return loaded


In [None]:
def prepare_data_for_building(df_load, md, target_column="out.electricity.total.energy_consumption", n_lags=96):
    """
    Prepare features (X) and target (y) for training the model.

    Builds lagged copies of the target column, expands an optional
    ``timestamp`` column into calendar features, joins the building metadata
    on ``bldg_id`` and drops every row that contains a missing value.

    Neither ``df_load`` nor ``md`` is modified; all work happens on copies.

    Parameters:
        df_load: Per-building frame with a ``bldg_id`` column and the target
            column; may also contain a ``timestamp`` column.
        md: Metadata frame with a ``bldg_id`` column.
        target_column: Column the lag features are derived from.
        n_lags: Number of lag columns to create (``shift_1`` .. ``shift_<n_lags>``).

    Returns:
        A new DataFrame holding the target, lag, calendar and metadata columns
        (``bldg_id`` and ``timestamp`` removed) with NaN rows dropped.

    Raises:
        KeyError: If ``bldg_id`` or (when ``n_lags > 0``) ``target_column`` is missing.
    """
    # Work on copies so repeated calls never alter the caller's frames.
    load = df_load.copy()
    load['bldg_id'] = load['bldg_id'].astype(str)
    metadata = md.assign(bldg_id=md['bldg_id'].astype(str))

    # Create lag features in one concat instead of inserting columns one by one.
    if n_lags > 0:
        target = load[target_column]
        lag_frame = pd.concat(
            {f"shift_{i}": target.shift(i) for i in range(1, n_lags + 1)},
            axis=1,
        )
        load = pd.concat([load, lag_frame], axis=1)

    # Process timestamp if available
    if 'timestamp' in load.columns:
        stamps = pd.to_datetime(load['timestamp'])
        calendar_features = pd.DataFrame({
            'hour': stamps.dt.hour,
            'day_of_week': stamps.dt.dayofweek,
            'day_of_year': stamps.dt.dayofyear,
            'month': stamps.dt.month,
            'year': stamps.dt.year,
        })
        # The raw timestamp is replaced by its calendar components.
        load = pd.concat([load.drop(columns=['timestamp']), calendar_features], axis=1)

    # Merge with metadata (left join keeps every load row), then drop the key.
    merged_df = load.merge(metadata, on='bldg_id', how='left')
    merged_df = merged_df.drop(['bldg_id'], axis=1)

    # Drop rows with missing values (this includes the first n_lags rows,
    # whose lag columns are NaN).
    merged_df = merged_df.dropna()

    return merged_df


In [None]:
def smape(actual, predicted):
    """
    Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes ``|actual - predicted| / (|actual| + |predicted|)``;
    elements where both values are zero contribute 0. The mean of these ratios
    is multiplied by 200.

    Parameters:
        actual: Array-like of observed values.
        predicted: Array-like of predicted values (broadcastable to ``actual``).

    Returns:
        The SMAPE as a float; ``nan`` when the inputs are empty.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    denominator = np.abs(actual) + np.abs(predicted)
    abs_error = np.abs(actual - predicted)

    # Nothing to average: return nan explicitly instead of triggering numpy's
    # "mean of empty slice" warning.
    if denominator.size == 0:
        return float('nan')

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with, so no 0/0 is ever evaluated.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 200 * np.mean(ratio)

In [None]:
def _prepare_building_xy(building_id, directory, metadata, target_column):
    """Load and prepare one building; return ``(X, y)`` or ``None`` if it is unusable."""
    raw_df = load_data_for_building(building_id, directory)
    if raw_df is None or target_column not in raw_df:
        return None
    prepared = prepare_data_for_building(raw_df, metadata)
    if prepared is None or len(prepared) == 0:
        print(f"Prepared data is empty for {building_id}")
        return None
    return prepared.drop(columns=[target_column]), prepared[target_column]


def train_random_forest_model(directory, valid_train_bldgs, valid_test_bldgs, target_column='out.electricity.total.energy_consumption', metadata=None):
    """
    Fit a RandomForest + GradientBoosting voting ensemble on all training
    buildings and report the average per-building SMAPE on both splits.

    The ensemble is fitted ONCE on the concatenated data of every usable
    training building. Calling ``fit`` separately per building would refit
    from scratch each time and leave a model that only knows the last building.

    Parameters:
        directory: Directory containing one ``<bldg_id>.csv`` per building.
        valid_train_bldgs: Iterable of building IDs used for fitting.
        valid_test_bldgs: Iterable of building IDs used for evaluation only.
        target_column: Name of the column to predict.
        metadata: Building metadata frame; defaults to the notebook-level ``md``.

    Returns:
        The fitted ``VotingRegressor``.

    Raises:
        ValueError: If no training building yields any usable rows.
    """
    if metadata is None:
        metadata = md

    rf_model = RandomForestRegressor(n_estimators=15, max_depth=4, min_samples_split=15, min_samples_leaf=10, random_state=42, n_jobs=1)
    gb_model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
    ensemble_model = VotingRegressor(estimators=[('rf', rf_model), ('gb', gb_model)])

    # Collect every usable training building first. IDs are sorted so the row
    # order (and therefore the fitted model) does not depend on set ordering.
    train_sets = []
    for building_id in sorted(valid_train_bldgs):
        features_and_target = _prepare_building_xy(building_id, directory, metadata, target_column)
        if features_and_target is not None:
            train_sets.append(features_and_target)

    if not train_sets:
        raise ValueError("No usable training data found for the given training buildings.")

    # join="inner" keeps only the feature columns shared by all buildings.
    # NOTE: all training rows are held in memory at once.
    X_train_all = pd.concat([X for X, _ in train_sets], ignore_index=True, join="inner")
    y_train_all = pd.concat([y for _, y in train_sets], ignore_index=True)
    feature_columns = list(X_train_all.columns)

    ensemble_model.fit(X_train_all, y_train_all)

    # Per-building SMAPE, evaluated with the single jointly-fitted model.
    smape_train_list = [
        smape(y_train, ensemble_model.predict(X_train[feature_columns]))
        for X_train, y_train in train_sets
    ]

    smape_test_list = []
    for building_id in sorted(valid_test_bldgs):
        features_and_target = _prepare_building_xy(building_id, directory, metadata, target_column)
        if features_and_target is None:
            continue
        X_test, y_test = features_and_target
        y_test_pred = ensemble_model.predict(X_test[feature_columns])
        smape_test_list.append(smape(y_test, y_test_pred))

    avg_smape_train = np.mean(smape_train_list)
    # Avoid numpy's empty-mean warning when no test building was usable.
    avg_smape_test = np.mean(smape_test_list) if smape_test_list else float('nan')
    print(f"Average SMAPE (training set): {avg_smape_train:.4f}")
    print(f"Average SMAPE (testing set): {avg_smape_test:.4f}")
    return ensemble_model


In [16]:
# Run the training routine on the filtered train/test buildings; it prints the
# average SMAPE for both splits and returns the ensemble model.
trained_model = train_random_forest_model(PATH, valid_train_bldgs, valid_test_bldgs)

In [None]:
def plot_24_hour_predictions(building_ids, directory, model, target_column='out.electricity.total.energy_consumption'):
    """
    Plot actual vs. predicted consumption for the last 24 rows of up to five
    buildings, one subplot per building.

    The predictions are made on the same 24 rows the actual values come from
    (this is an in-sample comparison, not a forecast of future rows).

    Parameters:
        building_ids: Iterable of building IDs (list, set, array, ...); only
            the first five are plotted.
        directory: Directory containing one ``<bldg_id>.csv`` per building.
        model: Fitted estimator exposing ``predict``.
        target_column: Name of the consumption column.

    Buildings that fail to load, lack the target column, or have 48 or fewer
    prepared rows are skipped (their subplot slot stays empty).
    """
    plt.figure(figsize=(15, 10))

    # Materialise first: the notebook's valid_*_bldgs are sets, which cannot
    # be sliced directly.
    selected_ids = list(building_ids)[:5]

    for i, building_id in enumerate(selected_ids):
        df = load_data_for_building(building_id, directory)
        if df is None or target_column not in df:
            continue

        df_prepared = prepare_data_for_building(df, md)
        if df_prepared is None or len(df_prepared) <= 48:
            continue

        # Last 24 prepared rows: features for prediction and matching actuals.
        X_last_24 = df_prepared.drop(columns=[target_column]).iloc[-24:]
        y_actual_last_24 = df_prepared[target_column].iloc[-24:]
        y_pred_last_24 = model.predict(X_last_24)

        # x-axis is the row position within the window
        # (assumes one row per hour — TODO confirm the data resolution).
        timestamps = np.arange(24)

        # actual vs predicted for the same 24 rows
        plt.subplot(3, 2, i + 1)
        plt.plot(timestamps, y_actual_last_24, label='Actual (Last 24h)', color='blue')
        plt.plot(timestamps, y_pred_last_24, label='Predicted (Last 24h)', color='red', linestyle="--")
        plt.title(f'Building ID: {building_id}')
        plt.xlabel('Hours')
        plt.ylabel('Electricity Consumption')
        plt.legend()

    plt.tight_layout()
    plt.show()
