In [1]:
import csv, pickle, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from autosklearn.regression import AutoSklearnRegressor
from sklearn.linear_model import LinearRegression

from fbprophet import Prophet

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Train and evaluate one auto-sklearn regressor per (model type, building, utility target).
# For every combination the cell fits on the first 80% of the time-ordered rows, scores on the
# remaining 20%, records the metrics in `models`, and saves a diagnostic plot next to the model path.
# NOTE(review): the output folders ./models/<model_type>/ are assumed to exist already.

buildings_list = ['Stadium_Data_Extended.csv']

in_path = './clean_data_extended/'

# Results registry: (model_type, bldgname, target) -> (rmse, mae, r2, model_file)
models = {}
# 'solos' caps the auto-sklearn ensemble at a single member; 'ensembles' keeps the default ensemble
model_types = ['ensembles', 'solos']

time_steps = [1, 8, 12, 24]  # not referenced in this cell

# Input features and candidate targets. These are loop-invariant, so they are defined once here.
add_features = ['temp_c', 'rel_humidity_%', 'surface_pressure_hpa', 'cloud_cover_%', 'direct_radiation_w/m2', 'precipitation_mm', 'wind_speed_ground_km/h', 'wind_dir_ground_deg']
y_columns = ['present_elec_kwh', 'present_htwt_mmbtuh', 'present_wtr_usgal', 'present_chll_tonh', 'present_co2_tonh']
header = ['ts'] + add_features + y_columns

# auto-sklearn time budget per fit, in seconds (5 minutes); each candidate run gets a tenth of it
time_dist = 60*5

for model_type in model_types:
    out_path = f'./models/{model_type}/'

    for building in buildings_list:
        df = pd.read_csv(in_path + building)

        # Parse timestamps, drop repeated readings, and order chronologically per building
        df['ts'] = pd.to_datetime(df['ts'])
        df = df.drop_duplicates(subset=['bldgname', 'ts'])
        df = df.sort_values(['bldgname', 'ts'])

        # Group BEFORE indexing by 'ts' so every group still carries 'ts' as a regular column
        groups = df.groupby('bldgname')
        df = df.set_index('ts')

        print(building)

        for name, group in groups:
            bldgname = name

            group = group.drop_duplicates(subset=['ts'])

            for y in y_columns:
                model_data = group[header]

                # Only model targets with at least 365*24 non-null readings; CO2 is always skipped
                if model_data[y].count() < 365*24 or y == 'present_co2_tonh':
                    continue

                model_data = model_data.rename(columns={ y: 'y', 'ts': 'ds' })
                model_data = model_data.sort_values(['ds'])

                # Build a NEW list of the other targets. Removing from `y_columns` itself (as an alias)
                # would mutate the list being iterated: later targets get skipped and earlier targets
                # stop being dropped, leaking them into the feature matrix.
                del_cols = [col for col in y_columns if col != y]
                model_data = model_data.drop(columns=['ds'] + del_cols)
                print(model_data.columns)

                # Keep an untouched copy so the originally-missing target values can be located later
                saved_model_data = model_data.copy()

                # Fill in missing target values
                model_data['y'] = model_data['y'].interpolate(method='linear', limit_direction='both')

                # Chronological 80/20 split (rows are already sorted by timestamp)
                split_index = int(len(model_data) * 0.8)

                train_df = model_data.iloc[:split_index]
                test_df = model_data.iloc[split_index:]
                saved_test_df = saved_model_data.iloc[split_index:]

                # Select columns by name rather than by position so the feature/target split
                # cannot silently shift if the column layout changes
                X_train = train_df[add_features].to_numpy()
                y_train = train_df['y'].to_numpy()

                X_test = test_df[add_features].to_numpy()
                y_test = test_df['y'].to_numpy()
                saved_y_test = saved_test_df['y'].to_numpy()

                # Create the model (solo or ensemble); the two variants differ only in ensemble size
                model_kwargs = {
                    'time_left_for_this_task': time_dist,
                    'per_run_time_limit': int(time_dist/10),
                    'memory_limit': 102400,  # presumably MB, per auto-sklearn's docs - TODO confirm
                }
                if model_type == 'solos':
                    model_kwargs['ensemble_kwargs'] = {'ensemble_size': 1}
                model = AutoSklearnRegressor(**model_kwargs)

                # Train the model
                model.fit(X_train, y_train)

                # Predict on the test set
                y_pred = model.predict(X_test)

                # Base path (no extension) shared by the saved plot and the optional pickle
                model_file = f'{out_path}{bldgname}_{y}_model'

                # NOTE(review): pickling is disabled, yet the results CSV written later lists
                # `model_file + '.pkl'`; re-enable this if those paths are expected to exist.
                # with open(model_file + '.pkl', 'wb') as file:
                #     pickle.dump(model, file)

                # calculate metrics
                print(f'{bldgname}, {y}')
                print(model.leaderboard())

                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                print('RMSE: %.3f' % rmse)

                mae = mean_absolute_error(y_test, y_pred)
                print('MAE: %.3f' % mae)

                r2 = r2_score(y_test, y_pred)
                print('R2: %.3f' % r2)

                # save results
                models[(model_type, bldgname, y)] = (rmse, mae, r2, model_file)

                # plot results
                fig, ax = plt.subplots()

                # Actual (interpolated) test targets and the model's forecasts
                ax.plot(y_test, label='Actual Values', alpha=0.7)
                ax.plot(y_pred, label='Forecasted Values', alpha=0.8)

                # Overlay only the points that were missing in the raw data and filled by
                # interpolation. A separate array is built so y_test itself is never overwritten.
                nan_mask = np.isnan(saved_y_test)  # True where the raw target was missing
                filled_only = np.where(nan_mask, y_test, np.nan)
                ax.plot(filled_only, label='Predicted Values', alpha=0.75)

                ax.set_title(f'{bldgname} {y} Consumption')
                ax.set_xlabel('Time (Hours)')
                # e.g. 'present_elec_kwh' -> 'elec (kwh)'
                ax.set_ylabel(y.split('_')[-2] + ' (' + y.split('_')[-1] + ')')

                ax.legend()
                ax.grid(True)
                fig.savefig(model_file + '.png')
                plt.close(fig)

Stadium_Data_Extended.csv
Index(['temp_c', 'rel_humidity_%', 'surface_pressure_hpa', 'cloud_cover_%',
       'direct_radiation_w/m2', 'precipitation_mm', 'wind_speed_ground_km/h',
       'wind_dir_ground_deg', 'y'],
      dtype='object')
Stadium, present_elec_kwh
          rank  ensemble_weight                 type      cost   duration
model_id                                                                 
17           1             0.70          extra_trees  0.401684  26.761500
12           2             0.04    gradient_boosting  0.454830   6.917921
18           3             0.06    gradient_boosting  0.464114   4.641095
8            4             0.20  k_nearest_neighbors  0.527069   0.898081
RMSE: 46.216
MAE: 34.472
R2: 0.502
Stadium_Data_Extended.csv
Index(['temp_c', 'rel_humidity_%', 'surface_pressure_hpa', 'cloud_cover_%',
       'direct_radiation_w/m2', 'precipitation_mm', 'wind_speed_ground_km/h',
       'wind_dir_ground_deg', 'y'],
      dtype='object')
Stadium, present_el

In [3]:
# Write the evaluation metrics collected in `models` to CSV:
# one results.csv inside each model-type folder, plus a combined results.csv in the working directory.

header = ['model_type', 'bldgname', 'y', 'rmse', 'mae', 'r2', 'model_file']
rows = []

# create csv file for each model folder
for m_type in model_types:
    out_path = f'./models/{m_type}/'

    # Rows belonging to this model type, in the insertion order of `models`.
    # Keys are (model_type, bldgname, y); values are (rmse, mae, r2, model_file).
    type_rows = [
        [model_type, bldgname, y, rmse, mae, r2, model_file + '.pkl']
        for (model_type, bldgname, y), (rmse, mae, r2, model_file) in models.items()
        if model_type == m_type
    ]

    # newline='' is required by the csv module; without it, extra blank rows appear on Windows
    with open(f'{out_path}results.csv', mode='w', newline='') as results_file:
        writer = csv.writer(results_file)
        writer.writerow(header)
        writer.writerows(type_rows)

    rows.extend(type_rows)

# create master results csv
with open('results.csv', mode='w', newline='') as results_file:
    writer = csv.writer(results_file)
    writer.writerow(header)
    writer.writerows(rows)