In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from glob import glob
from tqdm import tqdm
import plotly.graph_objs as go

### Initial Setup
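* Install Google Drive for desktop.
* Sync the shared `Team-Fermata-Energy` folder so the processed data files are available locally.
* In the next cell, set the paths to where the synced files are located on your machine.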

In [2]:
# Root of the synced `processed_data` folder. The default is the original author's
# Google Drive mount; set the FERMATA_DATA_DIR environment variable to point at your
# own copy instead of editing this cell.
DATA_DIR = os.environ.get(
    'FERMATA_DATA_DIR',
    '/Users/veronica/Library/CloudStorage/GoogleDrive-veronicahangsan@gmail.com/.shortcut-targets-by-id/1FsOPywSgK_wZmrVrSTBVi4q8G3Mg_yMJ/Team-Fermata-Energy/processed_data',
)

# Table read from subset20.csv, previewed below.
subset_df = pd.read_csv(os.path.join(DATA_DIR, 'subset20.csv'))

# Directory holding one CSV per building; passed to train_sgd_regressor later on.
PATH = os.path.join(DATA_DIR, 'processed_weather_and_load')

print(subset_df.head())

   bldg_id  in.state  in.cluster_id  in.vintage   in.sqft  \
0   105885        10           42.0           3  750000.0   
1   305819        40           74.0           2  150000.0   
2   305934        40           75.0           4  350000.0   
3   317044        40           75.0           3  350000.0   
4       32         1           53.0           6   37500.0   

   in.building_america_climate_zone_Cold  \
0                                      0   
1                                      0   
2                                      0   
3                                      0   
4                                      0   

   in.building_america_climate_zone_Hot-Dry  \
0                                         0   
1                                         0   
2                                         0   
3                                         0   
4                                         0   

   in.building_america_climate_zone_Hot-Humid  \
0                                   

In [3]:
# Sanity check on the loaded subset table: (number of rows, number of columns).
subset_df.shape

(6401, 53)

In [4]:
def smape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE).

    Each sample contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 2).
    A sample where both the true and the predicted value are 0 is a perfect
    prediction and contributes 0 (instead of 0/0 = NaN, which would turn the
    whole mean into NaN).

    Parameters:
        y_true (array-like): True values.
        y_pred (array-like): Predicted values.

    Returns:
        float: SMAPE value in percent (0 to 200). NaN if the inputs are empty.
    """
    # Work on plain float arrays so lists, tuples, Series and ndarrays all behave the same.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Flatten after the element-wise arithmetic; the mean over all elements is unchanged.
    numerator = np.abs(y_true - y_pred).ravel()
    denominator = ((np.abs(y_true) + np.abs(y_pred)) / 2).ravel()

    if numerator.size == 0:
        return float('nan')

    # Only divide where the denominator is non-zero; the remaining entries stay 0.
    ratio = np.zeros_like(numerator)
    nonzero = denominator != 0
    ratio[nonzero] = numerator[nonzero] / denominator[nonzero]

    return float(np.mean(ratio) * 100)

In [5]:
def create_Y_X(df_load, required_columns=None, scaler=None):
    """
    Create Y and X variables for linear regression model.

    Parameters:
        df_load (pandas.DataFrame): DataFrame containing load data. Must contain the
            column 'out.electricity.total.energy_consumption'.
        required_columns (sequence of str, optional): If given and non-empty, only
            these columns are kept before splitting into Y and X. The list must
            include the target column, otherwise a KeyError is raised.
        scaler (optional): An already-fitted scaler exposing `transform`. When
            provided, it is applied as-is so several frames can share one scaling.
            When None (default), a new StandardScaler is fitted on this frame only.

    Returns:
        tuple: (Y, X_scaled) where Y is the target column as a pandas Series and
            X_scaled is the scaled feature matrix returned by the scaler.
    """
    target = 'out.electricity.total.energy_consumption'

    # Explicit None/length check: `if required_columns:` raises for numpy arrays
    # and pandas Index objects, whose truth value is ambiguous.
    if required_columns is not None and len(required_columns) > 0:
        df_load = df_load[list(required_columns)]

    Y = df_load[target]
    X = df_load.drop(target, axis=1)

    if scaler is None:
        # Default behaviour: statistics come from this frame alone.
        X_scaled = StandardScaler().fit_transform(X)
    else:
        X_scaled = scaler.transform(X)
    return Y, X_scaled


In [6]:
def plot_actual_vs_predicted(df_actual, df_predicted):
    """
    Draw actual and predicted consumption as two line traces on one Plotly figure.

    Parameters:
        df_actual (pandas.DataFrame): Actual values; needs the columns 'timestamp'
            and 'out.electricity.total.energy_consumption'.
        df_predicted (pandas.DataFrame): Predicted values; needs the same two columns.

    Returns:
        fig: The Plotly figure object (it is also displayed via `show()`).
    """
    value_column = 'out.electricity.total.energy_consumption'

    # (data, legend label, line style) for each trace, in drawing order.
    trace_specs = [
        (df_actual, 'Actual Consumption', dict(color='blue')),
        (df_predicted, 'Predicted Consumption', dict(color='red', dash='dash')),
    ]

    figure = go.Figure()
    for frame, label, line_style in trace_specs:
        trace = go.Scatter(
            x=frame['timestamp'],
            y=frame[value_column],
            mode='lines',
            name=label,
            line=line_style,
        )
        figure.add_trace(trace)

    layout_options = dict(
        title="Actual vs Predicted Electricity Consumption Over Time",
        xaxis_title="Timestamp",
        yaxis_title="Energy Consumption",
        template="plotly_white",
    )
    figure.update_layout(**layout_options)

    figure.show()
    return figure

In [7]:
def train_sgd_regressor(directory, target_column='out.electricity.total.energy_consumption', test_size=0.2, chunk_size=1000, required_columns=None, random_state=None):
    """
    Train the SGD Regressor model using data from building CSV files.

    The CSV files are split into a training and a test set at file level. The model
    is fitted incrementally (`partial_fit`) on every chunk of every training file,
    then scored with SMAPE on every chunk of every test file.

    Parameters:
        directory (str): Path to the directory containing building CSV files.
        target_column (str): The name of the target column. Currently not used by
            this function: create_Y_X always reads
            'out.electricity.total.energy_consumption'.
        test_size (float): Proportion of the building files to use for testing.
        chunk_size (int): Number of rows to process in each chunk.
        required_columns (list): List of required feature names to ensure consistency.
        random_state (int, optional): Seed forwarded to the train/test split and to
            SGDRegressor. None (default) keeps the unseeded behaviour; pass an
            integer for a reproducible split and model.

    Returns:
        model: Trained model.
        test_files: List of test files used.
        avg_smape: Mean of the per-chunk SMAPE values over all scored test chunks,
            or float('inf') if no chunk could be scored.

    Raises:
        ValueError: If the directory contains no CSV files.
    """
    model = SGDRegressor(random_state=random_state)
    smape_list = []

    # glob returns files in arbitrary order; sort so a fixed random_state always
    # yields the same split.
    csv_files = sorted(glob(f"{directory}/*.csv"))
    if not csv_files:
        raise ValueError("No CSV files found in the specified directory.")
    print(f"Number of CSV files: {len(csv_files)}")

    train_files, test_files = train_test_split(csv_files, test_size=test_size, random_state=random_state)

    for filename in tqdm(train_files, desc="Training on buildings", unit="file"):
        try:
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                Y, X = create_Y_X(chunk, required_columns=required_columns)
                model.partial_fit(X, Y)
        except Exception as e:
            # Best effort: report the file and move on. Chunks of this file that were
            # processed before the error have already updated the model.
            print(f"Error processing file {filename}: {e}")
            continue

    for filename in tqdm(test_files, desc="Testing on buildings", unit="file"):
        try:
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                Y, X = create_Y_X(chunk, required_columns=required_columns)
                Y_pred = model.predict(X)
                smape_list.append(smape(Y, Y_pred))
        except Exception as e:
            # Best effort: scores of chunks read before the error stay in smape_list.
            print(f"Error processing file {filename}: {e}")
            continue

    avg_smape = np.mean(smape_list) if smape_list else float('inf')
    return model, test_files, avg_smape

In [8]:
# Train on the building CSVs under PATH with the default settings and report the
# SMAPE averaged over the held-out test chunks.
model, test_files, avg_smape = train_sgd_regressor(PATH)
print(f"Average SMAPE: {avg_smape:.2f}")

Number of CSV files: 6401


Training on buildings:  79%|███████▉  | 4049/5120 [04:02<00:56, 19.12file/s]

Error processing file /Users/veronica/Library/CloudStorage/GoogleDrive-veronicahangsan@gmail.com/.shortcut-targets-by-id/1FsOPywSgK_wZmrVrSTBVi4q8G3Mg_yMJ/Team-Fermata-Energy/processed_data/processed_weather_and_load/39386.csv: could not convert string to float: '2018-01-01 01:00:00'


Training on buildings: 100%|██████████| 5120/5120 [05:03<00:00, 16.84file/s]
Testing on buildings: 100%|██████████| 1281/1281 [02:13<00:00,  9.60file/s]

Average SMAPE: 123.40





In [9]:
# Same score as printed by the training cell, shown at full precision.
print(avg_smape)

123.39606947983826


In [10]:
# Predict consumption for the first held-out building file and wrap the result in a
# one-column frame that uses the same column name as the target.
first_test_file = test_files[0]
df = pd.read_csv(first_test_file)
Y, X = create_Y_X(df)
predictions = pd.DataFrame(
    {'out.electricity.total.energy_consumption': model.predict(X)}
)

In [11]:
# Build one timestamp per predicted row, spaced 15 minutes apart from the start time.
# NOTE(review): assumes the rows of the building file are consecutive 15-minute
# readings beginning at start_time — confirm against the data.
start_time = '2018-01-01 00:00:00'
# 'min' is the supported alias for minutes; the old 'T' spelling is deprecated in
# recent pandas and triggers a warning.
freq = '15min'
num_points = predictions.shape[0]
timestamps = pd.date_range(start=start_time, periods=num_points, freq=freq)
predictions['timestamp'] = timestamps

  timestamps = pd.date_range(start=start_time, periods=num_points, freq=freq)


In [12]:
# Attach the generated timestamps to the building frame and keep only the two
# columns the plotting helper reads.
# NOTE(review): this adds a 'timestamp' column to `df` in place, so calling
# create_Y_X(df) again after this cell would treat that column as a feature.
df['timestamp'] = timestamps
df_actual = df[['timestamp', 'out.electricity.total.energy_consumption']]

In [16]:
# Plot actual vs. predicted consumption for the selected test building; the figure
# is displayed by the helper and also kept in `fig`.
fig = plot_actual_vs_predicted(df_actual, predictions)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [17]:
pip install nbformat --upgrade


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
