In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go


In [2]:
df = pd.read_csv('/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/32.csv')

In [3]:
PATH = '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/'

In [4]:
df.head()

Unnamed: 0,Index,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,hour,month,is_weekday,is_holiday,max_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id
0,0,5.420033,-6.1,42.781847,21.02,1,1,1,0,5.476113,-6.1,-6.55,32
1,1,5.476113,-6.25,43.350762,20.75,1,1,1,0,5.476113,-6.1,-6.55,32
2,2,5.476113,-6.4,43.919676,20.48,1,1,1,0,5.476113,-6.1,-6.55,32
3,3,5.476113,-6.55,44.488591,20.21,1,1,1,0,5.476113,-6.1,-6.55,32
4,4,5.476113,-6.7,45.057505,19.94,2,1,1,0,5.476113,-6.7,-6.7,32


In [7]:
df.shape

(35037, 13)

In [8]:
buildings = df['bldg_id'].tolist()

In [7]:
def smape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE).

    Parameters:
        y_true (array-like): True values.
        y_pred (array-like): Predicted values.

    Returns:
        float: SMAPE value.
    """
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    smape_value = np.mean(numerator / denominator) * 100
    return smape_value

In [8]:
def create_Y_X(df_load):
    """
    Create Y and X variables for linear regression model.

    Parameters:
        df_load (pandas.DataFrame): DataFrame containing load data.

    Returns:
        tuple: Tuple containing Y and X variables.  
    """
    Y = df_load['out.electricity.total.energy_consumption']
    X = df_load.drop('out.electricity.total.energy_consumption', axis=1)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return Y, X_scaled

In [9]:
def plot_actual_vs_predicted(df_actual, df_predicted):
    """
    Plot actual vs predicted values using Plotly.

    Parameters:
        df_actual (pandas.DataFrame): DataFrame containing actual values with timestamp and energy consumption columns.
        df_predicted (pandas.DataFrame): DataFrame containing predicted values with timestamp and energy consumption columns.

    Returns:
        fig: Plotly figure object.
    """
    fig = go.Figure()

    # Add actual values
    fig.add_trace(go.Scatter(x=df_actual['timestamp'], 
                             y=df_actual['out.electricity.total.energy_consumption'], 
                             mode='lines', 
                             name='Actual Consumption', 
                             line=dict(color='blue')))

    # Add predicted values
    fig.add_trace(go.Scatter(x=df_predicted['timestamp'], 
                             y=df_predicted['out.electricity.total.energy_consumption'], 
                             mode='lines', 
                             name='Predicted Consumption', 
                             line=dict(color='red', dash='dash')))

    # Update the layout
    fig.update_layout(
        title="Actual vs Predicted Electricity Consumption Over Time",
        xaxis_title="Timestamp",
        yaxis_title="Energy Consumption",
        template="plotly_white"
    )
    
    fig.show()
    return fig

In [10]:
def train_sgd_regressor(directory, target_column='out.electricity.total.energy_consumption', test_size=0.2, chunk_size=1000):
    """
    Train the SGD Regressor model using data from building CSV files.

    Parameters:
        directory (str): Path to the directory containing building CSV files.
        target_column (str): The name of the target column.
        test_size (float): Proportion of the building files to use for testing.
        chunk_size (int): Number of rows to process in each chunk.

    Returns:
        model: Trained model.
        building_pred: Predictions for one building.
        building_actual: Actual values for that building.
        avg_smape: Average SMAPE across all buildings.
        avg_r2: Average R² score across all buildings.
    """
    model = SGDRegressor()

    # Lists to store individual SMAPE and R² scores
    smape_list = []
    building_pred = None
    building_actual = None

    # Use glob to list all CSV files in the directory
    csv_files = glob(f"{directory}/*.csv")
    
    # Split building files into train and test sets
    train_files, test_files = train_test_split(csv_files, test_size=test_size)

    test_list = list(test_files)

    # Training phase: loop over the training set files
    for filename in tqdm(train_files, desc="Training on buildings", unit="file"):
        try:
            # Use chunksize in read_csv to read the file in chunks
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                Y, X = create_Y_X(chunk)

                # Train the model with the current chunk
                model.partial_fit(X, Y)

        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            continue  # Skip to the next file if there's an error

    # Testing phase: loop over the test set files
    for filename in tqdm(test_files, desc="Testing on buildings", unit="file"):
        try:
            # Use chunksize in read_csv to read the file in chunks
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                Y, X = create_Y_X(chunk)

                # Make predictions on the current chunk
                Y_pred = model.predict(X)

                # Compute SMAPE and R² for the current building
                smape_value = smape(Y, Y_pred)
    
                # Store the SMAPE and R² for averaging later
                smape_list.append(smape_value)

        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            continue  # Skip to the next file if there's an error

    # Compute average SMAPE and R² score
    avg_smape = np.mean(smape_list)

    return model, test_list, avg_smape, train_files

In [11]:
model, testfiles, avg_smape, train_files = train_sgd_regressor(PATH)

Training on buildings: 100%|██████████| 5120/5120 [09:33<00:00,  8.93file/s]
Testing on buildings: 100%|██████████| 1281/1281 [02:00<00:00, 10.64file/s]


## Linear Regression:
Average SMAPE: 87.80732828774465


In [12]:
print(testfiles)

['/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/199200.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/221209.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/274292.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/201373.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/252383.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/113609.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/327480.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/266740.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/p

In [13]:
print(train_files)

['/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/157651.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/323709.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/311605.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/134138.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/57607.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/136190.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/86021.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/280464.csv', '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/pro

In [14]:
print(avg_smape)

106.6743575747299


In [15]:
df = pd.read_csv(f"{testfiles[0]}")

In [39]:
df.head()

Unnamed: 0,Index,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,hour,month,is_weekday,is_holiday,max_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id
0,0,3.860081,2.2,56.104622,35.96,1,1,1,0,4.217974,2.2,1.825,307416
1,1,3.75489,2.075,56.615213,35.735,1,1,1,0,4.217974,2.2,1.825,307416
2,2,4.217974,1.95,57.125804,35.51,1,1,1,0,4.217974,2.2,1.825,307416
3,3,3.326608,1.825,57.636395,35.285,1,1,1,0,4.217974,2.2,1.825,307416
4,4,3.328615,1.7,58.146986,35.06,2,1,1,0,3.328615,1.7,1.475,307416


In [45]:
fig = plot_actual_vs_predicted(df_actual, predictions)

In [46]:
print(smape(df_actual['out.electricity.total.energy_consumption'], predictions['out.electricity.total.energy_consumption']))

26.908190417449145
