In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go


In [3]:
# Load the processed weather-and-load table for a single building (file 32.csv).
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it.
df = pd.read_csv('/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/32.csv')

In [4]:
# Directory holding the per-building CSV files; passed to train_sgd_regressor.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PATH = '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/'

In [5]:
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,Index,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,hour,month,is_weekday,is_holiday,max_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id
0,0,5.420033,-6.1,42.781847,21.02,1,1,1,0,5.476113,-6.1,-6.55,32
1,1,5.476113,-6.25,43.350762,20.75,1,1,1,0,5.476113,-6.1,-6.55,32
2,2,5.476113,-6.4,43.919676,20.48,1,1,1,0,5.476113,-6.1,-6.55,32
3,3,5.476113,-6.55,44.488591,20.21,1,1,1,0,5.476113,-6.1,-6.55,32
4,4,5.476113,-6.7,45.057505,19.94,2,1,1,0,5.476113,-6.7,-6.7,32


In [6]:
# (rows, columns) of the single-building frame.
df.shape

(35037, 13)

In [7]:
# Collect the bldg_id column as a Python list.
# NOTE(review): this yields one entry per row of df, not one per distinct
# building; use .unique() if a list of distinct ids was intended — confirm.
buildings = df['bldg_id'].tolist()

In [8]:
def smape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE).

    Each term is |y_true - y_pred| / ((|y_true| + |y_pred|) / 2); the result is
    the mean of those terms times 100, so it lies in [0, 200].

    Terms where both the true and the predicted value are 0 are counted as a
    perfect prediction (0) instead of producing 0/0 = NaN, which would
    otherwise turn the whole average into NaN.

    Parameters:
        y_true (array-like): True values.
        y_pred (array-like): Predicted values.

    Returns:
        float: SMAPE value in percent; NaN if the inputs are empty.
    """
    # Convert up front so plain lists work and the arithmetic is positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    if np.size(numerator) == 0:
        # Nothing to average; return NaN explicitly instead of via a warning.
        return float("nan")

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0 they were initialised with (a zero denominator implies both values are 0).
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return float(np.mean(ratio) * 100)

In [28]:
def create_Y_X(df_load, n_lags=96):
    """
    Create Y and X variables for linear regression model.

    Lagged copies of the target column (shift_1 ... shift_<n_lags>) are built
    from ``df_load`` itself, rows containing any missing value are dropped
    (this removes at least the first ``n_lags`` rows), and the remaining
    feature columns are standardised with a StandardScaler fitted on this
    frame only.

    The input frame is not modified.

    Parameters:
        df_load (pandas.DataFrame): DataFrame containing load data. Must have
            the columns 'out.electricity.total.energy_consumption', 'Index'
            and 'bldg_id'.
        n_lags (int): Number of lagged target columns to create. Defaults to 96.

    Returns:
        tuple: (Y, X_scaled) where Y is the target Series of the retained rows
            and X_scaled is a 2-D numpy array of standardised features. If no
            row survives, Y is empty and X_scaled has shape (0, n_features).
    """
    target_column = 'out.electricity.total.energy_consumption'

    # Lags must come from the frame that was passed in, not from a notebook
    # global: each chunk/building needs its own history.
    target = df_load[target_column]
    lag_steps = range(1, n_lags + 1)
    lag_names = [f"shift_{i}" for i in lag_steps]

    # Drop lag columns left over from an earlier call so they are replaced,
    # not duplicated; drop() returns a new frame, leaving df_load untouched.
    features = df_load.drop(columns=lag_names, errors="ignore")
    if lag_names:
        # Build all lag columns at once instead of inserting them one by one.
        lagged = pd.concat(
            [target.shift(i) for i in lag_steps], axis=1, keys=lag_names
        )
        features = pd.concat([features, lagged], axis=1)

    features = features.dropna()

    Y = features[target_column]
    X = features.drop([target_column, 'Index', 'bldg_id'], axis=1)

    if len(X) == 0:
        # StandardScaler cannot be fitted on zero rows; hand back empty outputs
        # and let the caller decide how to treat a chunk that is too short.
        return Y, X.to_numpy(dtype=float)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return Y, X_scaled

In [29]:
# Build the target Series and scaled feature matrix for the single-building frame.
# NOTE(review): these notebook-level Y and X are not used by the cells visible
# below (train_sgd_regressor builds its own per chunk) — confirm they are needed.
Y, X = create_Y_X(df)

In [30]:
def _collect_chunk_smape(model, files, chunk_size, desc, fit):
    """
    Stream building CSV files chunk by chunk and collect one SMAPE per chunk.

    Parameters:
        model: Estimator exposing partial_fit and predict.
        files (list): Paths of the CSV files to process.
        chunk_size (int): Number of rows read per chunk.
        desc (str): Label shown on the progress bar.
        fit (bool): If True, update the model on each chunk before scoring it.

    Returns:
        list: SMAPE of every chunk that was processed successfully.
    """
    scores = []
    for filename in tqdm(files, desc=desc, unit="file"):
        try:
            # Read the file in chunks so a whole building never has to fit in memory
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                Y, X = create_Y_X(chunk)

                if len(Y) == 0:
                    # No usable rows in this chunk; nothing to fit or score.
                    continue

                if fit:
                    model.partial_fit(X, Y)

                scores.append(smape(Y, model.predict(X)))

        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            # The remaining chunks of the failing file are skipped.
            print(f"Error processing file {filename}: {e}")
    return scores


def train_sgd_regressor(directory, target_column='out.electricity.total.energy_consumption', test_size=0.2, chunk_size=1000, random_state=None):
    """
    Train the SGD Regressor model using data from building CSV files.

    The CSV files found in ``directory`` are split into training and test
    files. The model is updated incrementally (partial_fit) on every chunk of
    every training file, then evaluated on every chunk of every test file.

    Parameters:
        directory (str): Path to the directory containing building CSV files.
        target_column (str): The name of the target column. Currently unused:
            create_Y_X selects the target column itself.
        test_size (float): Proportion of the building files to use for testing.
        chunk_size (int): Number of rows to process in each chunk.
        random_state (int or None): Seed for the file split and for the
            regressor. None (the default) keeps the run non-deterministic.

    Returns:
        model: Trained model.
        test_files: List of the file paths held out for testing.
        avg_train_smape: Mean of the per-chunk SMAPE values on the training
            files, each measured right after the model was updated on that
            chunk. NaN if no training chunk was processed.
        avg_test_smape: Mean of the per-chunk SMAPE values on the test files.
            NaN if no test chunk was processed.

    Raises:
        ValueError: If ``directory`` contains no CSV files.
    """
    model = SGDRegressor(random_state=random_state)

    # glob returns files in arbitrary order; sort so a seeded split is repeatable
    csv_files = sorted(glob(f"{directory}/*.csv"))
    if not csv_files:
        raise ValueError(f"No CSV files found in {directory!r}")

    # Split building files into train and test sets
    train_files, test_files = train_test_split(
        csv_files, test_size=test_size, random_state=random_state
    )

    # Training phase, then testing phase; both share the same streaming loop
    train_smape_list = _collect_chunk_smape(
        model, train_files, chunk_size, "Training on buildings", fit=True
    )
    test_smape_list = _collect_chunk_smape(
        model, test_files, chunk_size, "Testing on buildings", fit=False
    )

    # Average per-chunk SMAPE; NaN explicitly when nothing was scored
    avg_train_smape = float(np.mean(train_smape_list)) if train_smape_list else float("nan")
    avg_test_smape = float(np.mean(test_smape_list)) if test_smape_list else float("nan")

    return model, test_files, avg_train_smape, avg_test_smape

In [31]:
# Train and evaluate on every building CSV under PATH (long-running).
# NOTE(review): the saved output below shows this run was interrupted
# (KeyboardInterrupt after 11 of 5120 files), so the four names assigned here
# were not refreshed by this execution.
model, testfiles, avg_train_smape, avg_test_smape = train_sgd_regressor(PATH)

Training on buildings:   0%|          | 11/5120 [00:07<57:29,  1.48file/s]


KeyboardInterrupt: 

## Linear Regression:
Average SMAPE: 87.80732828774465


In [26]:
# NOTE(review): this cell's execution count (26) is lower than the training
# cell's (31), so the value shown below comes from an earlier run. It sits at
# the upper bound of SMAPE (200); re-run after a completed training to confirm.
print(avg_test_smape)

199.9999737802848


In [27]:
# NOTE(review): executed (count 27) before the latest training cell (count 31);
# the value shown below is from an earlier run.
print(avg_train_smape)

199.9999950668809
