In [30]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import os


In [31]:
df = pd.read_csv('/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/32.csv')

In [39]:
subset20split = '/Users/mainoahmuna/Downloads/Projects/FermataEnergy_BTTStudio/src/data/subset20_data.json'

In [29]:
PATH = '/Users/mainoahmuna/Google Drive/My Drive/Team-Fermata-Energy/processed_data/processed_weather_and_load/'

In [8]:
df.head()

Unnamed: 0,Index,out.electricity.total.energy_consumption,Dry Bulb Temperature [°C],Relative Humidity [%],heat_index,hour,month,is_weekday,is_holiday,max_load_hourly,max_temp_hourly,min_temp_hourly,bldg_id
0,0,5.420033,-6.1,42.781847,21.02,1,1,1,0,5.476113,-6.1,-6.55,32
1,1,5.476113,-6.25,43.350762,20.75,1,1,1,0,5.476113,-6.1,-6.55,32
2,2,5.476113,-6.4,43.919676,20.48,1,1,1,0,5.476113,-6.1,-6.55,32
3,3,5.476113,-6.55,44.488591,20.21,1,1,1,0,5.476113,-6.1,-6.55,32
4,4,5.476113,-6.7,45.057505,19.94,2,1,1,0,5.476113,-6.7,-6.7,32


In [9]:
df.shape

(35037, 13)

In [10]:
buildings = df['bldg_id'].tolist()

In [11]:
def smape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE).

    Parameters:
        y_true (array-like): True values.
        y_pred (array-like): Predicted values.

    Returns:
        float: SMAPE value.
    """
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    smape_value = np.mean(numerator / denominator) * 100
    return smape_value

In [12]:
def create_Y_X(df_load):
    """
    Create Y and X variables for linear regression model.

    Parameters:
        df_load (pandas.DataFrame): DataFrame containing load data.

    Returns:
        tuple: Tuple containing Y and X variables.  
    """
    for i in range(1, 97):
        df_load[f"shift_{i}"] = df["out.electricity.total.energy_consumption"].shift(i)

    df_load = df_load.dropna()

    Y = df_load['out.electricity.total.energy_consumption']
    X = df_load.drop(['out.electricity.total.energy_consumption', 'Index', 'bldg_id'], axis=1)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return Y, X_scaled

In [13]:
Y, X = create_Y_X(df)

In [43]:
def train_sgd_regressor(directory, split_file, target_column='out.electricity.total.energy_consumption', chunk_size=1000):
    """
    Train the SGD Regressor model using data from building CSV files, using a predefined train/test split.

    Parameters:
        directory (str): Path to the directory containing building CSV files.
        split_file (str): Path to the JSON file with predefined train and test split.
        target_column (str): The name of the target column.
        chunk_size (int): Number of rows to process in each chunk.

    Returns:
        model: Trained model.
        test_files: List of test files used.
        avg_train_smape: Average SMAPE across all training buildings.
        avg_test_smape: Average SMAPE across all test buildings.
    """
    model = SGDRegressor()

    # Load predefined train and test splits from JSON file
    with open(split_file, 'r') as file:
        split_data = json.load(file)
        train_files = [f"{directory}/{file_id}" for file_id in split_data['train_bldg_ids']]
        test_files = [f"{directory}/{file_id}" for file_id in split_data['test_bldg_ids']]

    # Lists to store individual SMAPE scores for training and testing
    train_smape_list = []
    test_smape_list = []

    # Training phase: loop over the training set files
    for filename in tqdm(train_files, desc="Training on buildings", unit="file"):
        try:
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                Y, X = create_Y_X(chunk)

                # Train the model with the current chunk
                model.partial_fit(X, Y)

                # Calculate SMAPE on the training chunk and store it
                Y_pred_train = model.predict(X)
                smape_train = smape(Y, Y_pred_train)
                train_smape_list.append(smape_train)

        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            continue  # Skip to the next file if there's an error

    # Testing phase: loop over the test set files
    for filename in tqdm(test_files, desc="Testing on buildings", unit="file"):
        try:
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                Y, X = create_Y_X(chunk)

                # Make predictions on the current chunk
                Y_pred = model.predict(X)

                # Compute SMAPE for the current test building
                smape_value = smape(Y, Y_pred)
                test_smape_list.append(smape_value)

        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            continue  # Skip to the next file if there's an error

    # Compute average SMAPE for training and testing
    avg_train_smape = np.mean(train_smape_list)
    avg_test_smape = np.mean(test_smape_list)

    return model, test_files, avg_train_smape, avg_test_smape

In [44]:
model, testfiles, avg_train_smape, avg_test_smape = train_sgd_regressor(PATH, subset20split)

Training on buildings: 100%|██████████| 5120/5120 [55:05<00:00,  1.55file/s] 
Testing on buildings: 100%|██████████| 1281/1281 [13:05<00:00,  1.63file/s]


## Linear Regression:
Average Test SMAPE: 86.93336613657864

Average Train SMAPE: 52.37481315660798

In [45]:
print(avg_test_smape)

86.93336613657864


In [46]:
print(avg_train_smape)

52.37481315660798
