In [72]:
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# TODO: add failure mode analysis


In [73]:
# Load the merged dataset; the path is relative to the notebook's directory.
data = pd.read_csv(filepath_or_buffer="../Merging/Merged_Data.csv")

In [74]:

# Parse 'period' into datetimes and order the rows chronologically.
data['period'] = pd.to_datetime(data['period'])
data = data.sort_values(by='period')

# Distinct plant codes, collected before the date cut-off below is applied.
plant_codes = data['plantCode'].unique()

# Keep only observations dated 2019-01-01 or later.
data = data.loc[data['period'] >= '2019-01-01']

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [75]:
# Predictor columns handed to every model (same set for training and testing).
# NOTE(review): 'Zip' and 'plantCode' look like identifiers rather than
# measurements -- confirm they are meant to be used as numeric features.
model_features = [
    'ELEVATION', 'TEMP',
    'WDSP', 'MXSPD', 'GUST',
    'MAX', 'MIN',
    'PRCP', 'SNDP',
    'TEMPEXT_BASE40', 'TEMPEXT_BASE45', 'TEMPEXT_BASE50',
    'OVER_60', 'OVER_70', 'OVER_80',
    'UNDER_40', 'UNDER_30', 'UNDER_20',
    'SUM_OVER_UNDER',
    'LATITUDE', 'LONGITUDE',
    'Zip', 'plantCode',
]

In [76]:
def train_and_evaluate(model, model_features, data, param_grid=None,
                       start_train='2019-01-01', end_train='2022-12-31',
                       start_test='2023-01-01', end_test='2023-12-31'):
    """Score ``model`` with an expanding-window, year-by-year evaluation.

    For every calendar year after the first one found in ``data['period']``,
    the model is trained on all earlier years and evaluated on that year.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. When ``param_grid``
        is None this very object is (re)fitted on each training window.
    model_features : list of str
        Columns of ``data`` used as predictors. Missing values in these
        columns are replaced by 0 before fitting and predicting.
    data : pandas.DataFrame
        Must contain ``'period'`` (anything ``pd.to_datetime`` accepts),
        ``'total-consumption'`` and every column in ``model_features``.
        Rows whose consumption is not strictly positive are ignored.
        The caller's frame is not modified.
    param_grid : dict or None, default None
        When given, a ``GridSearchCV`` (``cv=5``, negative MSE scoring) is run
        on each training window and its best estimator makes the predictions.
    start_train, end_train, start_test, end_test : str
        Currently unused: the folds are derived from the years present in the
        data. Kept in the signature so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns 'Mean Squared Error', 'R-squared',
        'Training Years', 'Test Year' and 'Best Parameters' (None when no grid
        search was run). Empty, but with the same columns, when fewer than two
        distinct years are available.
    """
    result_columns = ['Mean Squared Error', 'R-squared', 'Training Years',
                      'Test Year', 'Best Parameters']

    # Keep strictly positive consumption only (zeros are dropped as well as
    # negatives). Copy so adding the parsed 'period' never touches the
    # caller's frame.
    data = data[data['total-consumption'] > 0].copy()
    data['period'] = pd.to_datetime(data['period'])

    # Fill missing predictors once, up front, so every fold below trains and
    # predicts on the NaN-free matrix.
    features = data[model_features].fillna(0)
    target = data['total-consumption']
    row_year = data['period'].dt.year
    years = sorted(row_year.unique())

    results = []
    # Expanding window: the n-th fold trains on the first n years and tests on
    # the year that follows; the first year is therefore never a test year.
    for n_train_years, test_year in enumerate(years[1:], start=1):
        train_years = years[:n_train_years]
        train_mask = row_year.isin(train_years)
        test_mask = row_year == test_year

        X_train, y_train = features[train_mask], target[train_mask]
        X_test, y_test = features[test_mask], target[test_mask]

        if param_grid is not None:
            # NOTE(review): cv=5 uses scikit-learn's default (Stratified)KFold
            # splitter inside the training window, not a time-ordered split --
            # confirm that is acceptable for this data.
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            fitted_model = grid_search.best_estimator_
            best_params = grid_search.best_params_
        else:
            model.fit(X_train, y_train)
            fitted_model = model
            best_params = None

        y_pred = fitted_model.predict(X_test)

        results.append({
            'Mean Squared Error': mean_squared_error(y_test, y_pred),
            'R-squared': r2_score(y_test, y_pred),
            'Training Years': train_years,
            'Test Year': test_year,
            'Best Parameters': best_params,
        })

    # Passing the columns explicitly keeps the schema stable even when no fold
    # could be built.
    return pd.DataFrame(results, columns=result_columns)

In [77]:
# Hyperparameter grid searched for the random forest: number of trees,
# maximum tree depth and minimum number of samples required to split a node.
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]}

# Fix the seed so the forest's random draws -- and therefore the reported
# scores and selected parameters -- are the same on every Restart & Run All.
rf_model = RandomForestRegressor(random_state=42)

# Expanding-window evaluation with a grid search on each training window.
train_and_evaluate(rf_model, model_features, data, param_grid_rf)

Unnamed: 0,Mean Squared Error,R-squared,Training Years,Test Year,Best Parameters
0,437994900000.0,0.82582,[2019],2020,"{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}"
1,473068400000.0,0.767302,"[2019, 2020]",2021,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}"
2,218780000000.0,0.890279,"[2019, 2020, 2021]",2022,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}"
3,493548900000.0,0.872777,"[2019, 2020, 2021, 2022]",2023,"{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}"


In [78]:
# Search space for k-nearest-neighbours regression: neighbourhood size,
# neighbour weighting and the neighbour-search algorithm.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

knn_model = KNeighborsRegressor()

# Run train_and_evaluate with a grid search on each training window.
train_and_evaluate(knn_model, model_features, data, param_grid_knn)


Unnamed: 0,Mean Squared Error,R-squared,Training Years,Test Year,Best Parameters
0,293056700000.0,0.883458,[2019],2020,"{'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'uniform'}"
1,225891400000.0,0.888886,"[2019, 2020]",2021,"{'algorithm': 'ball_tree', 'n_neighbors': 9, 'weights': 'distance'}"
2,207447800000.0,0.895963,"[2019, 2020, 2021]",2022,"{'algorithm': 'ball_tree', 'n_neighbors': 9, 'weights': 'distance'}"
3,508229300000.0,0.868993,"[2019, 2020, 2021, 2022]",2023,"{'algorithm': 'ball_tree', 'n_neighbors': 9, 'weights': 'distance'}"


In [79]:
# GaussianNB is run with its default settings: no parameter grid is passed, so
# train_and_evaluate fits the model directly on each training window.
# NOTE(review): GaussianNB is a classifier (it does expose `var_smoothing` and
# `priors`), yet it is scored here on 'total-consumption' with MSE / R-squared
# -- confirm that treating the target values as class labels is intended.
nb_model = GaussianNB()
train_and_evaluate(nb_model, model_features, data, param_grid=None)


Unnamed: 0,Mean Squared Error,R-squared,Training Years,Test Year,Best Parameters
0,722000600000.0,0.712878,[2019],2020,
1,567581800000.0,0.720812,"[2019, 2020]",2021,
2,508741100000.0,0.744861,"[2019, 2020, 2021]",2022,
3,1047925000000.0,0.729874,"[2019, 2020, 2021, 2022]",2023,
