In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [16]:
data = pd.read_csv("../Merging/Merged_Data.csv")
print(data.columns)

Index(['period', 'plantCode', 'plantName', 'fuel2002', 'fuelTypeDescription',
       'state', 'stateDescription', 'primeMover', 'total-consumption',
       'total-consumption-units', 'total-consumption-btu', 'generation',
       'gross-generation', 'consumption-for-eg', 'consumption-for-eg-units',
       'consumption-for-eg-btu', 'average-heat-content',
       'average-heat-content-units', 'total-consumption-btu-units',
       'generation-units', 'gross-generation-units',
       'consumption-for-eg-btu-units', 'X', 'Y', 'FID', 'OBJECTID',
       'Plant_Code', 'Plant_Name', 'Utility_ID', 'Utility_Na', 'sector_nam',
       'Street_Add', 'City', 'County', 'State', 'Zip', 'PrimSource',
       'source_des', 'tech_desc', 'Install_MW', 'Total_MW', 'Bat_MW', 'Bio_MW',
       'Coal_MW', 'Geo_MW', 'Hydro_MW', 'HydroPS_MW', 'NG_MW', 'Nuclear_MW',
       'Crude_MW', 'Solar_MW', 'Wind_MW', 'Other_MW', 'Source', 'Period',
       'Longitude', 'Latitude', 'LATITUDE', 'LONGITUDE', 'DATE', 'ELEVATION',


In [17]:
# Load cluster labels
with open('Norm_Plant_Clusters.p', 'rb') as f:
    cluster_labels = pickle.load(f)

print(cluster_labels)

{'N_KM3':      plantCode  N_0  N_1  N_2
0            9  0.0  0.0  1.0
1           99  0.0  0.0  1.0
2          136  0.0  0.0  1.0
3          298  0.0  0.0  1.0
4          550  0.0  1.0  0.0
..         ...  ...  ...  ...
478      66596  1.0  0.0  0.0
479      66597  1.0  0.0  0.0
480      66612  1.0  0.0  0.0
481      66613  1.0  0.0  0.0
482      66614  1.0  0.0  0.0

[483 rows x 4 columns], 'N_H3':      plantCode  N_0  N_1  N_2
0            9  0.0  0.0  1.0
1           99  0.0  0.0  1.0
2          136  0.0  0.0  1.0
3          298  0.0  0.0  1.0
4          550  0.0  0.0  1.0
..         ...  ...  ...  ...
478      66596  1.0  0.0  0.0
479      66597  1.0  0.0  0.0
480      66612  1.0  0.0  0.0
481      66613  1.0  0.0  0.0
482      66614  1.0  0.0  0.0

[483 rows x 4 columns], 'N_KM4':      plantCode  N_0  N_1  N_2  N_3
0            9  0.0  0.0  1.0  0.0
1           99  0.0  0.0  1.0  0.0
2          136  0.0  0.0  1.0  0.0
3          298  0.0  0.0  1.0  0.0
4          550  0.0  1.0  0.

In [18]:
# Define the time periods and features for training and testing
start_train = '2019-01-01'
end_train = '2022-12-31'
start_test = '2023-01-01'
end_test = '2023-12-31'
model_features = ['ELEVATION', 'TEMP',
       'WDSP', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP','TEMPEXT_BASE40', 'TEMPEXT_BASE45', 'TEMPEXT_BASE50', 
       'OVER_60', 'OVER_70', 'OVER_80', 'UNDER_40', 'UNDER_30', 'UNDER_20',
       'SUM_OVER_UNDER', 'LATITUDE','LONGITUDE','Zip','plantCode']
# add feature for distance to weather station from power plant?


In [19]:

data['period'] = pd.to_datetime(data['period'])
data.sort_values(by='period', inplace=True)
plant_codes = data['plantCode'].unique()
data = data[data['period'] >= '2019-01-01']

# Exclude negative consumption values
data = data[data['total-consumption'] > 0]

# Convert 'state' column to categorical type
data['state'] = data['state'].astype('category')
    
# Extract features and target variable
X = data[model_features]
y = data['total-consumption']

# Filter the data for training and testing
train_data = data[(data['period'] >= start_train) & (data['period'] <= end_train)]
test_data = data[(data['period'] >= start_test) & (data['period'] <= end_test)]

# Extract features and target variable for training
X_train = train_data[model_features]
y_train = train_data['total-consumption']

# Extract features and target variable for testing
X_test = test_data[model_features]
y_test = test_data['total-consumption']

In [20]:
def train_and_evaluate_with_clusters(cluster_labels, model, start_train, end_train, start_test, end_test, model_features, data):
    results = []
    for key, clusters_df in cluster_labels.items(): # For "key cluster" in clusters (dictionary of dictionaries)
        # Convert cluster labels into one-hot encoded features
        def cluster_to_one_hot(data, clusters):
            one_hot_clusters = pd.get_dummies(clusters)
            merged_data = pd.merge(data, one_hot_clusters, how='left', left_on='plantCode', right_on='plantCode')
            return merged_data
        
        

        X_with_clusters_train = cluster_to_one_hot(X_train.copy(), clusters_df)
        X_with_clusters_test = cluster_to_one_hot(X_test.copy(), clusters_df)


        X_with_clusters_train.columns = X_with_clusters_train.columns.astype(str)
        X_with_clusters_test.columns = X_with_clusters_test.columns.astype(str)
        
        # Fit the model on the training data
        model.fit(X_with_clusters_train, y_train)

        # Make predictions on the test set
        y_pred = model.predict(X_with_clusters_test)

        # Evaluate the model
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results.append({'Cluster Combo': key, 'Mean Squared Error': mse, 'R-squared': r2}) # descending by R^2 value

    results_df = pd.DataFrame(results)
    print(results_df)


In [21]:

# Create a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100)

# Call the function with the model and other parameters
train_and_evaluate_with_clusters(cluster_labels, rf_model, start_train, end_train, start_test, end_test, model_features, data)

   Cluster Combo  Mean Squared Error  R-squared
0          N_KM3        5.447034e+11   0.859591
1           N_H3        5.472748e+11   0.858928
2          N_KM4        5.813762e+11   0.850138
3           N_H4        5.203913e+11   0.865858
4          N_KM5        5.610422e+11   0.855379
..           ...                 ...        ...
90  N_H4_d0.2145        5.570771e+11   0.856401
91  N_H4_d0.2263        5.184757e+11   0.866352
92  N_H4_d0.2382        5.066836e+11   0.869391
93    N_H4_d0.25        5.368365e+11   0.861619
94           AP1        5.509478e+11   0.857981

[95 rows x 3 columns]


In [22]:
knn_model = KNeighborsRegressor(n_neighbors=5)

# Call the train_test_and_plot function
train_and_evaluate_with_clusters(cluster_labels, knn_model, start_train, end_train, start_test, end_test, model_features, data)


   Cluster Combo  Mean Squared Error  R-squared
0          N_KM3        5.138354e+11   0.867548
1           N_H3        5.137280e+11   0.867575
2          N_KM4        5.138354e+11   0.867548
3           N_H4        5.137280e+11   0.867575
4          N_KM5        5.127779e+11   0.867820
..           ...                 ...        ...
90  N_H4_d0.2145        5.137280e+11   0.867575
91  N_H4_d0.2263        5.137280e+11   0.867575
92  N_H4_d0.2382        5.137280e+11   0.867575
93    N_H4_d0.25        5.137280e+11   0.867575
94           AP1        5.137280e+11   0.867575

[95 rows x 3 columns]


In [23]:
nb_model = GaussianNB()
train_and_evaluate_with_clusters(cluster_labels, nb_model, start_train, end_train, start_test, end_test, model_features, data)


   Cluster Combo  Mean Squared Error  R-squared
0          N_KM3        1.048007e+12   0.729853
1           N_H3        1.047742e+12   0.729922
2          N_KM4        1.036346e+12   0.732859
3           N_H4        1.047742e+12   0.729922
4          N_KM5        1.036367e+12   0.732854
..           ...                 ...        ...
90  N_H4_d0.2145        1.047742e+12   0.729922
91  N_H4_d0.2263        1.047742e+12   0.729922
92  N_H4_d0.2382        1.047742e+12   0.729922
93    N_H4_d0.25        1.047742e+12   0.729922
94           AP1        1.047925e+12   0.729874

[95 rows x 3 columns]
