Supervised Unsupervised</br>
- Uses the cluster labels produced by the Unsupervised Learning models as input features to the best model identified in Supervised Learning. The resulting R2 and MSE scores are stored in Cluster_Results.p along with each unsupervised model's parameter key. Includes tables of the top models' performance and scores.
- Dependencies: Merged_Data.csv, Clusters_Norm.p, Clusters_Raw.p, S_Scores.p</br>
- Output: Cluster_Results.p</br></br>


In [51]:
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [52]:
# Load the merged dataset, list its feature columns and preview the first rows
data = pd.read_csv("../Merging/Merged_Data.csv")
print(data.columns)
data.head()

Index(['period', 'plantCode', 'plantName', 'fuel2002', 'fuelTypeDescription',
       'state', 'stateDescription', 'primeMover', 'total-consumption',
       'total-consumption-units', 'total-consumption-btu', 'generation',
       'gross-generation', 'consumption-for-eg', 'consumption-for-eg-units',
       'consumption-for-eg-btu', 'average-heat-content',
       'average-heat-content-units', 'total-consumption-btu-units',
       'generation-units', 'gross-generation-units',
       'consumption-for-eg-btu-units', 'X', 'Y', 'FID', 'OBJECTID',
       'Plant_Code', 'Plant_Name', 'Utility_ID', 'Utility_Na', 'sector_nam',
       'Street_Add', 'City', 'County', 'State', 'Zip', 'PrimSource',
       'source_des', 'tech_desc', 'Install_MW', 'Total_MW', 'Bat_MW', 'Bio_MW',
       'Coal_MW', 'Geo_MW', 'Hydro_MW', 'HydroPS_MW', 'NG_MW', 'Nuclear_MW',
       'Crude_MW', 'Solar_MW', 'Wind_MW', 'Other_MW', 'Source', 'Period',
       'Longitude', 'Latitude', 'LATITUDE', 'LONGITUDE', 'DATE', 'ELEVATION',


Unnamed: 0,period,plantCode,plantName,fuel2002,fuelTypeDescription,state,stateDescription,primeMover,total-consumption,total-consumption-units,...,TEMPEXT_BASE40,TEMPEXT_BASE45,TEMPEXT_BASE50,OVER_60,OVER_70,OVER_80,UNDER_40,UNDER_30,UNDER_20,SUM_OVER_UNDER
0,2020-01-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,84432,MMBtu per Mcf,...,23.723333,18.723333,14.123333,0.6,0.3,0.0,0.0,0.0,0.0,0.9
1,2020-12-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,11343,MMBtu per Mcf,...,17.293548,12.487097,8.706452,0.354839,0.032258,0.0,0.0,0.0,0.0,0.387097
2,2023-08-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,109965,MMBtu per Mcf,...,46.403226,41.403226,36.403226,1.0,1.0,1.0,0.0,0.0,0.0,3.0
3,2023-04-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,12548,MMBtu per Mcf,...,32.903333,27.903333,22.903333,1.0,0.833333,0.0,0.0,0.0,0.0,1.833333
4,2023-02-01,57950,MSC Croswell,NG,Natural Gas,MI,Michigan,ALL,96587,MMBtu per Mcf,...,25.867857,20.867857,15.867857,0.785714,0.357143,0.0,0.0,0.0,0.0,1.142857


In [84]:
# Load the cluster labels computed on normalized and on raw data
with open('Clusters_Norm.p', 'rb') as norm_file:
    Norm = pickle.load(norm_file)

with open('Clusters_Raw.p', 'rb') as raw_file:
    Raw = pickle.load(raw_file)

# Outer key: data type; the inner levels are indexed below by model key, then parameter key
plant_clusters = {'norm': Norm, 'raw': Raw}

# Preview one clustering
plant_clusters['norm']['H']['3H_Lcomp']

Unnamed: 0,plantCode,0,1,2
0,9,0.0,0.0,1.0
1,99,0.0,0.0,1.0
2,136,0.0,0.0,1.0
3,298,0.0,0.0,1.0
4,550,0.0,0.0,1.0
...,...,...,...,...
478,66596,1.0,0.0,0.0
479,66597,1.0,0.0,0.0
480,66612,1.0,0.0,0.0
481,66613,1.0,0.0,0.0


In [54]:
# Training / testing windows (inclusive date bounds) and the feature columns fed to the model
start_train, end_train = '2019-01-01', '2022-12-31'
start_test, end_test = '2023-01-01', '2023-12-31'

model_features = [
    'ELEVATION', 'TEMP', 'WDSP', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP',
    'TEMPEXT_BASE40', 'TEMPEXT_BASE45', 'TEMPEXT_BASE50',
    'OVER_60', 'OVER_70', 'OVER_80',
    'UNDER_40', 'UNDER_30', 'UNDER_20',
    'SUM_OVER_UNDER',
    'LATITUDE', 'LONGITUDE', 'Zip', 'plantCode',
]
# add feature for distance to weather station from power plant?


In [55]:
#Feature processing and train/test splits
data['period'] = pd.to_datetime(data['period'])
data = data.sort_values(by='period')
plant_codes = data['plantCode'].unique()

# Keep rows from the start of the training window onwards whose consumption is strictly
# positive (zero readings are dropped as well as negative ones).
# .copy() makes the filtered frame independent of the original, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
data = data[(data['period'] >= start_train) & (data['total-consumption'] > 0)].copy()

# Convert 'state' column to categorical type
data['state'] = data['state'].astype('category')

# Extract features and target variable
X = data[model_features]
y = data['total-consumption']

# Filter the data for training and testing (both window bounds are inclusive)
train_data = data[data['period'].between(start_train, end_train)]
test_data = data[data['period'].between(start_test, end_test)]

# Extract features and target variable for training
X_train = train_data[model_features]
y_train = train_data['total-consumption']

# Extract features and target variable for testing
X_test = test_data[model_features]
y_test = test_data['total-consumption']

In [56]:
def train_and_evaluate_with_clusters(cluster_labels, model, X_train, X_test, y_train, y_test,
                                     scores_path='S_Scores.p'):
    '''Refit `model` once per clustering, with that clustering's labels appended as extra features.

    Input: cluster_labels = nested dictionary {data type: {clustering model key: {parameter key:
            DataFrame of (one hot encoded) cluster labels with a 'plantCode' column}}}
        model = what model to use for the evaluation; must expose fit(X, y) and predict(X) and is
            refit in place for every clustering
        X_train = X training data - needs a 'plantCode' column to join the cluster labels on
        X_test = X testing data - needs a 'plantCode' column to join the cluster labels on
        y_train = y training data
        y_test = y test data, used to evaluate model predictions
        scores_path = pickle file holding, per data type, a table with the columns
            'Parameter Key', 'params', 's_score' and 'k'
    Output: Nested dictionary - the keys of cluster_labels (e.g. 'raw' and 'norm') are outer keys,
        while inner keys are 'Results', which lists every clustering with its MSE and R2 sorted by
        R2 (descending), and 'Best', which holds the highest-R2 clustering of each model type
        along with its params, Silhouette score (s_score) and k.
        A model type with no clusterings, or with no valid (non-NaN) R2, is left out of 'Best'.
    '''
    result_columns = ['Model', 'Parameter Key', 'Mean Squared Error', 'R-squared']

    def add_cluster_features(features, clusters):
        # get_dummies only expands object/string/category columns; label columns that are
        # already numeric one-hot indicators pass through unchanged.
        one_hot_clusters = pd.get_dummies(clusters)
        # many_to_one: fail loudly if a plant appears twice in a clustering, because the
        # duplicated rows would no longer line up with the target vector.
        merged_data = pd.merge(features, one_hot_clusters, how='left', on='plantCode',
                               validate='many_to_one')
        # Cluster columns may be named with integers (0, 1, 2, ...); scikit-learn estimators
        # only accept feature names when every column name is a string.
        merged_data.columns = [str(column) for column in merged_data.columns]
        return merged_data

    results = {data_key: {} for data_key in cluster_labels}
    for raw_norm_key, raw_norm_clusters in cluster_labels.items():
        raw_norm_results = []
        raw_norm_best = []
        for model_key, model_clusters in raw_norm_clusters.items():
            # Start below any achievable R2 so the best clustering is still recorded when
            # every score is zero or negative.
            best_row = None
            best_r2 = float('-inf')
            for key, clusters_df in model_clusters.items():
                X_with_clusters_train = add_cluster_features(X_train, clusters_df)
                X_with_clusters_test = add_cluster_features(X_test, clusters_df)

                # Fit the model on the training data
                model.fit(X_with_clusters_train, y_train)

                # Make predictions on the test set
                y_pred = model.predict(X_with_clusters_test)

                # Evaluate the model
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                row = {'Model': model_key, 'Parameter Key': key,
                       'Mean Squared Error': mse, 'R-squared': r2}
                raw_norm_results.append(row)

                # Strict comparison: on ties the first clustering seen is kept
                if r2 > best_r2:
                    best_r2 = r2
                    best_row = row

            if best_row is not None:
                raw_norm_best.append(dict(best_row))

        # Explicit columns keep the sort working even when nothing was evaluated
        best_df = pd.DataFrame(raw_norm_best, columns=result_columns)
        results_df = pd.DataFrame(raw_norm_results, columns=result_columns)
        results[raw_norm_key]['Best'] = (best_df.sort_values(by=['R-squared'], ascending=False)
                                         .reset_index(drop=True))
        results[raw_norm_key]['Results'] = (results_df.sort_values(by=['R-squared'], ascending=False)
                                            .reset_index(drop=True))

    # NOTE: unpickling can execute arbitrary code - only point scores_path at a trusted file.
    with open(scores_path, 'rb') as f:
        scores = pickle.load(f)

    # Attach the clustering parameters and scores to each best-performing clustering
    for data_key in results:
        results[data_key]['Best'] = pd.merge(results[data_key]['Best'],
                                             scores[data_key][['Parameter Key', 'params', 's_score', 'k']],
                                             how="left", on='Parameter Key')

    return results


In [56]:
#This takes a long time to run! Potentially more than a day if you are on a personal machine
#Run the best model from supervised iterating though all the clusters to find which result in greatest increase in predictive performance

# Define the number of trees in the forest.
n_estimators = 150  # Example number of trees, you can change it as needed

# Create a Random Forest model
rf_model = RandomForestRegressor(n_estimators=n_estimators,
                                 max_depth=20,
                                 min_samples_split=10,
                                 random_state=0)

# Call the function with the model and other parameters
results = train_and_evaluate_with_clusters(plant_clusters, rf_model, X_train, X_test, y_train, y_test)

# Persist the results; the context manager guarantees the file is flushed and closed
with open("Cluster_Results.p", "wb") as results_file:
    pickle.dump(results, results_file)


In [79]:
# Reload the stored evaluation results from disk
with open('Cluster_Results.p', 'rb') as results_file:
    results = pickle.load(results_file)

In [81]:
# Best clustering per model type - raw data (parameter key column hidden)
results['raw']['Best'].drop(columns='Parameter Key')

Unnamed: 0,Model,params,Mean Squared Error,R-squared,s_score,k
0,KM,"{'init': 'random', 'max_iter': 26}",476858500000.0,0.877079,0.703891,10
1,DB,"{'eps': 6000, 'mins': 5}",490175100000.0,0.873647,-0.435829,7
2,AP,"{'pref': 0, 'max_iter': 1, 'damp': 0.5}",491192000000.0,0.873384,-1.0,1
3,H,"{'link': 'single', 'dist': 'None'}",498252500000.0,0.871564,0.561521,7


In [80]:
# Best clustering per model type - normalized data (parameter key column hidden)
results['norm']['Best'].drop(columns='Parameter Key')

Unnamed: 0,Model,params,Mean Squared Error,R-squared,s_score,k
0,DB,"{'eps': 0.0002836, 'mins': 28}",473675700000.0,0.8779,-0.184491,5
1,H,"{'link': 'single', 'dist': 'None'}",489400100000.0,0.873846,0.427557,18
2,KM,"{'init': 'k-medoids++', 'max_iter': 26}",490311000000.0,0.873612,0.727147,3
3,AP,"{'pref': 0, 'max_iter': 1, 'damp': 0.5}",491192000000.0,0.873384,-1.0,1


In [82]:
# Fifteen highest-ranked clusterings overall - raw data
results['raw']['Results'].head(15)

Unnamed: 0,Model,Parameter Key,Mean Squared Error,R-squared
0,KM,10KM_Irand_MI126,476858500000.0,0.877079
1,KM,10KM_Irand_MI26,476858500000.0,0.877079
2,KM,10KM_Irand_MI226,476858500000.0,0.877079
3,KM,10KM_Irand_MI251,476858500000.0,0.877079
4,KM,10KM_Irand_MI276,476858500000.0,0.877079
5,KM,10KM_Irand_MI151,476858500000.0,0.877079
6,KM,10KM_Irand_MI176,476858500000.0,0.877079
7,KM,10KM_Irand_MI101,476858500000.0,0.877079
8,KM,10KM_Irand_MI76,476858500000.0,0.877079
9,KM,10KM_Irand_MI51,476858500000.0,0.877079


In [83]:
# Fifteen highest-ranked clusterings overall - normalized data
results['norm']['Results'].head(15)

Unnamed: 0,Model,Parameter Key,Mean Squared Error,R-squared
0,DB,5DB_M28eps0.000284,473675700000.0,0.8779
1,H,18H_Lsing,489400100000.0,0.873846
2,DB,4DB_M5eps0.000204,489471300000.0,0.873828
3,H,6H_Laver,489617000000.0,0.87379
4,H,6H_Laver_DT0.07,489617000000.0,0.87379
5,H,6H_Laver_DT0.061,489617000000.0,0.87379
6,H,20H_Laver,489640100000.0,0.873785
7,H,8H_Lcomp_DT0.088,489696000000.0,0.87377
8,H,8H_Lcomp,489696000000.0,0.87377
9,H,4H_Laver_DT0.088,489751100000.0,0.873756


In [57]:
# Exploring results with a lower number of estimators to look at sensitivity of relative cluster performance to number of trees
n_estimators = 5  # Example number of trees, you can change it as needed

# Create a Random Forest model
rf_model = RandomForestRegressor(n_estimators=n_estimators,
                                 max_depth=20,
                                 min_samples_split=10,
                                 random_state=0)

# Call the function with the model and other parameters
results = train_and_evaluate_with_clusters(plant_clusters, rf_model, X_train, X_test, y_train, y_test)

# Persist the results; the context manager guarantees the file is flushed and closed
with open("Cluster_Results_5.p", "wb") as results_file:
    pickle.dump(results, results_file)

In [75]:
# Reload the stored 5-tree evaluation results from disk
with open('Cluster_Results_5.p', 'rb') as results_file:
    results = pickle.load(results_file)

In [76]:
# Best clustering per model type - normalized data
results['norm']['Best']

Unnamed: 0,Model,Parameter Key,params,Mean Squared Error,R-squared
0,KM,6KM_Irand_MI1,"{'init': 'random', 'max_iter': 1}",519565100000.0,0.866071
1,H,9H_Laver,"{'link': 'average', 'dist': 'None'}",525780700000.0,0.864469
2,DB,5DB_M5eps0.000164,"{'eps': 0.0001642, 'mins': 5}",526071600000.0,0.864394
3,AP,481AP_MI1_D0.5,"{'pref': 1, 'max_iter': 1, 'damp': 0.5}",528601000000.0,0.863742


In [77]:
# Best clustering per model type - raw data
results['raw']['Best']

Unnamed: 0,Model,Parameter Key,params,Mean Squared Error,R-squared
0,KM,9KM_Ibuil_MI26,"{'init': 'build', 'max_iter': 26}",500000000000.0,0.871114
1,H,7H_Lsing,"{'link': 'single', 'dist': 'None'}",516750800000.0,0.866796
2,DB,8DB_M5eps5000,"{'eps': 5000, 'mins': 5}",528951500000.0,0.863651
3,AP,1AP_MI1_D0.5,"{'pref': 0, 'max_iter': 1, 'damp': 0.5}",534130100000.0,0.862316
