In [59]:
import pandas as pd
import numpy as np
import tensorflow as tf 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("https://raw.githubusercontent.com/nrharbeck/EnergyPredictionML/main/DataProcessing/EIA_DSIRE_Data_Tech.csv")

print(df)

       Unnamed: 0  ...  No Programs Available_Regulatory Policy
0               0  ...                                        0
1               1  ...                                        0
2               2  ...                                        0
3               3  ...                                        0
4               4  ...                                        0
...           ...  ...                                      ...
13590       13590  ...                                        0
13591       13591  ...                                        0
13592       13592  ...                                        0
13593       13593  ...                                        0
13594       13594  ...                                        0

[13595 rows x 188 columns]


In [61]:
#Drop any NA rows for model building
df = df.dropna(how='any')

#Now remove

#Remove CO2 emissions
df = df[~df.series_id.str.contains("EMISS.CO2-TOTV")]

#Make a new column with the EIA generation categories
df['generation_energy'] = df['series_id'].str[9:-8]

#Guide to see if strings changse from https://stackoverflow.com/questions/40348541/pandas-diff-with-string
df['Series_Change'] = df['series_id'].ne(df['series_id'].shift().bfill()).astype(int)
df["Generation_Diff"] = np.where(-df['Generation'].diff() > 0, 1, 0) 
df['Generation_Increase'] = np.where((df["Generation_Diff"] > 0) & (df['Series_Change'] == 0), 1, 0)

#Split data into features and target. Descriptive EIA data is removed here.
X = df.drop(columns=['Generation', 'Series_Change', 'Generation_Diff', 'Generation_Increase', 'Unnamed: 0', 'Index', 'units', 'Copyright', 'description','end','f','geography','iso3166','name','source','start', 'series_id'])
y = df['Generation_Increase']

#Encode categorical features
#print(X.columns)
X = pd.get_dummies(X, columns=['Date','generation_energy','State'])


In [64]:
#Split data into train and test sets. Validation with the training set will be incorporated into the pipeline below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [65]:
#Compare Performance with a Logistic Regression model
#Guide from Professor Yuxiao Huang, The George Washinton University
from sklearn.preprocessing import LabelEncoder
#Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

# Implement me
le = LabelEncoder()
y_train = le.fit_transform(y_train)

print(pd.DataFrame(data=y_train, columns=['Generation_Increase'])['Generation_Increase'].value_counts())

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

clfs = {'lr': LogisticRegression(random_state=0),
        'mlp': MLPClassifier(random_state=0),
        'dt': DecisionTreeClassifier(random_state=0),
        'rf': RandomForestClassifier(random_state=0),
        'xgb': XGBClassifier(seed=0)}

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe_clfs = {}

for name, clf in clfs.items():
    pipe_clfs[name] = Pipeline([('StandardScaler', StandardScaler()),('clf', clf)])

param_grids = {}
C_range = [10 ** i for i in range(-4, 5)]

#Logistic Regression Parameter Grid
param_grid = [{'clf__multi_class': ['ovr'], 
               'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
               'clf__C': C_range},
              
              {'clf__multi_class': ['multinomial'],
               'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
               'clf__C': C_range}]

param_grids['lr'] = param_grid

#MLP Parameter Grid
param_grid = [{'clf__hidden_layer_sizes': [10, 100],
               'clf__activation': ['identity', 'logistic', 'tanh', 'relu']}]
param_grids['mlp'] = param_grid

#Decision Tree Parameter Grid
param_grid = [{'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]
param_grids['dt'] = param_grid

#Random Forest Parameter Grid
param_grid = [{'clf__n_estimators': [10, 100, 1000],
               'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]
param_grids['rf'] = param_grid

#XGBoost Parameter Grid
param_grid = [{'clf__eta': [10 ** i for i in range(-4, 1)],
               'clf__gamma': [0, 10, 100],
               'clf__lambda': [10 ** i for i in range(-4, 5)]}]
param_grids['xgb'] = param_grid

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# The list of [best_score_, best_params_, best_estimator_]
best_score_param_estimators = []

# For each classifier
for name in pipe_clfs.keys():
    # GridSearchCV
    gs = GridSearchCV(estimator=pipe_clfs[name],
                      param_grid=param_grids[name],
                      scoring='accuracy',
                      n_jobs=1,
                      iid=False,
                      cv=StratifiedKFold(n_splits=10,
                                         shuffle=True,
                                         random_state=0))
    # Fit the pipeline
    gs = gs.fit(X_train, y_train)
    
    # Update best_score_param_estimators
    best_score_param_estimators.append([gs.best_score_, gs.best_params_, gs.best_estimator_])

# Sort best_score_param_estimators in descending order of the best_score_
best_score_param_estimators = sorted(best_score_param_estimators, key=lambda x : x[0], reverse=True)

# For each [best_score_, best_params_, best_estimator_]
for best_score_param_estimator in best_score_param_estimators:
    # Print out [best_score_, best_params_, best_estimator_], where best_estimator_ is a pipeline
    # Since we only print out the type of classifier of the pipeline
    print([best_score_param_estimator[0], best_score_param_estimator[1], type(best_score_param_estimator[2].named_steps['clf'])], end='\n\n')

1    5017
0    4334
Name: Generation_Increase, dtype: int64
[0.6622822112527995, {'clf__eta': 0.0001, 'clf__gamma': 0, 'clf__lambda': 0.0001}, <class 'xgboost.sklearn.XGBClassifier'>]

[0.6585361533890947, {'clf__activation': 'logistic', 'clf__hidden_layer_sizes': 100}, <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>]

[0.658110288404406, {'clf__C': 0.1, 'clf__multi_class': 'ovr', 'clf__solver': 'saga'}, <class 'sklearn.linear_model._logistic.LogisticRegression'>]

[0.6358670414552767, {'clf__min_samples_leaf': 30, 'clf__min_samples_split': 2, 'clf__n_estimators': 1000}, <class 'sklearn.ensemble._forest.RandomForestClassifier'>]

[0.6349052744640981, {'clf__min_samples_leaf': 30, 'clf__min_samples_split': 2}, <class 'sklearn.tree._classes.DecisionTreeClassifier'>]



In [66]:
y_pred = best_score_param_estimators[1][2].predict(X_test)
print(y_pred)
print(np.array(y_test))
print("Accuracy on the test set:", round((1-(np.abs(y_pred - y_test).sum()/len(y_pred)))*100,8),"percent")

[1 0 0 ... 1 0 1]
[0 0 0 ... 1 0 1]
Accuracy on the test set: 65.78272027 percent


In [68]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import KFold
# determine the number of input features
n_features = X_train.shape[1]

#Set up cross validation with guide from https://www.machinecurve.com/index.php/2020/02/18/how-to-use-k-fold-cross-validation-with-keras/
# Define the K-fold Cross Validator
kfold = KFold(n_splits=3, shuffle=True)
acc_per_fold =[]
loss_per_fold = []
# K-fold Cross Validation model evaluation
fold_no = 1
"""
for train, val in kfold.split(X_train, y_train):

  # define model
  hidden_layers = 3
  model = Sequential()
  model.add(Dense(100, activation='sigmoid', kernel_initializer='he_normal', input_shape=(n_features,)))
  for layer in range(hidden_layers):
    model.add(Dense(30, activation='relu', kernel_initializer='he_normal'))
  model.add(Dense(1, activation='sigmoid'))

  # compile the model
  model.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['accuracy'])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model
  history = model.fit(X_train.iloc[train], y_train.iloc[train],
              batch_size=32,
              epochs=150,
              verbose=0)

  # Generate generalization metrics
  scores = model.evaluate(X_train.iloc[val], y_train.iloc[val], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1
"""  
# define model without cross validation
hidden_layers = 3
model = Sequential()
model.add(Dense(100, activation='sigmoid', kernel_initializer='he_normal', input_shape=(n_features,)))
for layer in range(hidden_layers):
  model.add(Dense(30, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['accuracy'])
# fit the model
model.fit(X_train, y_train, epochs=200, batch_size=32, verbose=0)
# Generate generalization metrics
scores = model.evaluate(X_test, y_test, verbose=0)
print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
acc_per_fold.append(scores[1] * 100)
loss_per_fold.append(scores[0])

Score for fold 1: loss of 1.3033943176269531; accuracy of 62.01881766319275%


In [69]:
# evaluate the model
error_train = model.evaluate(X_train, y_train, verbose=0)
error = model.evaluate(X_test, y_test, verbose=0)

print('Training set accuracy: %.3f percent' % (error_train[1]*100))
print('Test set accuracy: %.3f percent' % (error[1]*100))

# make a prediction
yhat = model.predict(X_test)
print("Accuracy on the test set:", round((1-(np.abs(((yhat>0.50)*1).reshape(1,-1) - np.array(y_test).reshape(1,-1)).sum()/len(y_test)))*100,8),"percent")

Training set accuracy: 78.676 percent
Test set accuracy: 62.019 percent
Accuracy on the test set: 62.0188195 percent
