## This code combines the outputs of the individual models for the PGE data, using Year 1 (test data) and Year 2 (test data) to train and Year 3 to predict

In [1]:
# Commonly used python functions and display settings
import pandas as pd
import numpy as np
# Render floats with thousands separators and two decimals in displayed frames
pd.options.display.float_format = '{:,.2f}'.format

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several tables from a single cell.
InteractiveShell.ast_node_interactivity = "all"

import warnings
# NOTE(review): no category is given, so every warning is silenced -- including
# pandas deprecation / chained-assignment warnings that later cells may raise.
warnings.filterwarnings("ignore") # specify to ignore warning messages

In [2]:
# Key imports for this code (various ML and Stat Models)
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression

In [3]:
import constants
import helper_methods

## Train Individual Models and Generate Excel for Predictions

In [4]:
# Toggle for the two cells below: True re-runs the individual model scripts,
# False skips them.
generate_individual_predictions = False

In [5]:
# Re-run the PK, NG, NU and BY model scripts only when the toggle is enabled.
# NOTE(review): presumably each script writes the Excel workbook read later in
# this notebook -- confirm against the scripts.
if generate_individual_predictions:
    %run PK_MODEL.py
    %run NG_MODEL.py
    %run NU_MODEL.py
    %run BY_MODEL.py

In [6]:
# The SA model script is run from its own cell, behind the same toggle.
if generate_individual_predictions:
    %run SA_MODEL.py

## Get data and analyze

In [7]:
# Read the saved forecasts of every base model from Excel. For each model the
# *_TEST_1 workbook feeds the training frame, the *_TEST_2 workbook feeds the
# test frame, and the *_COL constant names that model's forecast column.
sa = constants.SA_COL
sa_train_data = pd.read_excel(constants.SA_TEST_1)
sa_test_data = pd.read_excel(constants.SA_TEST_2)

by = constants.BY_COL
by_train_data = pd.read_excel(constants.BY_TEST_1)
by_test_data = pd.read_excel(constants.BY_TEST_2)

nu = constants.NU_COL
nu_train_data = pd.read_excel(constants.NU_TEST_1)
nu_test_data = pd.read_excel(constants.NU_TEST_2)

pk = constants.PK_COL
pk_train_data = pd.read_excel(constants.PK_TEST_1)
pk_test_data = pd.read_excel(constants.PK_TEST_2)

ng = constants.NG_COL
ng_train_data = pd.read_excel(constants.NG_TEST_1)
ng_test_data = pd.read_excel(constants.NG_TEST_2)

# Forecast columns compared and blended by the later cells
methods = [pk, sa, nu, by, ng]

# Timestamp columns used as the join key
cols = ['Year', 'Month', 'Day', 'Hour']

# Timestamp, actual load and weather columns taken from the NG workbook
allcols = [
    'Year', 'Month', 'Day', 'Hour', 'Load',
    'Site-1 Temp', 'Site-2 Temp', 'Site-3 Temp', 'Site-4 Temp', 'Site-5 Temp',
    'Site-1 GHI', 'Site-2 GHI', 'Site-3 GHI', 'Site-4 GHI', 'Site-5 GHI',
]


def _attach_forecasts(base, extras):
    """Left-join every (frame, forecast column) pair in `extras` onto `base` by timestamp."""
    combined = base
    for frame, forecast_col in extras:
        combined = combined.merge(frame[cols + [forecast_col]], on=cols, how='left')
    return combined


# The NG workbook is the base; the others are attached in the order PK, BY, NU, SA
train_data = _attach_forecasts(
    ng_train_data[allcols + [ng]],
    [(pk_train_data, pk), (by_train_data, by), (nu_train_data, nu), (sa_train_data, sa)],
)
test_data = _attach_forecasts(
    ng_test_data[allcols + [ng]],
    [(pk_test_data, pk), (by_test_data, by), (nu_test_data, nu), (sa_test_data, sa)],
)

# Peek at both ends of each merged frame
train_data.head()
train_data.tail()

test_data.head()
test_data.tail()

# Row counts and per-column NaN counts for both frames
len(train_data)
len(test_data)
train_data.isna().sum()
test_data.isna().sum()

Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,Site-1 GHI,Site-2 GHI,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA
0,1,1,1,1,1997,8.0,8.2,5.3,9.4,8.1,0,0,0,0,0,1935.31,,1931.47,1905.52,
1,1,1,1,2,1921,8.3,8.6,5.2,8.6,7.1,0,0,0,0,0,1860.35,,1852.36,2100.16,
2,1,1,1,3,1861,8.1,8.8,5.1,8.7,6.2,0,0,0,0,0,1829.12,,1814.53,2077.88,
3,1,1,1,4,1833,7.6,8.1,4.3,8.5,6.0,0,0,0,0,0,1817.96,,1815.76,2062.45,
4,1,1,1,5,1847,7.3,7.5,4.0,8.6,6.9,0,0,0,0,0,1905.81,,1882.84,2007.13,


Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,Site-1 GHI,Site-2 GHI,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA
8779,1,12,31,20,2548,10.7,12.3,10.4,11.6,10.6,0,0,0,0,0,2669.7,2744.88,2620.39,2722.87,2667.68
8780,1,12,31,21,2446,9.7,11.7,8.4,11.0,12.3,0,0,0,0,0,2590.76,2647.25,2559.21,2608.06,2592.53
8781,1,12,31,22,2350,9.2,10.2,6.1,11.3,11.6,0,0,0,0,0,2438.13,2473.65,2412.72,2504.86,2474.73
8782,1,12,31,23,2227,8.4,9.5,4.2,10.5,8.5,0,0,0,0,0,2377.31,2360.37,2299.92,2388.27,2266.51
8783,1,12,31,24,2125,7.5,8.1,3.7,8.7,6.8,0,0,0,0,0,2212.78,2177.14,2195.27,2271.81,2177.12


Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,Site-1 GHI,Site-2 GHI,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA
0,2,1,1,1,2021,6.7,7.4,2.9,7.1,5.9,0,0,0,0,0,1991.57,1976.26,1989.82,1992.32,1972.31
1,2,1,1,2,1923,6.2,7.1,2.5,6.7,6.0,0,0,0,0,0,1905.37,1933.52,1900.49,1950.43,1897.57
2,2,1,1,3,1855,6.0,7.1,2.4,7.2,6.3,0,0,0,0,0,1812.8,1867.68,1833.55,1866.5,1886.81
3,2,1,1,4,1830,5.7,7.2,2.0,7.9,8.6,0,0,0,0,0,1749.44,1894.85,1781.64,1809.24,1894.87
4,2,1,1,5,1844,7.4,8.3,4.2,11.4,12.4,0,0,0,0,0,1862.8,1944.38,1811.23,1835.14,1905.08


Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,Site-1 GHI,Site-2 GHI,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA
8755,2,12,31,20,2545,12.5,11.9,10.3,12.6,11.3,0,0,0,0,0,2609.52,2556.2,2608.76,2584.44,2706.96
8756,2,12,31,21,2449,12.5,11.9,9.7,12.6,11.2,0,0,0,0,0,2509.41,2444.91,2529.14,2489.97,2529.9
8757,2,12,31,22,2348,12.5,12.0,9.7,12.7,11.1,0,0,0,0,0,2372.92,2294.96,2391.28,2334.18,2427.21
8758,2,12,31,23,2229,12.6,12.4,9.5,12.7,11.0,0,0,0,0,0,2195.36,2208.06,2227.2,2167.55,2206.03
8759,2,12,31,24,2118,12.4,12.1,9.7,12.5,10.6,0,0,0,0,0,2072.39,2008.86,2063.95,2058.77,2036.59


8784

8760

Year            0
Month           0
Day             0
Hour            0
Load            0
Site-1 Temp     0
Site-2 Temp     0
Site-3 Temp     0
Site-4 Temp     0
Site-5 Temp     0
Site-1 GHI      0
Site-2 GHI      0
Site-3 GHI      0
Site-4 GHI      0
Site-5 GHI      0
Model NG        0
Model PK       48
Model BY        0
Model NU        0
Model SA       24
dtype: int64

Year           0
Month          0
Day            0
Hour           0
Load           0
Site-1 Temp    0
Site-2 Temp    0
Site-3 Temp    0
Site-4 Temp    0
Site-5 Temp    0
Site-1 GHI     0
Site-2 GHI     0
Site-3 GHI     0
Site-4 GHI     0
Site-5 GHI     0
Model NG       0
Model PK       0
Model BY       0
Model NU       0
Model SA       0
dtype: int64

In [8]:
# Fill gaps in the PK and SA forecasts with the mean of the NG and NU forecasts
# for the same hour (NG and NU show no missing values in the NaN summary of the
# previous cell).
# The filled column is assigned back to the frame instead of calling
# `fillna(inplace=True)` on `train_data[pk]`: that form mutates a temporary
# Series and leaves the frame unchanged under pandas Copy-on-Write.
train_avg_to_impute = (train_data[ng] + train_data[nu]) / 2
train_data[pk] = train_data[pk].fillna(train_avg_to_impute)
train_data[sa] = train_data[sa].fillna(train_avg_to_impute)
train_data.head()


Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,Site-1 GHI,Site-2 GHI,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA
0,1,1,1,1,1997,8.0,8.2,5.3,9.4,8.1,0,0,0,0,0,1935.31,1920.41,1931.47,1905.52,1920.41
1,1,1,1,2,1921,8.3,8.6,5.2,8.6,7.1,0,0,0,0,0,1860.35,1980.25,1852.36,2100.16,1980.25
2,1,1,1,3,1861,8.1,8.8,5.1,8.7,6.2,0,0,0,0,0,1829.12,1953.5,1814.53,2077.88,1953.5
3,1,1,1,4,1833,7.6,8.1,4.3,8.5,6.0,0,0,0,0,0,1817.96,1940.21,1815.76,2062.45,1940.21
4,1,1,1,5,1847,7.3,7.5,4.0,8.6,6.9,0,0,0,0,0,1905.81,1956.47,1882.84,2007.13,1956.47


In [9]:
# Same imputation for the test frame: missing PK / SA forecasts are replaced by
# the mean of the NG and NU forecasts for that hour. The NaN summary above shows
# no gaps in the test frame, so this is normally a no-op kept for symmetry.
# Assigning the result back (rather than `fillna(inplace=True)` on a selected
# column) keeps the fill effective under pandas Copy-on-Write.
test_avg_to_impute = (test_data[ng] + test_data[nu]) / 2
test_data[pk] = test_data[pk].fillna(test_avg_to_impute)
test_data[sa] = test_data[sa].fillna(test_avg_to_impute)
test_data.head()

Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,Site-1 GHI,Site-2 GHI,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA
0,2,1,1,1,2021,6.7,7.4,2.9,7.1,5.9,0,0,0,0,0,1991.57,1976.26,1989.82,1992.32,1972.31
1,2,1,1,2,1923,6.2,7.1,2.5,6.7,6.0,0,0,0,0,0,1905.37,1933.52,1900.49,1950.43,1897.57
2,2,1,1,3,1855,6.0,7.1,2.4,7.2,6.3,0,0,0,0,0,1812.8,1867.68,1833.55,1866.5,1886.81
3,2,1,1,4,1830,5.7,7.2,2.0,7.9,8.6,0,0,0,0,0,1749.44,1894.85,1781.64,1809.24,1894.87
4,2,1,1,5,1844,7.4,8.3,4.2,11.4,12.4,0,0,0,0,0,1862.8,1944.38,1811.23,1835.14,1905.08


## Classify to get the best among the 5 methods

In [10]:
# Function to find the best method and its corresponding prediction
def find_best_method(row, candidates=None):
    """Return the forecast closest to the actual load for a single row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Load' and one value per candidate column name.
    candidates : iterable of str, optional
        Forecast column names to compare. Defaults to the notebook-level
        `methods` list.

    Returns
    -------
    tuple
        ``(best_method, best_prediction)``. When no candidate has a usable
        (non-NaN) absolute error, ``(None, nan)`` is returned.

    Notes
    -----
    Ties on absolute error are resolved in favour of the lexicographically
    greatest column name, not the position of the column in `candidates`.
    """
    if candidates is None:
        candidates = methods

    # Skip NaN errors: NaN never compares greater or smaller, so a missing
    # forecast listed first would otherwise be returned as the "best" one.
    errors = {}
    for name in candidates:
        error = abs(row[name] - row['Load'])
        if not pd.isna(error):
            errors[name] = error

    if not errors:
        return None, float('nan')

    # Smallest error wins; equal errors fall back to the greatest name
    best_method = max(errors, key=lambda name: (-errors[name], name))
    return best_method, row[best_method]

def _best_as_series(row):
    """Wrap the (method, prediction) pair of one row in a Series so apply() expands it."""
    return pd.Series(find_best_method(row))


# Row by row, record which forecast was closest to the load and its value
best_columns = train_data.apply(_best_as_series, axis=1)
train_data[['best', 'best_pred']] = best_columns

train_data.head()
train_data.tail()

Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,...,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA,best,best_pred
0,1,1,1,1,1997,8.0,8.2,5.3,9.4,8.1,...,0,0,0,1935.31,1920.41,1931.47,1905.52,1920.41,Model NG,1935.31
1,1,1,1,2,1921,8.3,8.6,5.2,8.6,7.1,...,0,0,0,1860.35,1980.25,1852.36,2100.16,1980.25,Model SA,1980.25
2,1,1,1,3,1861,8.1,8.8,5.1,8.7,6.2,...,0,0,0,1829.12,1953.5,1814.53,2077.88,1953.5,Model NG,1829.12
3,1,1,1,4,1833,7.6,8.1,4.3,8.5,6.0,...,0,0,0,1817.96,1940.21,1815.76,2062.45,1940.21,Model NG,1817.96
4,1,1,1,5,1847,7.3,7.5,4.0,8.6,6.9,...,0,0,0,1905.81,1956.47,1882.84,2007.13,1956.47,Model BY,1882.84


Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,...,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA,best,best_pred
8779,1,12,31,20,2548,10.7,12.3,10.4,11.6,10.6,...,0,0,0,2669.7,2744.88,2620.39,2722.87,2667.68,Model BY,2620.39
8780,1,12,31,21,2446,9.7,11.7,8.4,11.0,12.3,...,0,0,0,2590.76,2647.25,2559.21,2608.06,2592.53,Model BY,2559.21
8781,1,12,31,22,2350,9.2,10.2,6.1,11.3,11.6,...,0,0,0,2438.13,2473.65,2412.72,2504.86,2474.73,Model BY,2412.72
8782,1,12,31,23,2227,8.4,9.5,4.2,10.5,8.5,...,0,0,0,2377.31,2360.37,2299.92,2388.27,2266.51,Model SA,2266.51
8783,1,12,31,24,2125,7.5,8.1,3.7,8.7,6.8,...,0,0,0,2212.78,2177.14,2195.27,2271.81,2177.12,Model SA,2177.12


In [11]:
# Integer label of every base model: its position in `methods`
class_mapping = {name: position for position, name in enumerate(methods)}

# Encode the per-row winner so it can serve as the classification target
train_data['best_encoded'] = train_data['best'].map(class_mapping)


In [12]:
# Classifier inputs: drop the identifiers, the actual load and every base-model
# forecast (plus, for the training frame, the label columns). The encoded best
# model is the training target.
dropped_everywhere = ['Year', 'Day', 'Load']
label_cols = ['best_pred', 'best', 'best_encoded']

X_train = train_data.drop(columns=dropped_everywhere + label_cols + methods)
y_train = train_data['best_encoded']
X_test = test_data.drop(columns=dropped_everywhere + methods)

In [13]:
# Define the XGBoost classifier that predicts which base model is best.
# `y_train` holds integer class labels (the index of the winning model in
# `methods`), so a multi-class objective is stated explicitly; a regression
# loss such as 'reg:squarederror' does not describe this task.
model = XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=1.0,
        colsample_bytree=0.6,
        objective='multi:softprob',
        random_state=42
    )

# Train the model
model.fit(X_train, y_train)

In [14]:
# Predict the encoded best-model label for every row of the test frame
y_preds = model.predict(X_test)
y_preds

array([2, 4, 3, ..., 3, 0, 0])

In [15]:
# Store the predicted label on the test frame; on the training frame the
# in-sample prediction goes to a separate column so the true label is kept.
test_data['best_encoded'] = y_preds
train_data['best_encoded_fcst'] = model.predict(X_train)
test_data.head()

Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,...,Site-2 GHI,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA,best_encoded
0,2,1,1,1,2021,6.7,7.4,2.9,7.1,5.9,...,0,0,0,0,1991.57,1976.26,1989.82,1992.32,1972.31,2
1,2,1,1,2,1923,6.2,7.1,2.5,6.7,6.0,...,0,0,0,0,1905.37,1933.52,1900.49,1950.43,1897.57,4
2,2,1,1,3,1855,6.0,7.1,2.4,7.2,6.3,...,0,0,0,0,1812.8,1867.68,1833.55,1866.5,1886.81,3
3,2,1,1,4,1830,5.7,7.2,2.0,7.9,8.6,...,0,0,0,0,1749.44,1894.85,1781.64,1809.24,1894.87,3
4,2,1,1,5,1844,7.4,8.3,4.2,11.4,12.4,...,0,0,0,0,1862.8,1944.38,1811.23,1835.14,1905.08,3


In [16]:
# Integer label -> forecast column name (the inverse of class_mapping)
inverse_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))

# Translate the predicted labels back into forecast column names
test_data['best'] = test_data['best_encoded'].map(inverse_mapping)
train_data['best_fcst'] = train_data['best_encoded_fcst'].map(inverse_mapping)
test_data.head()

Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,...,Site-3 GHI,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA,best_encoded,best
0,2,1,1,1,2021,6.7,7.4,2.9,7.1,5.9,...,0,0,0,1991.57,1976.26,1989.82,1992.32,1972.31,2,Model NU
1,2,1,1,2,1923,6.2,7.1,2.5,6.7,6.0,...,0,0,0,1905.37,1933.52,1900.49,1950.43,1897.57,4,Model NG
2,2,1,1,3,1855,6.0,7.1,2.4,7.2,6.3,...,0,0,0,1812.8,1867.68,1833.55,1866.5,1886.81,3,Model BY
3,2,1,1,4,1830,5.7,7.2,2.0,7.9,8.6,...,0,0,0,1749.44,1894.85,1781.64,1809.24,1894.87,3,Model BY
4,2,1,1,5,1844,7.4,8.3,4.2,11.4,12.4,...,0,0,0,1862.8,1944.38,1811.23,1835.14,1905.08,3,Model BY


In [17]:
def _forecast_of(choice_col):
    """Build a row function returning the forecast named by the row's `choice_col` entry."""
    def _lookup(row):
        return row[row[choice_col]]
    return _lookup


# For every row, copy the forecast of the model the classifier selected
test_data['best_pred'] = test_data.apply(_forecast_of('best'), axis=1)
train_data['best_pred_fcst'] = train_data.apply(_forecast_of('best_fcst'), axis=1)
test_data.head()

Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,...,Site-4 GHI,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA,best_encoded,best,best_pred
0,2,1,1,1,2021,6.7,7.4,2.9,7.1,5.9,...,0,0,1991.57,1976.26,1989.82,1992.32,1972.31,2,Model NU,1992.32
1,2,1,1,2,1923,6.2,7.1,2.5,6.7,6.0,...,0,0,1905.37,1933.52,1900.49,1950.43,1897.57,4,Model NG,1905.37
2,2,1,1,3,1855,6.0,7.1,2.4,7.2,6.3,...,0,0,1812.8,1867.68,1833.55,1866.5,1886.81,3,Model BY,1833.55
3,2,1,1,4,1830,5.7,7.2,2.0,7.9,8.6,...,0,0,1749.44,1894.85,1781.64,1809.24,1894.87,3,Model BY,1781.64
4,2,1,1,5,1844,7.4,8.3,4.2,11.4,12.4,...,0,0,1862.8,1944.38,1811.23,1835.14,1905.08,3,Model BY,1811.23


## Regression to get a weighted sum of the 5 predictions

In [18]:
# Inputs are the base-model forecasts themselves; the target is the actual load
X_train, X_test = train_data[methods], test_data[methods]
y_train, y_test = train_data['Load'], test_data['Load']

In [19]:
# Linear blend of the base forecasts; with fit_intercept=False the prediction is
# a pure weighted sum of the input columns, with no constant term.
model = LinearRegression(fit_intercept = False)
model.fit(X_train, y_train) 

# R-squared of the blend on the training data
model.score(X_train, y_train) 

# Fitted weights, one per column of X_train (i.e. in the order of `methods`)
model.coef_

0.8466731194497951

array([0.06415487, 0.58101612, 0.09399755, 0.22459056, 0.02390409])

In [20]:
# Blended forecast for the test frame (the next cell computes the same values
# again when it stores `weighted_pred`)
y_pred = model.predict(X_test)

In [21]:
# Store the blended forecast on both frames
train_data['weighted_pred'] = model.predict(X_train)
test_data['weighted_pred'] = model.predict(X_test)

In [22]:
# First rows of both frames, including the columns added so far
train_data.head()
test_data.head()

Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,...,Model BY,Model NU,Model SA,best,best_pred,best_encoded,best_encoded_fcst,best_fcst,best_pred_fcst,weighted_pred
0,1,1,1,1,1997,8.0,8.2,5.3,9.4,8.1,...,1931.47,1905.52,1920.41,Model NG,1935.31,4,4,Model NG,1935.31,1898.16
1,1,1,1,2,1921,8.3,8.6,5.2,8.6,7.1,...,1852.36,2100.16,1980.25,Model SA,1980.25,1,4,Model NG,1860.35,1935.5
2,1,1,1,3,1861,8.1,8.8,5.1,8.7,6.2,...,1814.53,2077.88,1953.5,Model NG,1829.12,4,4,Model NG,1829.12,1906.91
3,1,1,1,4,1833,7.6,8.1,4.3,8.5,6.0,...,1815.76,2062.45,1940.21,Model NG,1817.96,4,4,Model NG,1817.96,1896.89
4,1,1,1,5,1847,7.3,7.5,4.0,8.6,6.9,...,1882.84,2007.13,1956.47,Model BY,1882.84,3,3,Model BY,1882.84,1919.34


Unnamed: 0,Year,Month,Day,Hour,Load,Site-1 Temp,Site-2 Temp,Site-3 Temp,Site-4 Temp,Site-5 Temp,...,Site-5 GHI,Model NG,Model PK,Model BY,Model NU,Model SA,best_encoded,best,best_pred,weighted_pred
0,2,1,1,1,2021,6.7,7.4,2.9,7.1,5.9,...,0,1991.57,1976.26,1989.82,1992.32,1972.31,2,Model NU,1992.32,1954.51
1,2,1,1,2,1923,6.2,7.1,2.5,6.7,6.0,...,0,1905.37,1933.52,1900.49,1950.43,1897.57,4,Model NG,1905.37,1882.28
2,2,1,1,3,1855,6.0,7.1,2.4,7.2,6.3,...,0,1812.8,1867.68,1833.55,1866.5,1886.81,3,Model BY,1833.55,1846.66
3,2,1,1,4,1830,5.7,7.2,2.0,7.9,8.6,...,0,1749.44,1894.85,1781.64,1809.24,1894.87,3,Model BY,1781.64,1834.54
4,2,1,1,5,1844,7.4,8.3,4.2,11.4,12.4,...,0,1862.8,1944.38,1811.23,1835.14,1905.08,3,Model BY,1811.23,1855.44


## Residual based method using the best pred so far

In [23]:
# Residual target: actual load minus the PK forecast
train_data['Residuals'] = train_data['Load'] - train_data[pk] # old sa, by

# Calendar and weather inputs shared by the training and test matrices
residual_features = [
    'Month', 'Hour',
    'Site-1 Temp', 'Site-2 Temp', 'Site-3 Temp', 'Site-4 Temp', 'Site-5 Temp',
    'Site-1 GHI', 'Site-2 GHI', 'Site-3 GHI', 'Site-4 GHI', 'Site-5 GHI',
]
X_train = train_data[residual_features]
y_train = train_data['Residuals']
X_test = test_data[residual_features]

# Hyperparameters of the XGBoost regressor that learns the residual
residual_params = dict(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=1.0,
    colsample_bytree=0.6,
    objective='reg:squarederror',
    random_state=42,
)
model = XGBRegressor(**residual_params)
# Fit the residual model
model.fit(X_train, y_train)

In [24]:
# Predicted residual for the test frame, then residual + PK forecast gives the
# corrected forecast `Res_pred` on both frames.
test_data['Residuals'] = model.predict(X_test)
test_data['Res_pred'] = test_data['Residuals'] + test_data[pk] # old sa, by
train_data['Res_pred'] = model.predict(X_train) + train_data[pk]

## ML-based corrections for best pred so far

In [25]:
# Inputs: calendar and weather columns plus the PK forecast itself; the target
# is the actual load, so the model learns a corrected forecast directly.
adjust_features = [
    'Month', 'Hour',
    'Site-1 Temp', 'Site-2 Temp', 'Site-3 Temp', 'Site-4 Temp', 'Site-5 Temp',
    'Site-1 GHI', 'Site-2 GHI', 'Site-3 GHI', 'Site-4 GHI', 'Site-5 GHI',
] + [pk] # old sa, by
X_train = train_data[adjust_features]
y_train = train_data['Load']
X_test = test_data[adjust_features]

# Hyperparameters of the XGBoost regressor that learns the corrected forecast
adjust_params = dict(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=1.0,
    colsample_bytree=0.6,
    objective='reg:squarederror',
    random_state=42,
)
model = XGBRegressor(**adjust_params)
# Fit the correction model
model.fit(X_train, y_train)

In [26]:
# Store the corrected forecast of the model fitted above on both frames
test_data['Adj_pred'] = model.predict(X_test)
train_data['Adj_pred'] = model.predict(X_train)

## Comparing Predictions

In [27]:
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, mean_squared_error

# Columns to score. The two lists differ only in the classifier-based column:
# the test frame uses 'best_pred', the training frame uses 'best_pred_fcst'.
combined_pred_columns = ['weighted_pred', 'Res_pred', 'Adj_pred']
prediction_columns = methods + ['best_pred'] + combined_pred_columns
train_pred_columns = methods + ['best_pred_fcst'] + combined_pred_columns

# Function to compute metrics
def compute_metrics(df, predictions, ground_truth):
    """Score each prediction column of `df` against the `ground_truth` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ground-truth column and every prediction column.
    predictions : iterable of str
        Names of the prediction columns to evaluate.
    ground_truth : str
        Name of the column with the actual values.

    Returns
    -------
    pandas.DataFrame
        One row per prediction column with the columns 'Prediction', 'MAE',
        'MedAE', 'R-squared', 'RMSE' and 'MAPE' (MAPE is in percent).
    """
    header = ['Prediction', 'MAE', 'MedAE', 'R-squared', 'RMSE', 'MAPE']

    def _score(col):
        """Return the metric row for a single prediction column."""
        actual = df[ground_truth]
        forecast = df[col]
        return [
            col,
            mean_absolute_error(actual, forecast),
            median_absolute_error(actual, forecast),
            r2_score(actual, forecast),
            np.sqrt(mean_squared_error(actual, forecast)),
            np.mean(np.abs((actual - forecast) / actual)) * 100,  # percent
        ]

    return pd.DataFrame([_score(col) for col in predictions], columns=header)

# Score the training frame (printed here) and the test frame (shown in a later cell)
train_metrics_df = compute_metrics(df=train_data, predictions=train_pred_columns, ground_truth='Load')
print(train_metrics_df)
test_metrics_df = compute_metrics(df=test_data, predictions=prediction_columns, ground_truth='Load')

       Prediction    MAE  MedAE  R-squared   RMSE  MAPE
0        Model PK 142.31 111.12       0.83 194.07  6.81
1        Model SA 145.02 114.64       0.83 191.10  7.06
2        Model NU 144.40 109.40       0.81 201.65  6.84
3        Model BY 141.95 107.99       0.81 201.48  6.68
4        Model NG 142.15 107.35       0.81 200.43  6.70
5  best_pred_fcst 104.91  75.28       0.89 151.33  5.04
6   weighted_pred 133.24 102.65       0.85 182.37  6.34
7        Res_pred  77.38  58.76       0.95 102.98  3.72
8        Adj_pred  73.62  55.66       0.96  98.63  3.54


In [28]:
# Training metrics (at most 12 rows), then the same table passed to the LaTeX helper.
# NOTE(review): generate_results_latex lives in helper_methods; the output below
# suggests it emits one '&'-separated row per prediction -- confirm in the helper.
train_metrics_df.head(12)
helper_methods.generate_results_latex(train_metrics_df)

Unnamed: 0,Prediction,MAE,MedAE,R-squared,RMSE,MAPE
0,Model PK,142.31,111.12,0.83,194.07,6.81
1,Model SA,145.02,114.64,0.83,191.1,7.06
2,Model NU,144.4,109.4,0.81,201.65,6.84
3,Model BY,141.95,107.99,0.81,201.48,6.68
4,Model NG,142.15,107.35,0.81,200.43,6.7
5,best_pred_fcst,104.91,75.28,0.89,151.33,5.04
6,weighted_pred,133.24,102.65,0.85,182.37,6.34
7,Res_pred,77.38,58.76,0.95,102.98,3.72
8,Adj_pred,73.62,55.66,0.96,98.63,3.54


Model PK & 142.31 & 111.12 & 0.83 & 194.07 & 6.81 \\
Model SA & 145.02 & 114.64 & 0.83 & 191.10 & 7.06 \\
Model NU & 144.40 & 109.40 & 0.81 & 201.65 & 6.84 \\
Model BY & 141.95 & 107.99 & 0.81 & 201.48 & 6.68 \\
Model NG & 142.15 & 107.35 & 0.81 & 200.43 & 6.70 \\
best_pred_fcst & 104.91 & 75.28 & 0.89 & 151.33 & 5.04 \\
weighted_pred & 133.24 & 102.65 & 0.85 & 182.37 & 6.34 \\
Res_pred & 77.38 & 58.76 & 0.95 & 102.98 & 3.72 \\
Adj_pred & 73.62 & 55.66 & 0.96 & 98.63 & 3.54 \\


In [29]:
# Test metrics (at most 12 rows), then the same table passed to the LaTeX helper
test_metrics_df.head(12)
helper_methods.generate_results_latex(test_metrics_df)


Unnamed: 0,Prediction,MAE,MedAE,R-squared,RMSE,MAPE
0,Model PK,126.57,98.44,0.83,167.47,6.03
1,Model SA,135.2,103.01,0.8,180.8,6.46
2,Model NU,134.65,104.88,0.81,178.54,6.46
3,Model BY,136.72,103.51,0.79,186.47,6.56
4,Model NG,133.53,102.87,0.8,179.89,6.41
5,best_pred,131.28,101.88,0.81,175.17,6.31
6,weighted_pred,136.74,110.04,0.81,179.1,6.47
7,Res_pred,177.35,154.13,0.69,225.15,8.55
8,Adj_pred,152.76,129.9,0.77,195.85,7.32


Model PK & 126.57 & 98.44 & 0.83 & 167.47 & 6.03 \\
Model SA & 135.20 & 103.01 & 0.80 & 180.80 & 6.46 \\
Model NU & 134.65 & 104.88 & 0.81 & 178.54 & 6.46 \\
Model BY & 136.72 & 103.51 & 0.79 & 186.47 & 6.56 \\
Model NG & 133.53 & 102.87 & 0.80 & 179.89 & 6.41 \\
best_pred & 131.28 & 101.88 & 0.81 & 175.17 & 6.31 \\
weighted_pred & 136.74 & 110.04 & 0.81 & 179.10 & 6.47 \\
Res_pred & 177.35 & 154.13 & 0.69 & 225.15 & 8.55 \\
Adj_pred & 152.76 & 129.90 & 0.77 & 195.85 & 7.32 \\


In [30]:
methods # Forecast columns compared for best_pred and blended for weighted_pred

['Model PK', 'Model SA', 'Model NU', 'Model BY', 'Model NG']