## This notebook uses ML methods for solar irradiance forecasting

In [None]:
# Commonly used python functions and display settings
import pandas as pd
import numpy as np
# Show floats in displayed frames with a thousands separator and two decimals
pd.options.display.float_format = '{:,.2f}'.format

from IPython.core.interactiveshell import InteractiveShell
# Echo every top-level expression of a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# `display` and `HTML` are taken from the public IPython.display module;
# importing `display` from IPython.core.display is a deprecated path.
from IPython.display import display, HTML

import warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation and convergence warnings raised by the modelling libraries.
warnings.filterwarnings("ignore") # specify to ignore warning messages

In [None]:
# Key imports for this code (various ML and Stat Models)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
import pmdarima as pm
from pmdarima import model_selection
from pmdarima import auto_arima

In [None]:
# import viz libraries
import matplotlib.pyplot as plt
import plotly
# Switch plotly into its offline notebook mode before any figure is drawn
plotly.offline.init_notebook_mode(connected=True)
# Wildcard import: every public name of plotly.graph_objs lands in the notebook namespace
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
# Same module as `plt` above, bound under a second name
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot

### Get data

In [None]:
# Read the two hourly CSV files (2016 and 2017), parsing 'datetime' as
# timestamps, and expose the 'scaled_global' column under the name 'global'.
column_aliases = {'scaled_global': 'global'}
prev_year = pd.read_csv(
    'solarTAC_hourly_2016_4_months_scaled.csv', parse_dates=['datetime']
).rename(columns=column_aliases)
solar_test = pd.read_csv(
    'solarTAC_hourly_2017_4_months_scaled.csv', parse_dates=['datetime']
).rename(columns=column_aliases)

# NOTE: The temp and weather are forecasts made one day ahead

# Preview the first and last rows of each frame
prev_year.head()
prev_year.tail()

solar_test.head()
solar_test.tail()

In [None]:
# The set of values the categorical variables take
# Distinct 'Denver_weather' labels in the test frame, then in the previous-year
# frame; comparing the two shows whether both files use the same labels.
set(solar_test['Denver_weather'])
set(prev_year['Denver_weather'])

In [None]:
# Going to make the weather variable as 4 categories
# Both frames are collapsed with the SAME label groups. The previous-year
# frame used to omit 'light snow', which would leave it as a separate category
# there (and as an extra dummy column later) while the test frame folded it
# into 'very cloudy'.
some_clouds_labels = ['broken clouds', 'few clouds', 'scattered clouds']
very_cloudy_labels = ['fog', 'haze', 'light rain', 'light snow', 'mist', 'overcast clouds']

for weather_frame in (solar_test, prev_year):
    # Replacement is applied frame-wide and in place, as before
    weather_frame.replace(to_replace=some_clouds_labels, value='some clouds', inplace=True)
    weather_frame.replace(to_replace=very_cloudy_labels, value='very cloudy', inplace=True)

In [None]:
# Creating dummy variables for categorical weather data (Test data)
# drop_first=True drops the first level of each encoded categorical column.
# NOTE(review): the train frame is encoded by a separate get_dummies call, so
# the two frames only get identical dummy columns if they contain the same
# set of category labels -- confirm.
solar_test = pd.get_dummies(solar_test, drop_first=True) 
solar_test.head()

In [None]:
# Lagged-irradiance features for the test data. Shifts are counted in rows;
# the day-based reading below assumes 8 rows per day -- TODO confirm against the CSV.
test_lag_rows = {
    'lags': 8,     # 1 day ago
    'lags-1': 9,   # 1 day & 1 hour ago
    'lag2s': 16,   # 2 days ago
}
for lag_column, n_rows in test_lag_rows.items():
    solar_test[lag_column] = solar_test['global'].shift(periods=n_rows)

# Drop every row that has a missing value; the largest shift leaves the first
# 16 rows without a complete set of lags.
solar_test.dropna(inplace=True)
solar_test.head()

In [None]:
# Creating dummy variables for categorical weather data (Train data)
# The encoded copy of prev_year becomes the training frame; prev_year itself
# is left unchanged. drop_first=True drops the first level of each encoded
# categorical column.
gb_train_data = pd.get_dummies(prev_year, drop_first=True) 
gb_train_data.head()

In [None]:
# Lagged-irradiance features for the train data. Shifts are counted in rows;
# the day-based reading below assumes 8 rows per day -- TODO confirm against the CSV.
train_lag_rows = {
    'lags': 8,     # 1 day ago
    'lags-1': 9,   # 1 day & 1 hour ago
    'lag2s': 16,   # 2 days ago
}
for lag_column, n_rows in train_lag_rows.items():
    gb_train_data[lag_column] = gb_train_data['global'].shift(periods=n_rows)

# Drop every row that has a missing value; the largest shift leaves the first
# 16 rows without a complete set of lags.
gb_train_data.dropna(inplace=True)
gb_train_data.head()

In [None]:
# One shot training based on previous year
# Features are every column except the timestamp and the target itself
X_train = gb_train_data.drop(columns=['datetime', 'global'])
y_train = gb_train_data['global']
X_train.head()

# defining the model and parameters
# random_state is pinned (same seed as the XGBoost model later in this
# notebook) so that re-running the cell reproduces the same fitted model.
gb = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=2,
    random_state=42,
)

# Asking the model to fit the training data
gb = gb.fit(X_train, y_train)

# Asking what the importance of features
gb.feature_importances_

# Same importances labelled with their feature names, largest first, so each
# value can be attributed to a column
pd.Series(gb.feature_importances_, index=X_train.columns).sort_values(ascending=False)

In [None]:
# Make forecasts using Gradient Boosting for current year

# The train and test frames were dummy-encoded by separate get_dummies calls,
# so their columns can differ in presence or order. Reindexing to the training
# columns gives the model exactly the features it was fitted on: a dummy column
# missing from the test frame is filled with 0, and a column the model never
# saw is dropped.
X_test = solar_test.drop(columns=['datetime', 'global']).reindex(
    columns=X_train.columns, fill_value=0
)
y_test = solar_test['global']

# Make predictions
y_preds = gb.predict(X_test)

# Calculate percentage and absolute errors
abs_errors = np.abs(y_test - y_preds)
# NOTE(review): dividing by y_test yields inf/NaN wherever the actual value is
# 0 -- confirm the data contains no zero-irradiance rows.
perc_errors = abs_errors / y_test

In [None]:
# Summary statistics of the absolute percentage errors, printed one per line
pct_error_summary = (
    ('Mean absolute percentage error:', np.mean(perc_errors)),
    ('Median absolute percentage error:', np.median(perc_errors)),
    ('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75)),
    ('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90)),
)
for label, value in pct_error_summary:
    print(label, value)

In [None]:
# Absolute errors expressed as a ratio of the mean 'global' value of the
# previous-year frame
avg_global = prev_year['global'].mean()
abs_ratio_summary = (
    ('Mean absolute error ratio:', np.mean(abs_errors)/avg_global),
    ('Median absolute error ratio:', np.median(abs_errors)/avg_global),
    ('75th percentile absolute error ratio:', np.percentile(abs_errors, 75)/avg_global),
    ('90th percentile absolute error ratio:', np.percentile(abs_errors, 90)/avg_global),
)
for label, value in abs_ratio_summary:
    print(label, value)

## XGB

In [None]:
# XGBoost regressor hyperparameters, gathered in one place
xgb_settings = dict(
    n_estimators=50,
    max_depth=4,
    learning_rate=0.1,
    subsample=1.0,
    min_child_weight=5.0,
    colsample_bytree=1.0,
    gamma=50.0,
    objective='reg:absoluteerror',
    random_state=42,
)
model = XGBRegressor(**xgb_settings)

# Fit on the same training features and target as the gradient-boosting model
model.fit(X_train, y_train)

In [None]:
# Forecast the test period with the fitted XGBoost model
y_preds = model.predict(X_test)

# Absolute errors first, then the same errors relative to the actual values
abs_errors = np.abs(y_test - y_preds)
perc_errors = abs_errors / y_test

In [None]:
# Summary statistics of the absolute percentage errors, printed one per line
xgb_pct_summary = {
    'Mean absolute percentage error:': np.mean(perc_errors),
    'Median absolute percentage error:': np.median(perc_errors),
    '75th percentile of absolute percentage error:': np.percentile(perc_errors, 75),
    '90th percentile of absolute percentage error:': np.percentile(perc_errors, 90),
}
for label, value in xgb_pct_summary.items():
    print(label, value)

In [None]:
# Absolute errors as a ratio of avg_global (computed in an earlier cell)
xgb_ratio_summary = {
    'Mean absolute error ratio:': np.mean(abs_errors)/avg_global,
    'Median absolute error ratio:': np.median(abs_errors)/avg_global,
    '75th percentile absolute error ratio:': np.percentile(abs_errors, 75)/avg_global,
    '90th percentile absolute error ratio:': np.percentile(abs_errors, 90)/avg_global,
}
for label, value in xgb_ratio_summary.items():
    print(label, value)

## Persistence Forecast (simply forecast the same value as the previous day at the same time)

In [None]:
# Baseline: the 'lags' feature is used directly as the forecast.
# Its absolute percentage error is computed once and summarised below.
persistence_pct_err = abs(1 - X_test['lags'] / y_test)

print('Mean absolute percentage error:', np.mean(persistence_pct_err, axis=0))
print('Median absolute percentage error:', np.median(persistence_pct_err, axis=0))
print('75th percentile of absolute percentage error:', np.percentile(persistence_pct_err, 75, axis=0))
print('90th percentile of absolute percentage error:', np.percentile(persistence_pct_err, 90, axis=0))

In [None]:
# Baseline absolute error (|lags - actual|), computed once and reported as a
# ratio of avg_global (defined in an earlier cell)
persistence_abs_err = abs(X_test['lags'] - y_test)

print('Mean absolute error ratio:', np.mean(persistence_abs_err, axis=0)/avg_global)
print('Median absolute error ratio:', np.median(persistence_abs_err, axis=0)/avg_global)
print('75th percentile absolute error ratio:', np.percentile(persistence_abs_err, 75, axis=0)/avg_global)
print('90th percentile absolute error ratio:', np.percentile(persistence_abs_err, 90, axis=0)/avg_global)