## This code uses statistical and ML methods to forecast avocado prices and sales volumes

In [None]:
# Commonly used python functions and display settings
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML

import warnings
warnings.filterwarnings("ignore") # specify to ignore warning messages

In [None]:
# Key imports for this code (various ML and Stat Models)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.api import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
import pmdarima as pm
from pmdarima import model_selection
from pmdarima import auto_arima

In [None]:
# import viz libraries
import matplotlib.pyplot as plt
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot

### Get data

In [None]:
# Load the weekly avocado sales and the temperature readings
all_types_weekly_df = pd.read_csv("../Datasets/avocado.csv", parse_dates=["Date"])
temp_df = pd.read_csv("../Datasets/temperature.csv", parse_dates=["datetime"])

# Keep conventional avocados only. The explicit copy makes weekly_df an
# independent frame, so the region fix below is a plain column assignment
# rather than a write into a filtered selection of all_types_weekly_df.
weekly_df = all_types_weekly_df[all_types_weekly_df['type'] == 'conventional'].copy()
# Spell the region the same way as the entry in `cities` below, which is also
# the name used to look up the city's temperature column
weekly_df["region"] = weekly_df["region"].replace("SanDiego", "San Diego")

# check how the headers look
weekly_df.head()
temp_df.head()

# Restrict the weekly avocado data to the specified cities
cities = ["Atlanta", "Boston", "Houston", "San Diego"]
weekly_df = weekly_df[weekly_df["region"].isin(cities)].copy()

# Truncate every temperature timestamp to midnight so it can be compared with the weekly dates
temp_df["Date"] = pd.to_datetime(temp_df["datetime"]).dt.normalize()

# Remove weekly entries beyond available temperature data
latest_temp_date = temp_df["Date"].max()
weekly_df = weekly_df[weekly_df["Date"] <= latest_temp_date]

# Initialize a list to collect results
merged_data = []

# Process each row in weekly_df
for _, row in weekly_df.iterrows():
    city = row["region"]  # Get the specific city for the row
    start_date = row["Date"]
    end_date = start_date + pd.Timedelta(days=6)  # inclusive 7-day window starting at the row's date

    # Compute weekly statistics for the specific city
    if city in temp_df.columns:
        # Non-missing readings of this city's column that fall inside the window
        city_weekly_temps = temp_df[(temp_df["Date"] >= start_date) &
                                    (temp_df["Date"] <= end_date)][city].dropna()
        temp_stats = {
            "min_temp": city_weekly_temps.min(),
            "max_temp": city_weekly_temps.max(),
            "avg_temp": city_weekly_temps.mean(),
            "stdev_temp": city_weekly_temps.std(),
        }
    else:
        # No temperature column for this city: keep the row but leave the statistics empty
        temp_stats = {"min_temp": None, "max_temp": None, "avg_temp": None, "stdev_temp": None}

    # Append row with computed statistics
    merged_data.append({**row.to_dict(), **temp_stats})

# Convert results to DataFrame and sort by region and date
final_df = pd.DataFrame(merged_data).sort_values(by=["region", "Date"]).reset_index(drop=True)
final_df.head()


In [None]:
# Creating a plot of Weekly Average Price of Avocado in Atlanta
plot_data = []
atl_df = final_df[final_df['region'] == 'Atlanta']
# Both axes come from atl_df so every date is paired with the price from the same row
# (final_df['Date'] would contain the dates of every region)
plot_data.append(go.Scatter(x= atl_df['Date'], y= atl_df['AveragePrice']))
layout = go.Layout(xaxis = dict(title='Date'), yaxis = dict(title= 'Average Price of Avocado'), 
                   title = 'Time Series of Average Price of Avocado')
fig = go.Figure(data= plot_data, layout=layout)

plotly.offline.iplot(fig)

In [None]:
# Creating a plot of Total Volume of avocado in Atlanta
plot_data = []
atl_df = final_df[final_df['region'] == 'Atlanta']
# Both axes come from atl_df so every date is paired with the volume from the same row
# (final_df['Date'] would contain the dates of every region)
plot_data.append(go.Scatter(x= atl_df['Date'], y= atl_df['Total Volume']))
layout = go.Layout(xaxis = dict(title='Date'), yaxis = dict(title= 'Total Volume of Avocado'), 
                   title = 'Time Series of Total Volume of Avocado')
fig = go.Figure(data= plot_data, layout=layout)

plotly.offline.iplot(fig)

In [None]:
# Autocorrelation plots (first 40 lags) of the Atlanta price and volume series
price_values = atl_df['AveragePrice'].values.squeeze()
volume_values = atl_df['Total Volume'].values.squeeze()
sm.graphics.tsa.plot_acf(price_values, lags=40)
sm.graphics.tsa.plot_acf(volume_values, lags=40)

In [None]:
# Partial autocorrelation plots (first 40 lags) of the Atlanta price and volume series
price_values = atl_df['AveragePrice'].values.squeeze()
volume_values = atl_df['Total Volume'].values.squeeze()
sm.graphics.tsa.plot_pacf(price_values, lags=40)
sm.graphics.tsa.plot_pacf(volume_values, lags=40)

In [None]:
# Rolling evaluation of exponential smoothing on the Atlanta average price
time_series = atl_df['AveragePrice']

h = 1    # number of steps forecast from each window
T = 100  # number of observations in each training window

# One entry per rolling window
perc_error_list, abs_error_list = [], []

# Forecasts are gathered here and joined to the zero prefix once the loop is done
es_forecasts = []

n_windows = len(time_series) - T - h
for i in range(n_windows):
    # Training window [i, i+T) followed by the h observations to forecast
    split = i + T
    train = time_series.iloc[i:split].values
    test = time_series.iloc[split:split + h].values

    # Exponential smoothing with an additive trend
    model = ExponentialSmoothing(train, trend='add')
    fit_model = model.fit()

    # Keep only the forecast for the last step of the horizon
    pred_list = fit_model.forecast(h)
    preds = pred_list[h-1]

    # Absolute error and its size relative to the observed value
    actual = test[h-1]
    abs_errors = np.abs(actual - preds)
    perc_errors = abs_errors / actual

    abs_error_list.append(abs_errors)
    perc_error_list.append(perc_errors)
    es_forecasts.append(preds)

# T+h leading zeros followed by the forecasts (in case we wish to use the ES predictions)
es_preds_train = np.concatenate([np.zeros(T+h), es_forecasts])

In [None]:
# Summary statistics of the absolute percentage errors currently held in perc_error_list
perc_error_summary = [
    ('Mean absolute percentage error:', np.mean(perc_error_list, axis = 0)),
    ('Median absolute percentage error:', np.median(perc_error_list, axis = 0)),
    ('75th percentile of absolute percentage error:', np.percentile(perc_error_list, 75, axis = 0)),
    ('90th percentile of absolute percentage error:', np.percentile(perc_error_list, 90, axis = 0)),
]
for label, value in perc_error_summary:
    print(label, value)

In [None]:
# Absolute errors in abs_error_list, scaled by the mean Atlanta price from position T+h onwards
avg_global = atl_df['AveragePrice'].iloc[T+h:].mean()
abs_error_summary = [
    ('Mean absolute error ratio:', np.mean(abs_error_list, axis = 0)),
    ('Median absolute error ratio:', np.median(abs_error_list, axis = 0)),
    ('75th percentile absolute error ratio:', np.percentile(abs_error_list, 75, axis = 0)),
    ('90th percentile absolute error ratio:', np.percentile(abs_error_list, 90, axis = 0)),
]
for label, value in abs_error_summary:
    print(label, value/avg_global)

In [None]:
# Rolling evaluation of exponential smoothing on the Atlanta total volume
time_series = atl_df['Total Volume']

h = 1    # number of steps forecast from each window
T = 100  # number of observations in each training window

# One entry per rolling window
perc_error_list, abs_error_list = [], []

# Forecasts are gathered here and joined to the zero prefix once the loop is done
es_forecasts = []

n_windows = len(time_series) - T - h
for i in range(n_windows):
    # Training window [i, i+T) followed by the h observations to forecast
    split = i + T
    train = time_series.iloc[i:split].values
    test = time_series.iloc[split:split + h].values

    # Exponential smoothing with an additive trend
    model = ExponentialSmoothing(train, trend='add')
    fit_model = model.fit()

    # Keep only the forecast for the last step of the horizon
    pred_list = fit_model.forecast(h)
    preds = pred_list[h-1]

    # Absolute error and its size relative to the observed value
    actual = test[h-1]
    abs_errors = np.abs(actual - preds)
    perc_errors = abs_errors / actual

    abs_error_list.append(abs_errors)
    perc_error_list.append(perc_errors)
    es_forecasts.append(preds)

# T+h leading zeros followed by the forecasts (in case we wish to use the ES predictions)
es_preds_train = np.concatenate([np.zeros(T+h), es_forecasts])

In [None]:
# Summary statistics of the absolute percentage errors currently held in perc_error_list
perc_error_summary = [
    ('Mean absolute percentage error:', np.mean(perc_error_list, axis = 0)),
    ('Median absolute percentage error:', np.median(perc_error_list, axis = 0)),
    ('75th percentile of absolute percentage error:', np.percentile(perc_error_list, 75, axis = 0)),
    ('90th percentile of absolute percentage error:', np.percentile(perc_error_list, 90, axis = 0)),
]
for label, value in perc_error_summary:
    print(label, value)

In [None]:
# Absolute errors in abs_error_list, scaled by the mean Atlanta volume from position T+h onwards
avg_global = atl_df['Total Volume'].iloc[T+h:].mean()
abs_error_summary = [
    ('Mean absolute error ratio:', np.mean(abs_error_list, axis = 0)),
    ('Median absolute error ratio:', np.median(abs_error_list, axis = 0)),
    ('75th percentile absolute error ratio:', np.percentile(abs_error_list, 75, axis = 0)),
    ('90th percentile absolute error ratio:', np.percentile(abs_error_list, 90, axis = 0)),
]
for label, value in abs_error_summary:
    print(label, value/avg_global)

In [None]:
# Estimate the differencing order from the first T observations of each Atlanta series
price_head = atl_df['AveragePrice'].iloc[:T]
d_opt = pm.arima.ndiffs(price_head)
d_opt
volume_head = atl_df['Total Volume'].iloc[:T]
d_opt = pm.arima.ndiffs(volume_head)
d_opt

In [None]:
# Rolling evaluation of an ARIMA model on the Atlanta average price
time_series = atl_df['AveragePrice']

h = 1    # number of steps forecast from each window
T = 100  # number of observations in each training window

# Using a specified order (this would need to be fine-tuned)
order = (2, 0, 1)
# seasonal_order = (1, 0, 0, seasonal_periods)

# One entry per rolling window
ar_perc_error_list, ar_abs_error_list = [], []

# Forecasts are gathered here and joined to the zero prefix once the loop is done
ar_forecasts = []

n_windows = len(time_series) - T - h
for i in range(n_windows):
    # Training window [i, i+T) followed by the h observations to forecast
    split = i + T
    train = time_series.iloc[i:split].values
    test = time_series.iloc[split:split + h].values

    # Fit the model without exogenous regressors or a seasonal component
    model = SARIMAX(endog=train, exog=None, order=order, seasonal_order=None)
    fit_model = model.fit(disp=False)

    # Forecast as many steps as there are test observations; keep the last step of the horizon
    pred_list = fit_model.forecast(steps=len(test), exog=None)
    preds = pred_list[h-1]

    # Absolute error and its size relative to the observed value
    actual = test[h-1]
    ar_abs_errors = np.abs(actual - preds)
    ar_perc_errors = ar_abs_errors / actual

    ar_abs_error_list.append(ar_abs_errors)
    ar_perc_error_list.append(ar_perc_errors)
    ar_forecasts.append(preds)

# T+h leading zeros followed by the forecasts (in case we wish to use the ARIMA predictions)
ar_preds_train = np.concatenate([np.zeros(T+h), ar_forecasts])

In [None]:
# Summary statistics of the absolute percentage errors currently held in ar_perc_error_list
ar_perc_error_summary = [
    ('Mean absolute percentage error:', np.mean(ar_perc_error_list, axis = 0)),
    ('Median absolute percentage error:', np.median(ar_perc_error_list, axis = 0)),
    ('75th percentile of absolute percentage error:', np.percentile(ar_perc_error_list, 75, axis = 0)),
    ('90th percentile of absolute percentage error:', np.percentile(ar_perc_error_list, 90, axis = 0)),
]
for label, value in ar_perc_error_summary:
    print(label, value)

In [None]:
# Absolute errors in ar_abs_error_list, scaled by the mean Atlanta price from position T+h onwards
avg_global = atl_df['AveragePrice'].iloc[T+h:].mean()
ar_abs_error_summary = [
    ('Mean absolute error ratio:', np.mean(ar_abs_error_list, axis = 0)),
    ('Median absolute error ratio:', np.median(ar_abs_error_list, axis = 0)),
    ('75th percentile absolute error ratio:', np.percentile(ar_abs_error_list, 75, axis = 0)),
    ('90th percentile absolute error ratio:', np.percentile(ar_abs_error_list, 90, axis = 0)),
]
for label, value in ar_abs_error_summary:
    print(label, value/avg_global)

In [None]:
# Rolling evaluation of an ARIMA model on the Atlanta total volume
time_series = atl_df['Total Volume']

h = 1    # number of steps forecast from each window
T = 100  # number of observations in each training window

# Using a specified order (this would need to be fine-tuned)
order = (2, 1, 1)
# seasonal_order = (1, 0, 0, seasonal_periods)

# One entry per rolling window
ar_perc_error_list, ar_abs_error_list = [], []

# Forecasts are gathered here and joined to the zero prefix once the loop is done
ar_forecasts = []

n_windows = len(time_series) - T - h
for i in range(n_windows):
    # Training window [i, i+T) followed by the h observations to forecast
    split = i + T
    train = time_series.iloc[i:split].values
    test = time_series.iloc[split:split + h].values

    # Fit the model without exogenous regressors or a seasonal component
    model = SARIMAX(endog=train, exog=None, order=order, seasonal_order=None)
    fit_model = model.fit(disp=False)

    # Forecast as many steps as there are test observations; keep the last step of the horizon
    pred_list = fit_model.forecast(steps=len(test), exog=None)
    preds = pred_list[h-1]

    # Absolute error and its size relative to the observed value
    actual = test[h-1]
    ar_abs_errors = np.abs(actual - preds)
    ar_perc_errors = ar_abs_errors / actual

    ar_abs_error_list.append(ar_abs_errors)
    ar_perc_error_list.append(ar_perc_errors)
    ar_forecasts.append(preds)

# T+h leading zeros followed by the forecasts (in case we wish to use the ARIMA predictions)
ar_preds_train = np.concatenate([np.zeros(T+h), ar_forecasts])

In [None]:
# Summary statistics of the absolute percentage errors currently held in ar_perc_error_list
ar_perc_error_summary = [
    ('Mean absolute percentage error:', np.mean(ar_perc_error_list, axis = 0)),
    ('Median absolute percentage error:', np.median(ar_perc_error_list, axis = 0)),
    ('75th percentile of absolute percentage error:', np.percentile(ar_perc_error_list, 75, axis = 0)),
    ('90th percentile of absolute percentage error:', np.percentile(ar_perc_error_list, 90, axis = 0)),
]
for label, value in ar_perc_error_summary:
    print(label, value)

In [None]:
# Absolute errors in ar_abs_error_list, scaled by the mean Atlanta volume from position T+h onwards
avg_global = atl_df['Total Volume'].iloc[T+h:].mean()
ar_abs_error_summary = [
    ('Mean absolute error ratio:', np.mean(ar_abs_error_list, axis = 0)),
    ('Median absolute error ratio:', np.median(ar_abs_error_list, axis = 0)),
    ('75th percentile absolute error ratio:', np.percentile(ar_abs_error_list, 75, axis = 0)),
    ('90th percentile absolute error ratio:', np.percentile(ar_abs_error_list, 90, axis = 0)),
]
for label, value in ar_abs_error_summary:
    print(label, value/avg_global)

In [None]:
# Rolling evaluation of an ARIMA model on the Atlanta total volume, with the
# weekly temperature statistics as exogenous regressors.
# The series and the window settings are declared here, as in the other
# rolling-evaluation cells, so this cell does not depend on values left over
# from whichever cell happened to run before it.
time_series = atl_df['Total Volume']

# Define the number of predictions to make
h = 1

# Define the length of each training set
T = 100

# Initialize the lists to store the percentage and absolute errors
ar_perc_error_list = []
ar_abs_error_list = []

ar_preds_train = np.zeros(T+h) # In case we wish to use the ARIMA predictions

# Temperature regressors; selected once because the selection does not change between windows
exog_all = atl_df[['min_temp', 'max_temp', 'avg_temp', 'stdev_temp']]

# Using a specified order (this would need to be fine-tuned)
order = (2, 1, 1)
# seasonal_order = (1, 0, 0, seasonal_periods)

# Loop through the data frame and make predictions using ARIMA
for i in range(len(time_series) - T - h):
    # Define the training and testing data sets; the regressors use the same row positions as the series
    train = time_series.iloc[i:i+T].values
    test = time_series.iloc[i+T:i+T+h].values
    exog_df = exog_all.iloc[i:i+T].values
    exog_test = exog_all.iloc[i+T:i+T+h].values

    # Fit the SARIMAX or ARIMA model
    model = SARIMAX(endog=train, exog=exog_df, order=order, seasonal_order=None)
    fit_model = model.fit(disp=False)

    # Make predictions; the regressor values of the forecast period are passed alongside
    pred_list = fit_model.forecast(steps=len(test), exog=exog_test)
    preds = pred_list[h-1]

    # Calculate percentage and absolute errors
    ar_perc_errors = np.abs(test[h-1]-preds)/test[h-1]
    ar_abs_errors = np.abs(test[h-1]-preds)

    # Store the percentage and absolute errors
    ar_perc_error_list.append(ar_perc_errors)
    ar_abs_error_list.append(ar_abs_errors)

    # Get the ARIMA predictions
    ar_preds_train = np.append(ar_preds_train, preds)

In [None]:
# Summary statistics of the absolute percentage errors currently held in ar_perc_error_list
ar_perc_error_summary = [
    ('Mean absolute percentage error:', np.mean(ar_perc_error_list, axis = 0)),
    ('Median absolute percentage error:', np.median(ar_perc_error_list, axis = 0)),
    ('75th percentile of absolute percentage error:', np.percentile(ar_perc_error_list, 75, axis = 0)),
    ('90th percentile of absolute percentage error:', np.percentile(ar_perc_error_list, 90, axis = 0)),
]
for label, value in ar_perc_error_summary:
    print(label, value)

In [None]:
# Absolute errors in ar_abs_error_list, scaled by the mean Atlanta volume from position T+h onwards
avg_global = atl_df['Total Volume'].iloc[T+h:].mean()
ar_abs_error_summary = [
    ('Mean absolute error ratio:', np.mean(ar_abs_error_list, axis = 0)),
    ('Median absolute error ratio:', np.median(ar_abs_error_list, axis = 0)),
    ('75th percentile absolute error ratio:', np.percentile(ar_abs_error_list, 75, axis = 0)),
    ('90th percentile absolute error ratio:', np.percentile(ar_abs_error_list, 90, axis = 0)),
]
for label, value in ar_abs_error_summary:
    print(label, value/avg_global)

In [None]:
# Splitting Atlanta data into train (up to 2016-11-27) and test (from 2016-11-28)
# The explicit copies make both frames independent of atl_df: the following
# cells add columns to them and drop rows in place, which should not be done
# on a filtered selection of another frame.
train_atl = atl_df[atl_df['Date'] <= '2016-11-27'].copy()
test_atl = atl_df[atl_df['Date'] >= '2016-11-28'].copy()

train_atl.head()
train_atl.tail()

test_atl.head()
test_atl.tail()

In [None]:
# Add the 1-, 2- and 3-row lags of price and volume to the test frame
lag_specs = {
    'AveragePrice': ('lag1', 'lag2', 'lag3'),
    'Total Volume': ('lag1tv', 'lag2tv', 'lag3tv'),
}
for source_col, lag_names in lag_specs.items():
    for periods, lag_name in enumerate(lag_names, start=1):
        test_atl[lag_name] = test_atl[source_col].shift(periods = periods)

# Rows with any missing value are removed; the first three rows qualify because their lags are empty
test_atl.dropna(inplace = True)
test_atl.head()

In [None]:
# Add the 1-, 2- and 3-row lags of price and volume to the training frame
lag_specs = {
    'AveragePrice': ('lag1', 'lag2', 'lag3'),
    'Total Volume': ('lag1tv', 'lag2tv', 'lag3tv'),
}
for source_col, lag_names in lag_specs.items():
    for periods, lag_name in enumerate(lag_names, start=1):
        train_atl[lag_name] = train_atl[source_col].shift(periods = periods)

# Rows with any missing value are removed; the first three rows qualify because their lags are empty
train_atl.dropna(inplace = True)
train_atl.head()

In [None]:
# Add the calendar month of each row's date as a feature, test frame first, then training frame
for frame in (test_atl, train_atl):
    frame['month'] = pd.to_datetime(frame['Date']).dt.month
train_atl.head()

In [None]:
# One shot training of a gradient boosting model for the Atlanta average price
X_train = train_atl[['lag1', 'lag2', 'lag3', 'month', 'min_temp', 'max_temp', 'avg_temp', 'stdev_temp']]
y_train = train_atl['AveragePrice']
X_train.head()

# defining the model and parameters
# random_state is fixed (same seed as the XGBoost models in this notebook) so
# that re-running the cell reproduces the same fitted model
gb = GradientBoostingRegressor(n_estimators = 100, max_depth = 5, min_samples_leaf = 2,
                               random_state = 42)

# Asking the model to fit the training data
gb = gb.fit(X_train, y_train) 

# Asking for the importance of each feature (same order as the columns of X_train)
gb.feature_importances_

In [None]:
# Evaluate the fitted gradient boosting model on the Atlanta test frame
feature_cols = ['lag1', 'lag2', 'lag3', 'month', 'min_temp', 'max_temp', 'avg_temp', 'stdev_temp']
X_test = test_atl[feature_cols]
y_test = test_atl['AveragePrice']

# Forecast every test row in one call
y_preds = gb.predict(X_test)

# Absolute errors first, then the same errors relative to the observed prices
abs_errors = np.abs(y_test-y_preds)
perc_errors = abs_errors/y_test

In [None]:
# Summary statistics of the absolute percentage errors currently held in perc_errors
perc_error_summary = [
    ('Mean absolute percentage error:', np.mean(perc_errors)),
    ('Median absolute percentage error:', np.median(perc_errors)),
    ('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75)),
    ('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90)),
]
for label, value in perc_error_summary:
    print(label, value)

In [None]:
# Absolute errors in abs_errors, scaled by the mean Atlanta price from position T+h onwards
avg_global = atl_df['AveragePrice'].iloc[T+h:].mean()
abs_error_summary = [
    ('Mean absolute error ratio:', np.mean(abs_errors)),
    ('Median absolute error ratio:', np.median(abs_errors)),
    ('75th percentile absolute error ratio:', np.percentile(abs_errors, 75)),
    ('90th percentile absolute error ratio:', np.percentile(abs_errors, 90)),
]
for label, value in abs_error_summary:
    print(label, value/avg_global)

## XGB

In [None]:
# Features and target for the Atlanta total-volume model; train and test share the same columns
feature_cols = ['lag1tv', 'lag2tv', 'lag3tv', 'month', 'min_temp', 'max_temp', 'avg_temp', 'stdev_temp']
target_col = 'Total Volume'

# Training data
X_train = train_atl[feature_cols]
y_train = train_atl[target_col]
X_train.head()

# Data the forecasts are evaluated on
X_test = test_atl[feature_cols]
y_test = test_atl[target_col]

In [None]:
# Hyperparameters of the XGBoost regressor, collected in one place
xgb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=1.0,
    min_child_weight = 5.0,
    colsample_bytree=1.0,
    gamma = 5.0,
    objective='reg:absoluteerror',
    random_state=42,
)
model = XGBRegressor(**xgb_params)

# Fit on the training features and target prepared beforehand
model.fit(X_train, y_train)

In [None]:
# Forecast every test row with the fitted model
y_preds = model.predict(X_test)
# Absolute errors first, then the same errors relative to the observed values
abs_errors = np.abs(y_test-y_preds)
perc_errors = abs_errors/y_test

In [None]:
# Summary statistics of the absolute percentage errors currently held in perc_errors
perc_error_summary = [
    ('Mean absolute percentage error:', np.mean(perc_errors)),
    ('Median absolute percentage error:', np.median(perc_errors)),
    ('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75)),
    ('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90)),
]
for label, value in perc_error_summary:
    print(label, value)

In [None]:
# Absolute errors in abs_errors, scaled by the mean Atlanta volume from position T+h onwards
avg_global = atl_df['Total Volume'].iloc[T+h:].mean()
abs_error_summary = [
    ('Mean absolute error ratio:', np.mean(abs_errors)),
    ('Median absolute error ratio:', np.median(abs_errors)),
    ('75th percentile absolute error ratio:', np.percentile(abs_errors, 75)),
    ('90th percentile absolute error ratio:', np.percentile(abs_errors, 90)),
]
for label, value in abs_error_summary:
    print(label, value/avg_global)

## XGB For Boston Data

In [None]:
# One frame per city, selected from the merged price/temperature table
region_col = final_df['region']
bos_df = final_df[region_col == 'Boston']
hou_df = final_df[region_col == 'Houston']
san_df = final_df[region_col == 'San Diego']

In [None]:
# Give the Houston and San Diego volume columns city-specific names so they
# stay distinguishable once they sit next to Boston's own 'Total Volume'
hou_df.rename(columns={'Total Volume': 'Total Volume Hou'}, inplace = True)
san_df.rename(columns={'Total Volume': 'Total Volume San'}, inplace = True)

# Left joins on Date: every Boston row is kept and receives the other two
# cities' volume for the same date where one exists.
# (The former `volume_df = []` placeholder is gone: it was overwritten right away.)
volume_df = bos_df.merge(hou_df[['Date', 'Total Volume Hou']], how = 'left', on = ['Date'])
volume_df = volume_df.merge(san_df[['Date', 'Total Volume San']], how = 'left', on = ['Date'])
volume_df.head()

In [None]:
# New column -> (source column, number of rows to shift by)
lagged_columns = {
    'lag1tv': ('Total Volume', 1),
    'lag2tv': ('Total Volume', 2),
    'lag3tv': ('Total Volume', 3),
    'houtv': ('Total Volume Hou', 1),
    'santv': ('Total Volume San', 1),
}
for new_col, (source_col, periods) in lagged_columns.items():
    volume_df[new_col] = volume_df[source_col].shift(periods = periods)

# Rows with any missing value are removed; the first three rows qualify because their lags are empty
volume_df.dropna(inplace = True)
volume_df.head()

In [None]:
# XGBoost forecast of Boston's total volume from its own lags, the month and the temperature statistics
volume_df['month'] = pd.to_datetime(volume_df['Date']).dt.month
# Same split date as the Atlanta models: train up to 2016-11-27, test from 2016-11-28
train_bos = volume_df[volume_df['Date'] <= '2016-11-27']
test_bos = volume_df[volume_df['Date'] >= '2016-11-28']

# One shot training on the rows before the split date
X_train = train_bos[['lag1tv', 'lag2tv', 'lag3tv', 'month', 'min_temp', 'max_temp', 'avg_temp', 'stdev_temp']]
y_train = train_bos['Total Volume']
X_train.head()

# Rows after the split date, forecast with XGBoost below

X_test = test_bos[['lag1tv', 'lag2tv', 'lag3tv', 'month', 'min_temp', 'max_temp', 'avg_temp', 'stdev_temp']]
y_test = test_bos['Total Volume']

# Define the XGBoost regressor with specific hyperparameters
model = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=1.0,
    min_child_weight = 5.0, 
    colsample_bytree=1.0,
    gamma = 5.0,
    objective='reg:absoluteerror',
    random_state=42
    )

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_preds = model.predict(X_test)
# Calculate percentage and absolute errors
perc_errors = np.abs(y_test-y_preds)/y_test
abs_errors = np.abs(y_test-y_preds)

# Print the percentage-error results
print('Mean absolute percentage error:', np.mean(perc_errors))
print('Median absolute percentage error:', np.median(perc_errors))
print('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75))
print('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90))

# Print the absolute error ratio results
# The errors are Boston volumes, so they are scaled by Boston's mean volume
# (from position T+h onwards, as in the Atlanta cells), not by Atlanta's
avg_global = bos_df['Total Volume'].iloc[T+h:].mean()
print('Mean absolute error ratio:', np.mean(abs_errors)/avg_global)
print('Median absolute error ratio:', np.median(abs_errors)/avg_global)
print('75th percentile absolute error ratio:', np.percentile(abs_errors, 75)/avg_global)
print('90th percentile absolute error ratio:', np.percentile(abs_errors, 90)/avg_global)

In [None]:
# XGBoost forecast of Boston's total volume from its own lags and the month only (no temperature features)
# One shot training on the rows before the split date
X_train = train_bos[['lag1tv', 'lag2tv', 'lag3tv', 'month']]
y_train = train_bos['Total Volume']
X_train.head()

# Rows after the split date, forecast with XGBoost below

X_test = test_bos[['lag1tv', 'lag2tv', 'lag3tv', 'month']]
y_test = test_bos['Total Volume']

# Define the XGBoost regressor with specific hyperparameters
model = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=1.0,
    min_child_weight = 5.0, 
    colsample_bytree=1.0,
    gamma = 5.0,
    objective='reg:absoluteerror',
    random_state=42
    )

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_preds = model.predict(X_test)
# Calculate percentage and absolute errors
perc_errors = np.abs(y_test-y_preds)/y_test
abs_errors = np.abs(y_test-y_preds)

# Print the percentage-error results
print('Mean absolute percentage error:', np.mean(perc_errors))
print('Median absolute percentage error:', np.median(perc_errors))
print('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75))
print('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90))

# Print the absolute error ratio results
# The errors are Boston volumes, so they are scaled by Boston's mean volume
# (from position T+h onwards, as in the Atlanta cells), not by Atlanta's
avg_global = bos_df['Total Volume'].iloc[T+h:].mean()
print('Mean absolute error ratio:', np.mean(abs_errors)/avg_global)
print('Median absolute error ratio:', np.median(abs_errors)/avg_global)
print('75th percentile absolute error ratio:', np.percentile(abs_errors, 75)/avg_global)
print('90th percentile absolute error ratio:', np.percentile(abs_errors, 90)/avg_global)

In [None]:
# XGBoost forecast of Boston's total volume from its own lags, the month, the
# temperature statistics and the lagged Houston / San Diego volumes
# One shot training on the rows before the split date
X_train = train_bos[['lag1tv', 'lag2tv', 'lag3tv', 'month', 'min_temp', 'max_temp', 'avg_temp', 'stdev_temp', 'houtv', 'santv']]
y_train = train_bos['Total Volume']
X_train.head()

# Rows after the split date, forecast with XGBoost below

X_test = test_bos[['lag1tv', 'lag2tv', 'lag3tv', 'month', 'min_temp', 'max_temp', 'avg_temp', 'stdev_temp', 'houtv', 'santv']]
y_test = test_bos['Total Volume']

# Define the XGBoost regressor with specific hyperparameters
# (deeper trees than the two preceding Boston models: max_depth=5 instead of 3)
model = XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=1.0,
    min_child_weight = 5.0, 
    colsample_bytree=1.0,
    gamma = 5.0,
    objective='reg:absoluteerror',
    random_state=42
    )

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_preds = model.predict(X_test)
# Calculate percentage and absolute errors
perc_errors = np.abs(y_test-y_preds)/y_test
abs_errors = np.abs(y_test-y_preds)

# Print the percentage-error results
print('Mean absolute percentage error:', np.mean(perc_errors))
print('Median absolute percentage error:', np.median(perc_errors))
print('75th percentile of absolute percentage error:', np.percentile(perc_errors, 75))
print('90th percentile of absolute percentage error:', np.percentile(perc_errors, 90))

# Print the absolute error ratio results
# The errors are Boston volumes, so they are scaled by Boston's mean volume
# (from position T+h onwards, as in the Atlanta cells), not by Atlanta's
avg_global = bos_df['Total Volume'].iloc[T+h:].mean()
print('Mean absolute error ratio:', np.mean(abs_errors)/avg_global)
print('Median absolute error ratio:', np.median(abs_errors)/avg_global)
print('75th percentile absolute error ratio:', np.percentile(abs_errors, 75)/avg_global)
print('90th percentile absolute error ratio:', np.percentile(abs_errors, 90)/avg_global)