In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error, mean_absolute_error
import ipywidgets as widgets
from IPython.display import display

# Location of the raw CSV that backs the dashboard.
url = 'https://raw.githubusercontent.com/nathankalvaitis/ipython-dashboard/main/Steel_industry_data.csv'

data = pd.read_csv(url)

# Parse the day-first timestamps and use them as the index so the frame can be
# sliced by date and handed to the time-series model.
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y %H:%M')
data = data.set_index('date')

# Put the index on an explicit 15-minute grid. '15min' is used instead of
# '15T' because the single-letter 'T' alias is deprecated in recent pandas.
data = data.asfreq('15min')

# Calendar features used by the day-of-week / hour-of-day filters.
data['DayOfWeek'] = data.index.day_name()
data['HourOfDay'] = data.index.hour

# Create the widgets

# Date pickers for the bounds of the training and test windows.
start_training = widgets.DatePicker(
    description='Start Training:',
    disabled=False,
)
end_training = widgets.DatePicker(
    description='End Training:',
    disabled=False,
)
start_test = widgets.DatePicker(
    description='Start Test:',
    disabled=False,
)
end_test = widgets.DatePicker(
    description='End Test:',
    disabled=False,
)

# Multi-select filters for weekday, hour of day and load type.
days_of_week = widgets.SelectMultiple(
    options=[
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ],
    description='Days of Week:',
)
hours_of_day = widgets.SelectMultiple(
    options=[*range(24)],
    description='Hours of Day:',
)
load_type = widgets.SelectMultiple(
    options=['Light_Load', 'Medium_Load', 'Maximum_Load'],
    description='Load Type:',
)

# Button labelled 'Run'.
run_button = widgets.Button(description='Run')

def filter_and_plot(start_training, end_training, start_test, end_test, days_of_week, hours_of_day, load_type):
    """Fit a seasonal Holt-Winters model and report how well it forecasts the test window.

    The model is fitted on the full, unfiltered training window. The day,
    hour and load-type selections are applied afterwards and only narrow the
    rows that are scored, summarised and plotted.

    Args:
        start_training: Lower bound used to slice ``data`` for fitting.
        end_training: Upper bound used to slice ``data`` for fitting.
        start_test: Lower bound used to slice ``data`` for evaluation.
        end_test: Upper bound used to slice ``data`` for evaluation.
        days_of_week: Day names to keep; an empty selection keeps every day.
        hours_of_day: Hours to keep; an empty selection keeps every hour.
        load_type: ``Load_Type`` values to keep; an empty selection keeps all.

    Returns:
        None. Error metrics and summary statistics are printed and a 2x2
        figure is shown.
    """
    # An unset date arrives as None, and a None slice bound silently widens
    # the window to the whole dataset (so train and test would overlap).
    # Wait until every date has been chosen instead.
    if any(bound is None for bound in (start_training, end_training, start_test, end_test)):
        print('Select start and end dates for both the training and test periods.')
        return

    # NOTE(review): pandas appears to treat plain ``date`` bounds as midnight,
    # so an end date probably contributes only its 00:00 sample -- confirm
    # this is the intended window.
    train = data.loc[start_training:end_training]
    test = data.loc[start_test:end_test]
    if train.empty or test.empty:
        print('The selected training or test period contains no data.')
        return

    # 96 samples per day * 7 days: one weekly cycle on a 15-minute grid.
    model = ExponentialSmoothing(train['Usage_kWh'], seasonal='add', seasonal_periods=96*7).fit()

    forecast = model.predict(start=test.index[0], end=test.index[-1])

    # Ensure predictions do not go below the minimum value observed in the training data
    forecast = forecast.clip(lower=train['Usage_kWh'].min())

    # to_frame() names the column explicitly; DataFrame(series, columns=[...])
    # would produce an all-NaN column if the series carried another name.
    predictions = forecast.to_frame(name='Predicted_Usage')

    if days_of_week:
        selected_days = list(days_of_week)
        train = train[train['DayOfWeek'].isin(selected_days)]
        test = test[test['DayOfWeek'].isin(selected_days)]

    if hours_of_day:
        selected_hours = list(hours_of_day)
        train = train[train['HourOfDay'].isin(selected_hours)]
        test = test[test['HourOfDay'].isin(selected_hours)]

    if load_type:
        selected_loads = list(load_type)
        train = train[train['Load_Type'].isin(selected_loads)]
        test = test[test['Load_Type'].isin(selected_loads)]

    # The forecast has no Load_Type column, so it cannot be filtered the same
    # way as ``test``. Keeping only the timestamps that survived in ``test``
    # applies every filter at once and keeps both series the same length.
    predictions = predictions[predictions.index.isin(test.index)]

    # Score only timestamps where both an actual and a predicted value exist.
    scored = pd.concat(
        [test['Usage_kWh'], predictions['Predicted_Usage']],
        axis=1,
        join='inner',
    ).dropna()
    if train.empty or scored.empty:
        print('No data left after applying the selected filters.')
        return

    mse = mean_squared_error(scored['Usage_kWh'], scored['Predicted_Usage'])
    mae = mean_absolute_error(scored['Usage_kWh'], scored['Predicted_Usage'])
    errors = scored['Usage_kWh'] - scored['Predicted_Usage']
    errors_by_hour = errors.groupby(errors.index.hour).mean()

    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')

    # calculate and print overall stats across time period
    overall_usage = data['Usage_kWh']
    overall_min = overall_usage.min()
    overall_max = overall_usage.max()
    overall_mean = overall_usage.mean()

    for heading, usage in (
        ('Overall', overall_usage),
        ('\nTraining Period', train['Usage_kWh']),
        ('\nTest Period', test['Usage_kWh']),
    ):
        print(f'{heading} Usage_kWh Statistics:')
        print(f'Min: {usage.min():.2f}, Max: {usage.max():.2f}, Mean: {usage.mean():.2f}, Sum: {usage.sum():.2f}')

    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    overview_ax, zoom_ax = axs[0, 0], axs[0, 1]
    error_ax, heatmap_ax = axs[1, 0], axs[1, 1]

    # The overview panel additionally shows the training data; everything
    # else is shared with the zoomed forecast panel.
    overview_ax.plot(train['Usage_kWh'], label='Training Data')
    for ax in (overview_ax, zoom_ax):
        ax.plot(test['Usage_kWh'], label='Actual Usage')
        ax.plot(predictions['Predicted_Usage'], label='Predicted Usage', color='red')
        ax.axhline(overall_min, color='green', linestyle='--', label='Overall Min')
        ax.axhline(overall_max, color='purple', linestyle='--', label='Overall Max')
        ax.axhline(overall_mean, color='orange', linestyle='--', label='Overall Mean')
        ax.legend()
        ax.grid(True)
    overview_ax.set_title('Training, Actual, and Predicted Usage')

    # forecast period zoom
    zoom_ax.set_xlabel('Date')
    zoom_ax.set_ylabel('Usage (kWh)')
    zoom_ax.set_title('Actual vs Predicted Usage (Zoomed into Forecast Period)')

    # residual by hour
    error_ax.plot(errors_by_hour.index.values, errors_by_hour.values, marker='o')
    error_ax.set_xlabel('Hour of Day')
    error_ax.set_ylabel('Mean Error (kWh)')
    error_ax.set_title('Mean Forecast Error by Hour')
    error_ax.grid(True)

    # heatmap -- originally wanted to do vs hour of day
    heatmap_data = train.pivot_table(
        values='Usage_kWh',
        index='DayOfWeek',
        columns='Load_Type',
        aggfunc='sum',
        margins=True,
        margins_name='Total',
    )
    sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='YlGnBu', ax=heatmap_ax)
    heatmap_ax.set_title('Sum of Usage by Day of Week and Load Type - Training Period Only')

    fig.tight_layout()
    plt.show()

#  output
# Results are rendered into a plain Output widget and only refreshed when the
# Run button is clicked. Re-running on every widget change would refit the
# model each time a single date or filter is touched, and would leave the
# displayed Run button without any effect.
interactive_plot = widgets.Output()


def _run_forecast(_button):
    """Run filter_and_plot with the current widget values and show the result."""
    interactive_plot.clear_output(wait=True)
    with interactive_plot:
        filter_and_plot(
            start_training=start_training.value,
            end_training=end_training.value,
            start_test=start_test.value,
            end_test=end_test.value,
            days_of_week=days_of_week.value,
            hours_of_day=hours_of_day.value,
            load_type=load_type.value,
        )


run_button.on_click(_run_forecast)

# display the widgets and the output
display(start_training, end_training, start_test, end_test, days_of_week, hours_of_day, load_type, run_button, interactive_plot)


DatePicker(value=None, description='Start Training:')

DatePicker(value=None, description='End Training:')

DatePicker(value=None, description='Start Test:')

DatePicker(value=None, description='End Test:')

SelectMultiple(description='Days of Week:', options=('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', '…

SelectMultiple(description='Hours of Day:', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…

SelectMultiple(description='Load Type:', options=('Light_Load', 'Medium_Load', 'Maximum_Load'), value=())

Button(description='Run', style=ButtonStyle())

Output()