# Simple Heuristics

_by Nick Brooks, March 2020_

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import metrics 

import math
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import time

# IPython magic: select the inline matplotlib backend for this notebook.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set_style("whitegrid")

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Wall-clock reference point; the last cell subtracts it to report total runtime.
notebookstart = time.time()

In [None]:
# Load the training sales table and the sample submission, in that order.
train_sales, submission_file = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv',
        '/kaggle/input/m5-forecasting-accuracy/sample_submission.csv',
    )
)

In [None]:
# Report the dimensions of both tables and preview the first training rows.
n_rows, n_cols = train_sales.shape
print("Train Sales Shape: {} Rows, {} Columns".format(n_rows, n_cols))
display(train_sales.head())

n_rows, n_cols = submission_file.shape
print("Submission File Shape: {} Rows, {} Columns".format(n_rows, n_cols))

In [None]:
# The daily sales columns are named d_1 ... d_1913; select exactly those columns,
# in chronological order, as the time-series matrix used by the backtests below.
days = range(1, 1913 + 1)
time_series_columns = [f'd_{day}' for day in days]
time_series_data = train_sales.loc[:, time_series_columns]

In [None]:
# Flat-level heuristics: each entry is (display name, row-wise reducer, reducer kwargs).
# The reducer collapses a (series x days) window into one constant level per series.
simple_models = [
    ('Flat Mean', np.mean, dict(axis=1)),
    ('Flat Median', np.median, dict(axis=1)),
]

sub_size = 28  # number of forecast columns (F1..F28) written to every submission file
ts_metrics = ['rmse', 'mse', 'mae']
test_windows = [7, 14, 28, 56]  # window lengths (in days) evaluated for every heuristic
full_eval = {}  # "<method>-<window>" -> {metric name: mean value over all time slices}


def _flat_forecast(history, horizon, func, kwargs):
    """Reduce each row of `history` with `func` and repeat that level `horizon` times.

    Parameters
    ----------
    history : 2-D array, one row per series and one column per day.
    horizon : number of future days to fill with the constant level.
    func, kwargs : row-wise reducer and its keyword arguments (e.g. np.mean, axis=1).

    Returns
    -------
    2-D array of shape (number of series, horizon).
    """
    level = np.asarray(func(history, **kwargs))
    return np.repeat(level[:, np.newaxis], horizon, axis=1)


# Loop-invariant inputs, computed once instead of on every iteration.
sales_matrix = time_series_data.values
n_days = sales_matrix.shape[1]

# Each series is written twice: once under its original id and once with
# 'validation' replaced by 'evaluation' in the id.
validation_ids = train_sales['id'].values
evaluation_ids = [i.replace('validation', 'evaluation') for i in validation_ids]
ids = np.concatenate([validation_ids, evaluation_ids])
forecast_columns = [f'F{i}' for i in range(1, sub_size + 1)]

columns = 2
rows = math.ceil(len(test_windows) / columns)

for method, func, kwargs in simple_models:
    print(f"Start {method}")
    # One figure per heuristic, one panel per window length.
    f, axes = plt.subplots(rows, columns, figsize=[20, 10], squeeze=False)
    axes = axes.ravel()

    for plot_i, prediction_size in enumerate(test_windows):
        time_splits = n_days // prediction_size
        rows_to_consider = time_splits * prediction_size
        # Keep only the most recent days that divide evenly into windows.
        tmp_matrix = sales_matrix[:, -rows_to_consider:]

        # Time-split backtesting: fit the level on window i, score it on window i + 1.
        time_split_results_list = []
        for i in range(time_splits - 1):
            history = tmp_matrix[:, i * prediction_size:(i + 1) * prediction_size]
            validation = tmp_matrix[:, (i + 1) * prediction_size:(i + 2) * prediction_size]
            predicted = _flat_forecast(history, prediction_size, func, kwargs)

            mse = metrics.mean_squared_error(validation, predicted)
            # RMSE is derived from the overall MSE rather than via `squared=False`,
            # which newer scikit-learn releases deprecated and then removed.
            rmse = math.sqrt(mse)
            mae = metrics.mean_absolute_error(validation, predicted)

            # Keyed by metric name so values can never be paired with the wrong label.
            time_split_results_list.append(
                {'time_slice': i, 'rmse': rmse, 'mse': mse, 'mae': mae}
            )

        # Submission: apply the same heuristic to the most recent `prediction_size`
        # days and repeat the resulting level across `sub_size` forecast columns.
        # NOTE(review): F1..F28 presumably matches sample_submission.csv - confirm.
        forecast = pd.DataFrame(
            _flat_forecast(sales_matrix[:, -prediction_size:], sub_size, func, kwargs),
            columns=forecast_columns,
        )
        forecast = pd.concat([forecast] * 2).reset_index(drop=True)
        predictions = pd.concat([pd.DataFrame(ids, columns=['id']), forecast], axis=1)
        # One file per (method, window) combination.
        predictions.to_csv(f"{method}-{prediction_size}_sub.csv", index=False)

        # Evaluation: one row per time slice, one column per metric.
        time_split_results = pd.DataFrame(
            time_split_results_list, columns=['time_slice'] + ts_metrics
        )

        # Overall evaluation: mean of each metric over all slices, rounded for display.
        overall_eval = {
            name: round(value, 2)
            for name, value in time_split_results[ts_metrics].mean(axis=0).to_dict().items()
        }
        full_eval[f"{method}-{str(prediction_size)}"] = overall_eval

        # Plot the per-slice metrics on this window's panel.
        ax = axes[plot_i]
        time_split_results[ts_metrics].plot(ax=ax)
        ax.set_title(f"Method: {method}, Days Window: {prediction_size} Time-Split Validation Metrics\n{overall_eval}")

    # Hide panels left over when the window count does not fill the grid.
    for unused_ax in axes[len(test_windows):]:
        unused_ax.set_visible(False)

    plt.tight_layout(pad=1)
    plt.show()

In [None]:
# One row per "<method>-<window>" key of full_eval, one column per metric.
results = pd.DataFrame(full_eval).transpose()
# A bare expression as the last line of a cell is rendered as the cell output.
results

In [None]:
# ts_metrics

# f,ax = plt.subplots(1,len(ts_metrics), figsize = [10,10])
# for col in ts_metrics:
    

In [None]:
# Total time since `notebookstart` was recorded, converted from seconds to minutes.
elapsed_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % elapsed_minutes)