# Simplest Prophet ('ds', 'y')

In [None]:
import os
# Switch the working directory to the project root so the relative data paths
# used in this notebook resolve; raises KeyError if PROJECT_ROOT is not set.
os.chdir(os.environ['PROJECT_ROOT'])

In [None]:
import numpy as np
import pandas as pd
from prophet import Prophet
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from mentorship.ml.cv.split import DateTimeSeriesSplit

In [None]:
# Relative directory that holds the competition CSV files read by this notebook.
DATA_ROOT = Path('data') / 'kaggle' / 'store-sales-time-series-forecasting'

In [None]:
# Load the raw training table and preview its first rows.
train_csv_path = DATA_ROOT / 'train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Keep only the columns used below and rename date/sales to 'ds'/'y',
# the column names the Prophet models in this notebook are fitted on.
prophet_column_names = {'date': 'ds', 'sales': 'y'}
train = train.loc[:, ['date', 'sales', 'store_nbr', 'family']].rename(columns=prophet_column_names)
# Lower-case the family labels.
train = train.assign(family=train['family'].str.lower())
train.head()

In [None]:
# Work on an independent copy so `train` itself is left untouched.
X = train.copy(deep=True)

In [None]:
# Target values as a standalone Series that shares X's row index.
y = X.loc[:, 'y'].copy()

In [None]:
# Row-count based splitter: window sizes are given in days and scaled by the
# number of (family, store) combinations.
# NOTE(review): assumes one row per (family, store) pair per day, ordered by date — confirm.
rows_per_day = X['family'].nunique() * X['store_nbr'].nunique()
tscv = TimeSeriesSplit(
    n_splits=4,
    max_train_size=365 * rows_per_day,
    test_size=15 * rows_per_day,
    gap=0,
)

In [None]:
# NOTE(review): `splitter` is not referenced by any other cell visible in this
# file; the cross-validation loop iterates `tscv` instead.
splitter = DateTimeSeriesSplit()

In [None]:
# Per-fold evaluation: fit one Prophet model for every (family, store) series on
# the fold's training rows, forecast the fold's test rows, and print the mean
# RMSLE, RMSE, MAE and R2 over all series of that fold.
for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]

    # Score lists are reset for every fold so the printed means are per-fold.
    scores_RMSLE = []
    scores_RMSE = []
    scores_MAE = []
    scores_R2 = []

    for current_family in X['family'].unique():
        # The family masks do not depend on the store, so build them once per family.
        train_family_mask = X_train['family'] == current_family
        test_family_mask = X_test['family'] == current_family

        for current_store_nbr in X['store_nbr'].unique():
            train_mask = train_family_mask & (X_train['store_nbr'] == current_store_nbr)
            test_mask = test_family_mask & (X_test['store_nbr'] == current_store_nbr)

            # Prophet is fitted on the 'ds'/'y' columns only.
            X_current_train = X_train.loc[train_mask, ['ds', 'y']]
            model = Prophet()
            model.fit(X_current_train)

            # Only the dates are needed to forecast; the identifier columns are
            # dropped consistently with the training frame.
            X_current_test = X_test.loc[test_mask, ['ds']]
            current_test_indices = X_current_test.index
            y_current_test = y.loc[current_test_indices]

            # Negative forecasts are clamped to zero (RMSLE is undefined for
            # negative values); clip returns a new Series instead of mutating
            # the prediction frame's column in place.
            y_current_pred = model.predict(X_current_test)['yhat'].clip(lower=0)

            scores_RMSLE.append(np.sqrt(mean_squared_log_error(y_current_test, y_current_pred)))
            # Explicit square root instead of the deprecated `squared=False` flag.
            scores_RMSE.append(np.sqrt(mean_squared_error(y_current_test, y_current_pred)))
            scores_MAE.append(mean_absolute_error(y_current_test, y_current_pred))
            scores_R2.append(r2_score(y_current_test, y_current_pred))

    # `np.mean`: a bare `mean` is not defined anywhere in this notebook.
    print(np.mean(scores_RMSLE), np.mean(scores_RMSE), np.mean(scores_MAE), np.mean(scores_R2))

In [None]:
# Load the raw test table and preview its first rows.
test_csv_path = DATA_ROOT / 'test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Shape the test table like the training table: drop the columns that are not
# used, rename the date column to 'ds', and lower-case the family labels.
test_prepared_data = (
    test
    .drop(columns=['onpromotion', 'id'])
    .rename(columns={'date': 'ds'})
)
test_prepared_data['family'] = test_prepared_data['family'].str.lower()
test_prepared_data.head()

In [None]:
# Submission template whose 'sales' column is filled with forecasts below.
submission = pd.read_csv(DATA_ROOT.joinpath('sample_submission.csv'))

In [None]:
# Fit one Prophet model per (family, store) series on the full prepared training
# data and write its forecast into the matching rows of `submission`.
# The prepared training frame in this notebook is `train`; the previously used
# name `train_prepared_data` is never defined here.
for family in train['family'].unique():
    for store_nbr in train['store_nbr'].unique():
        train_mask = (train['family'] == family) & (train['store_nbr'] == store_nbr)
        # Prophet is fitted on the 'ds'/'y' columns only.
        X_train = train.loc[train_mask, ['ds', 'y']]
        model = Prophet()
        model.fit(X_train)

        # The test rows must be selected with a mask built from the test frame
        # itself; a mask built from the training frame does not align with it.
        test_mask = (test_prepared_data['family'] == family) & (test_prepared_data['store_nbr'] == store_nbr)
        test_indexer = test_prepared_data.index[test_mask]
        X_test = test_prepared_data.loc[test_mask, ['ds']]

        # NOTE(review): assumes `submission` rows share the test frame's index
        # (same row order) — confirm.
        submission.loc[test_indexer, 'sales'] = model.predict(X_test)['yhat'].to_numpy()

In [None]:
# Clamp negative forecasts to zero. A single .loc assignment is required:
# chained indexing (`df[mask]['sales'] = 0`) writes into a temporary copy and
# leaves `submission` unchanged.
submission.loc[submission['sales'] < 0, 'sales'] = 0

In [None]:
# Write the submission next to the input data; DATA_ROOT is reused so the
# directory is spelled in one place only (same location as the former literal path).
submission.to_csv(DATA_ROOT / 'simplest_prophet.csv', index=False)

# Prophet with holidays

In [None]:
# Start the holiday experiment from a fresh copy of the prepared training data
# and a standalone copy of its target column.
X = train.copy(deep=True)
y = X.loc[:, 'y'].copy()

In [None]:
# Restrict the training data to the 365 days before the last distinct 'ds'
# value (taken in row order).
last_seen_date = pd.to_datetime(X['ds'].unique()[-1])
first_day_of_last_year = last_seen_date - pd.Timedelta(days=365)
# Date part of the timestamp's text form, compared against the 'ds' column.
# NOTE(review): presumably 'ds' holds 'YYYY-MM-DD' strings — confirm.
cutoff_date = str(first_day_of_last_year).split(' ')[0]
is_recent_row = X['ds'] >= cutoff_date
X_train = X[is_recent_row]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Load the holidays/events table and preview its first rows.
holidays_csv_path = DATA_ROOT / 'holidays_events.csv'
holidays_data = pd.read_csv(holidays_csv_path)
holidays_data.head()

In [None]:
# Holiday frame handed to Prophet: one row per distinct date of a
# non-transferred holiday/event on or after the start of the training window,
# all under a single holiday name with no window around the day.
#
# The cutoff is formatted as a bare date. `str(Timestamp)` would produce
# 'YYYY-MM-DD 00:00:00', which sorts after the bare 'YYYY-MM-DD' text of the
# same day and therefore silently excludes a holiday on the cutoff day itself.
# NOTE(review): assumes holidays_data['date'] holds 'YYYY-MM-DD' strings — confirm.
holiday_cutoff = first_day_of_last_year.strftime('%Y-%m-%d')
is_not_transferred = holidays_data['transferred'].eq(False)
is_in_window = holidays_data['date'] >= holiday_cutoff
holidays_events_data = pd.DataFrame({
    'holiday': 'holidays_events',
    'ds': pd.to_datetime(holidays_data.loc[is_not_transferred & is_in_window, 'date'].unique()),
    'lower_window': 0,
    'upper_window': 0,
})

In [None]:
# Load the test table and shape it like the training data: drop the columns
# that are not used, rename the date column to 'ds', lower-case the family labels.
test_data = (
    pd.read_csv(DATA_ROOT / 'test.csv')
    .drop(columns=['onpromotion', 'id'])
    .rename(columns={'date': 'ds'})
)
test_data['family'] = test_data['family'].str.lower()
test_data.head()

In [None]:
# Fresh submission template for the holiday-aware forecasts.
submission = pd.read_csv(DATA_ROOT.joinpath('sample_submission.csv'))

In [None]:
# Fit one holiday-aware Prophet model per (family, store) series on the
# last-year training window and write its forecast into `submission`.
# The series are enumerated from `X`, the copy of the prepared training data
# made for this section; the previously used name `train_prepared_data_copy`
# is never defined in this notebook.
number = 1  # 1-based progress counter printed before each fit
for family in X['family'].unique():
    for store_nbr in X['store_nbr'].unique():
        print(number)

        train_mask = (X_train['family'] == family) & (X_train['store_nbr'] == store_nbr)
        X_current_train = X_train.loc[train_mask].drop(columns=['store_nbr', 'family'])
        model = Prophet(holidays=holidays_events_data)
        model.fit(X_current_train)

        # Build the test mask once and reuse it for both the index and the rows.
        test_mask = (test_data['family'] == family) & (test_data['store_nbr'] == store_nbr)
        test_indexer = test_data.index[test_mask]
        X_test = test_data.loc[test_mask].drop(columns=['store_nbr', 'family'])

        # NOTE(review): assumes `submission` rows share test_data's index
        # (same row order) — confirm.
        submission.loc[test_indexer, 'sales'] = model.predict(X_test)['yhat'].to_numpy()
        number += 1

In [None]:
# Replace every negative forecast in the submission with zero.
has_negative_sales = submission['sales'] < 0
submission.loc[has_negative_sales, 'sales'] = 0

In [None]:
# Write the submission next to the input data; DATA_ROOT is reused so the
# directory is spelled in one place only (same location as the former literal path).
submission.to_csv(DATA_ROOT / 'simple_prophet_with_holidays.csv', index=False)