# Orders Forecasting Challenge

## References

## Import Python Libraries

In [541]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.stats import boxcox, boxcox_normplot
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, RFE, SelectFromModel
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error, mean_absolute_error, median_absolute_error
import requests
import datetime as dt

## Import Notebook Functions

In [4]:
# get regression metric functions from stored python file
# NOTE(review): this downloads a python file from a remote URL and imports it,
# i.e. it executes whatever the URL currently serves - confirm the source is
# trusted (consider pinning to a commit hash instead of the 'main' branch)
filepath = "https://raw.githubusercontent.com/notfakearcher/julian/main/01_practice/machine_learning/supervised_learning/regression/regression_metrics.py"

# timeout so a stalled connection cannot hang the notebook indefinitely
request = requests.get(filepath, timeout = 30)

# stop here on an HTTP error (e.g. 404) instead of saving the error page as
# regression_metrics.py and failing later with a confusing import error
request.raise_for_status()

# store the downloaded module next to the notebook so it can be imported
with open("regression_metrics.py", "wb") as f:
  f.write(request.content)
from regression_metrics import jra_regression_metrics

## Global Variables

In [398]:
# seed value kept in one place so runs are repeatable
random_state = 4781

# named colors c1, c2, c3
c1, c2, c3 = 'grey', 'red', 'blue'

# absolute path of the project root on the local machine
root_folder = 'c:/Users/80148956/Desktop/Upskill/github_repositories/julian/04_projects/orders_forecasting/'

# data folder sits directly under the project root
data_folder = f'{root_folder}data/'

## Import Dataset

In [399]:
# Source: 
# 1. ....

# Dataset Column Overview: 
# ------------------------------------------------------------------------------

# X01: 
# X02: 
# X03: 
# X04: 
# X05: 
# X06: 
# X07: 
# X08: 
# X09: 
# X10: 
# X11: 
# X12: 
# X13: 
# X14: 
# X15: 
# X16: 
# X17: 
# X18: 
# X19: 
# X20: 
# X21: 
# X22: 
# X23: 
# X24: 
# X25: 
#   y: 


In [400]:
# read train.csv from the data folder; the first row holds the column names
filepath1 = f'{data_folder}train.csv'
df_train = pd.read_csv(filepath_or_buffer = filepath1, header = 0)

# preview 5 random rows
df_train.sample(5)

Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2,id
3600,Prague_3,2020-12-27,3874.0,,0,0,0,0,0,0,0,0.0,0,,,987.0,17111.0,Prague_3_2020-12-27
284,Prague_1,2021-09-16,7118.0,,0,0,0,0,0,0,0,0.0,0,12.5,0.0,1650.0,31722.0,Prague_1_2021-09-16
4879,Munich_1,2021-12-20,2485.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,862.0,13642.0,Munich_1_2021-12-20
3347,Prague_2,2023-07-27,4671.0,,0,0,0,0,0,0,0,0.0,0,1.1,0.0,1270.0,21259.0,Prague_2_2023-07-27
3343,Prague_2,2023-07-23,4921.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1268.0,21226.0,Prague_2_2023-07-23


In [401]:
# read train_calendar.csv from the data folder; the first row holds the column names
filepath1 = f'{data_folder}train_calendar.csv'

# append an id column built as '<warehouse>_<date>'
df_train_calendar = pd.read_csv(filepath_or_buffer = filepath1, header = 0).assign(
  id = lambda frame: frame['warehouse'] + "_" + frame['date']
)

# preview 5 random rows
df_train_calendar.sample(5)

Unnamed: 0,date,holiday_name,holiday,shutdown,mini_shutdown,warehouse_limited,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,warehouse,id
6238,2023-06-24,,0,0,0,0,0,0,0,0,1.0,0,0.5,0.0,Prague_3,Prague_3_2023-06-24
12962,2023-09-17,,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,Budapest_1,Budapest_1_2023-09-17
3883,2020-04-06,,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,Prague_2,Prague_2_2020-04-06
6532,2019-07-06,Jan Hus,1,0,0,0,0,0,0,0,0.0,0,,,Prague_3,Prague_3_2019-07-06
3990,2023-08-09,,0,0,0,0,0,0,0,0,0.0,0,11.5,0.0,Prague_2,Prague_2_2023-08-09


In [427]:
# left join on id: every row of train is kept and the calendar columns of the
# matching id are attached (columns present in both inputs come back twice,
# suffixed '_x' for train and '_y' for calendar)
df_all = df_train.merge(df_train_calendar, how = 'left', on = 'id')

# sort columns alphabetically
ordered_cols = np.sort(df_all.columns)
df_all = df_all[ordered_cols]

# missing values in object (text) columns become ''
object_cols = df_all.select_dtypes(include = 'object').columns.copy()
df_all[object_cols] = df_all[object_cols].fillna('')

# missing values in numeric columns become 0
number_cols = df_all.select_dtypes(include = 'number').columns.copy()
df_all[number_cols] = df_all[number_cols].fillna(0)

# preview 5 random rows
df_all.sample(5)

Unnamed: 0,blackout_x,blackout_y,date_x,date_y,frankfurt_shutdown_x,frankfurt_shutdown_y,holiday_name_x,holiday_name_y,holiday_x,holiday_y,...,shutdown_y,snow_x,snow_y,user_activity_1,user_activity_2,warehouse_limited,warehouse_x,warehouse_y,winter_school_holidays_x,winter_school_holidays_y
4699,0,0,2024-01-03,2024-01-03,0,0,,,0,0,...,0,0.0,0.0,946.0,23140.0,0,Prague_3,Prague_3,0,0
3788,0,0,2021-07-03,2021-07-03,0,0,,,0,0,...,0,0.0,0.0,929.0,16684.0,0,Prague_3,Prague_3,0,0
5414,0,0,2023-09-25,2023-09-25,0,0,,,0,0,...,0,0.0,0.0,584.0,17714.0,0,Munich_1,Munich_1,0,0
1587,0,0,2022-01-05,2022-01-05,0,0,,,0,0,...,0,0.0,0.0,2354.0,32162.0,0,Brno_1,Brno_1,0,0
6626,0,0,2022-03-06,2022-03-06,0,0,,,0,0,...,0,0.0,0.0,2989.0,23430.0,0,Budapest_1,Budapest_1,0,0


In [540]:
# Collapse the duplicated columns left by the merge: each column shared by both
# inputs exists twice, as '<name>_x' and '<name>_y'.
# Only names that END in '_x' are selected, so a column that merely contains
# the letter 'x' is never picked up, and the base name is derived by cutting
# the suffix rather than by replacing every 'x' / '_x' inside the name.
x_suffix = '_x'
y_suffix = '_y'
x_cols = [col for col in df_all.columns if col.endswith(x_suffix)]

for x_col in x_cols:
  # base name without the merge suffix, and the matching y_col
  new_col = x_col[:-len(x_suffix)]
  y_col = new_col + y_suffix

  # check where x_col and y_col do not match in dataframe
  cond = df_all[x_col] != df_all[y_col]
  
  # replace where does not match with the row-wise max of the two columns
  df_all.loc[cond, x_col] = np.max(df_all[[x_col, y_col]], axis = 1)
  
  # add the reconciled values under the un-suffixed name
  df_all[new_col] = df_all[x_col]
  
  # drop existing x_col and y_col
  df_all = df_all.drop([x_col, y_col], errors = 'ignore', axis = 1)

# order columns
ordered_cols = np.sort(df_all.columns)
df_all = df_all[ordered_cols]

# preview 5 random rows
df_all.sample(5)

Unnamed: 0,blackout,date,frankfurt_shutdown,holiday,holiday_name,id,mini_shutdown,mov_change,orders,precipitation,school_holidays,shops_closed,shutdown,snow,user_activity_1,user_activity_2,warehouse,warehouse_limited,winter_school_holidays
3622,0,2021-01-18,0,0,,Prague_3_2021-01-18,0,0.0,4635.0,0.0,0,0,0,0.0,986.0,17659.0,Prague_3,0,0
3445,0,2023-11-02,0,0,,Prague_2_2023-11-02,0,0.0,5353.0,4.8,0,0,0,0.0,1289.0,23971.0,Prague_2,0,0
4016,0,2022-02-17,0,0,,Prague_3_2022-02-17,0,0.0,5110.0,1.75,0,0,0,0.0,969.0,21372.0,Prague_3,0,0
112,0,2021-03-28,0,0,,Prague_1_2021-03-28,0,0.0,7692.0,1.0,0,0,0,0.0,1734.0,35148.0,Prague_1,0,0
3438,0,2023-10-26,0,0,,Prague_2_2023-10-26,0,0.0,5565.0,0.3,0,0,0,0.0,1299.0,23808.0,Prague_2,0,0


## Feature Engineering

In [544]:
# cast the date column from text to datetime64[ns]; other columns are unchanged
df_all = df_all.astype({'date': 'datetime64[ns]'})

# print column dtypes and non-null counts
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7340 entries, 0 to 7339
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   blackout                7340 non-null   int64         
 1   date                    7340 non-null   datetime64[ns]
 2   frankfurt_shutdown      7340 non-null   int64         
 3   holiday                 7340 non-null   int64         
 4   holiday_name            7340 non-null   object        
 5   id                      7340 non-null   object        
 6   mini_shutdown           7340 non-null   int64         
 7   mov_change              7340 non-null   float64       
 8   orders                  7340 non-null   float64       
 9   precipitation           7340 non-null   float64       
 10  school_holidays         7340 non-null   int64         
 11  shops_closed            7340 non-null   int64         
 12  shutdown                7340 non-null   int64   

In [568]:
# calendar features derived from the date column, keyed by new column name
date_parts = df_all['date'].dt
date_features = {
  'day': date_parts.day,
  'month': date_parts.month,
  'year': date_parts.year,
  # pandas numbers weekdays from Monday = 0; shift by one so they run 1 to 7
  'weekday': date_parts.weekday + 1,
  'day_of_year': date_parts.dayofyear,
  'quarter': date_parts.quarter,
  # boolean leap-year flag stored as an integer (0 / 1)
  'is_leap_year': date_parts.is_leap_year.astype('int'),
}
df_all = df_all.assign(**date_features)

# sort columns alphabetically
ordered_cols = np.sort(df_all.columns)
df_all = df_all[ordered_cols]

# preview 5 random rows
df_all.sample(5)

Unnamed: 0,blackout,date,day,day_of_year,frankfurt_shutdown,holiday,holiday_name,id,is_leap_year,mini_shutdown,...,shops_closed,shutdown,snow,user_activity_1,user_activity_2,warehouse,warehouse_limited,weekday,winter_school_holidays,year
19,0,2020-12-24,24,359,0,1,Christmas Eve,Prague_1_2020-12-24,1,0,...,0,0,0.0,1652.0,32332.0,Prague_1,0,4,0,2020
6904,0,2022-12-19,19,353,0,0,,Budapest_1_2022-12-19,0,0,...,0,0,0.0,3037.0,23794.0,Budapest_1,0,1,0,2022
3752,0,2021-05-28,28,148,0,0,,Prague_3_2021-05-28,0,0,...,0,0,0.0,944.0,17438.0,Prague_3,0,5,0,2021
6618,0,2022-02-26,26,57,0,0,,Budapest_1_2022-02-26,0,0,...,0,0,0.0,2991.0,23525.0,Budapest_1,0,6,0,2022
2362,0,2024-02-21,21,52,0,0,,Brno_1_2024-02-21,1,0,...,0,0,0.0,2361.0,38743.0,Brno_1,0,3,0,2024


## Outputs

In [569]:
# write df_all to all_train.csv in the data folder, with a header row and
# without the index column
filepath1 = f'{data_folder}all_train.csv'
df_all.to_csv(filepath1, index = False, header = True)