## Setting up the Development Environment

In [None]:
# setup development environment 
from google.colab import drive
import warnings
import logging
# Mount Google Drive under /content/drive, forcing a fresh mount each time.
drive.mount('/content/drive', force_remount=True)

# Hide every Python warning from here on.
warnings.filterwarnings("ignore")

# Silence the 'cmdstanpy' logger: only CRITICAL records pass its level check,
# nothing is forwarded to ancestor loggers, and its own handler discards
# whatever is left.
logger = logging.getLogger('cmdstanpy')
logger.setLevel(logging.CRITICAL)
logger.propagate = False
logger.addHandler(logging.NullHandler())

# import libraries in Python
import pandas as pd
import numpy as np
from downcast import reduce
from tqdm import tqdm
import random

# Root folder that holds the `raw_data/` and `processed_data/` sub-folders.
# NOTE: the path is relative, so it is resolved against the current working
# directory of the running process.
WORK_DIR = 'drive/MyDrive/TaoBin'

## Data pre-processing

In [None]:
class Preprocess:
  """
  Data preprocessing steps that turn the raw sales, price and calendar tables
  into one long-format feature table for machine learning.
  """

  # Seeds Python's `random` module once, when the class body is executed.
  random.seed(23)

  def read_data(self):
    """Read the three raw CSV files under WORK_DIR/raw_data and downcast them.

    The frames are also stored on the instance as `sales`, `sell_prices`
    and `calendar`.

    Returns:
      Tuple `(sales, sell_prices, calendar)` of downcast DataFrames.
    """
    self.sales = pd.read_csv(f'{WORK_DIR}/raw_data/sales_train_evaluation.csv')
    self.sell_prices = pd.read_csv(f'{WORK_DIR}/raw_data/sell_prices.csv')
    self.calendar = pd.read_csv(f'{WORK_DIR}/raw_data/calendar.csv')

    self.sales, self.sell_prices, self.calendar = self.downcast(self.sales, self.sell_prices, self.calendar)

    return self.sales, self.sell_prices, self.calendar

  def downcast(self, sales, sell_prices, calendar):
    """Reduce the memory footprint of the three frames with `downcast.reduce`.

    Returns:
      Tuple `(sales, sell_prices, calendar)` holding the results of `reduce`.
    """
    sales = reduce(sales)
    sell_prices = reduce(sell_prices)
    calendar = reduce(calendar)

    return sales, sell_prices, calendar

  def lag_features(self, data, day_lags=(28, 35, 42, 49, 56, 63, 70)):
    """Add lag features, i.e. the `sales` value of the same `id` at prior rows.

    For every lag a column `lag_<lag>` (float16) is added to `data` in place.
    The shift is positional within each `id` group, so the rows of an `id`
    are expected to be in chronological order with one row per day; rows
    without enough history get NaN.

    Args:
      data: DataFrame with at least the columns `id` and `sales`.
      day_lags: Iterable of row offsets to shift by.

    Returns:
      The same `data` object with the lag columns added.
    """
    day_lags = list(day_lags)
    print(day_lags)

    for lag in tqdm(day_lags):
      data["lag_" + str(lag)] = data.groupby("id")["sales"].shift(lag).astype(np.float16)

    return data

  @staticmethod
  def _fill_missing_prices(data):
    "Fill missing `sell_price` values with the mean `sell_price` of the same `id`."
    # The mean skips missing values; an id without any price stays NaN.
    id_mean = data.groupby('id')['sell_price'].transform('mean')
    # Assign the result back explicitly: an in-place fillna on the selected
    # column may act on a temporary object and leave the frame unchanged.
    data['sell_price'] = data['sell_price'].fillna(id_mean)
    return data

  @staticmethod
  def _encode_categoricals(data):
    "Replace every categorical column by its integer category codes (missing -> -1)."
    for col in data.select_dtypes(include='category').columns:
      data[col] = data[col].cat.codes
    return data

  def feature_engineering(self, sales, sell_prices, calendar, start_day=1942, horizon=28):
    """Build the long-format feature table used for supervised learning.

    Steps: append zero-filled `d_<n>` columns for the days to predict, melt
    the wide sales table to one row per (id, day), merge calendar and price
    information, impute missing prices, turn `day` into an integer, encode
    categorical columns and add lag features.

    Args:
      sales: Wide sales table with the id columns and one `d_<n>` column per
        day. It is not modified.
      sell_prices: Price table keyed by `store_id`, `item_id`, `wm_yr_wk`.
      calendar: Calendar table whose `d` column matches the `d_<n>` names.
      start_day: Number of the first day to append.
      horizon: How many consecutive days to append.

    Returns:
      The feature table, which is also stored as `self.data`.
    """
    # Add all placeholder columns in one step. assign() returns a new frame,
    # so the caller's `sales` stays untouched and the frame is not fragmented
    # by repeated single-column inserts.
    future_days = {'d_' + str(day): np.int32(0) for day in range(start_day, start_day + horizon)}
    sales = sales.assign(**future_days)

    # define all categorical columns.
    cat_cols = ["id", "item_id", "dept_id", "store_id", "cat_id", "state_id"]
    self.data = pd.melt(sales, id_vars=cat_cols, var_name='day', value_name='sales').dropna()

    # merge DataFrame
    self.data = self.data.merge(calendar, left_on='day', right_on='d')
    self.data = self.data.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    # rows without a price get the mean price of their id.
    self.data = self._fill_missing_prices(self.data)

    # strip the 'd_' from the day column to make it an integer feature.
    self.data['day'] = self.data['day'].apply(lambda x: x.split('_')[1]).astype(np.int16)

    # drop columns that are not used as features.
    self.data = self.data.drop(columns=['d', 'weekday', 'date'])

    # categorical data encoding
    self.data = self._encode_categoricals(self.data)

    # add lag features
    self.data = self.lag_features(self.data)

    return self.data

In [None]:
if __name__ == "__main__":

    pre = Preprocess()

    # One-off step, kept for reference: read the raw CSV files, downcast them
    # and store the result as pickles (presumably so later runs can load the
    # smaller frames directly).
    # sales, sell_prices, calendar = pre.read_data()
    # sales.to_pickle(f'{WORK_DIR}/processed_data/downcast_sales.pkl')
    # sell_prices.to_pickle(f'{WORK_DIR}/processed_data/downcast_sell_prices.pkl')
    # calendar.to_pickle(f'{WORK_DIR}/processed_data/downcast_calendar.pkl')

    # read downcast data
    sales = pd.read_pickle(f'{WORK_DIR}/processed_data/downcast_sales.pkl')
    sell_prices = pd.read_pickle(f'{WORK_DIR}/processed_data/downcast_sell_prices.pkl')
    calendar = pd.read_pickle(f'{WORK_DIR}/processed_data/downcast_calendar.pkl')

    # feature engineering
    data = pre.feature_engineering(sales, sell_prices, calendar)

    # Keep only rows after day 1000 and drop every row that still contains a
    # NaN. dropna() returns a new frame, which avoids an in-place modification
    # of the filtered selection.
    data = data[data['day'] > 1000].dropna()

    # save processed data
    data.to_pickle(f'{WORK_DIR}/processed_data/processed_data.pkl')
