In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime
from math import ceil

## Load data

In [2]:
# Read the train / validation / test CSV files (paths are resolved against
# the current working directory) into one DataFrame each.
train_data, validation_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'../datasets/train_data.csv',
        r'../datasets/validation_data.csv',
        r'../datasets/test_data.csv',
    )
)

In [3]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,CategoryCode,ItemCode,DateID,DailySales
0,category_2,117610,11/6/2021,7
1,category_4,836584,11/18/2021,16
2,category_1,370195,1/24/2022,6
3,category_2,172582,10/30/2021,5
4,category_2,1006009,10/30/2021,5


In [4]:
# Preview the first five rows of the validation frame.
validation_data.head(n=5)

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales
0,category_2,1044502,w1,11
1,category_2,1105009,w1,11
2,category_2,913561,w4,5
3,category_1,1048975,w4,30
4,category_1,17287,w2,60


In [5]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,CategoryCode,ItemCode,Week,PredictedSales
0,category_1,43738,w4,
1,category_2,1006090,w1,
2,category_2,1076929,w4,
3,category_1,1081321,w3,
4,category_2,216151,w4,


## Preprocess

In [14]:
# Collect every category code seen in any of the three frames; np.unique
# returns them de-duplicated and sorted.
category_codes = np.unique([
    code
    for frame in (train_data, validation_data, test_data)
    for code in frame['CategoryCode'].values
])

# Map each category code to its position in the sorted unique array.
category_map = dict(zip(category_codes, range(len(category_codes))))

In [16]:
# Collect every item code seen in any of the three frames; np.unique
# returns them de-duplicated and sorted.
item_codes = np.unique([
    code
    for frame in (train_data, test_data, validation_data)
    for code in frame['ItemCode'].values
])

# Map each item code to its position in the sorted unique array.
item_map = dict(zip(item_codes, range(len(item_codes))))

In [17]:
def string_to_date(d):
    """Convert a ``month/day/year`` string into a ``datetime``.

    The string is split on ``'/'``; the first field is used as the month,
    the second as the day and the third as the year. The time portion of the
    result is left at its default (midnight).

    Raises ``IndexError`` when fewer than three fields are present and
    ``ValueError`` when a field is not an integer or the date is invalid.
    """
    fields = d.split('/')
    return datetime(year=int(fields[2]), month=int(fields[0]), day=int(fields[1]))

In [19]:
def get_year(date):
    """Return the ``year`` attribute of ``date`` (e.g. a ``datetime``)."""
    year = date.year
    return year

def get_month(date):
    """Return the ``month`` attribute of ``date`` (e.g. a ``datetime``)."""
    month = date.month
    return month

def get_annual_week_id(date):
    """Return the week-of-year number of ``date`` via ``pd.Period(...).week``.

    Parameters
    ----------
    date : str or datetime
        Either a date string that ``pd.Period`` can parse (this notebook
        passes ``M/D/YYYY`` strings) or a ``datetime`` / ``pd.Timestamp``.

    Returns
    -------
    int
        The ``week`` attribute of the resulting ``pd.Period``.
    """
    # pd.Period infers a frequency from a string but requires an explicit one
    # for datetime objects. Supply a daily frequency only in that case so that
    # string inputs keep their original, inferred behaviour.
    freq = 'D' if isinstance(date, datetime) else None
    return pd.Period(date, freq=freq).week

def get_monthly_week_id(date):
    """Return the 1-based week-of-month index of ``date``.

    The day of the month is shifted by the weekday of the month's first day
    (Monday == 0), so weeks are counted Monday-to-Sunday and the first,
    possibly partial, week of the month is week 1.
    """
    # Weekday offset of the 1st of the month (0 for Monday ... 6 for Sunday).
    month_start_offset = date.replace(day=1).weekday()
    return int(ceil((date.day + month_start_offset) / 7.0))

def get_category_id(id):
    """Look up the integer id stored for category code ``id`` in ``category_map``.

    Raises ``KeyError`` if the code is not a key of ``category_map``.
    """
    category_id = category_map[id]
    return category_id

def get_item_code_id(id):
    """Look up the integer id stored for item code ``id`` in ``item_map``.

    Raises ``KeyError`` if the code is not a key of ``item_map``.
    """
    item_id = item_map[id]
    return item_id

In [21]:
def week_to_weekid(week):
    """Translate a week label (``"w1"`` .. ``"w4"``) into a week-of-year number.

    Each label is tied to a fixed date string, and the result is whatever
    ``get_annual_week_id`` returns for that date. Any other label yields
    ``None``.
    """
    # NOTE(review): presumably these dates mark the start of each forecast
    # week -- confirm against the dataset description.
    week_dates = (
        ("w1", '02/14/2022'),
        ("w2", '02/21/2022'),
        ("w3", '02/28/2022'),
        ("w4", '03/07/2022'),
    )
    for label, date_string in week_dates:
        if week == label:
            return get_annual_week_id(date_string)
    return None

#### Preprocess train data

In [20]:
# NOTE: this cell rewrites columns of train_data in place, so run it once per
# freshly loaded frame; re-running it would feed already-converted values
# back through the date parser and the code-to-id lookups.

# WeekID is derived first, while DateID still holds the raw date strings.
train_data['WeekID'] = train_data['DateID'].apply(get_annual_week_id)
train_data['DateID'] = train_data['DateID'].apply(string_to_date)
train_data['Year'] = train_data['DateID'].apply(get_year)

# Replace the raw item / category codes with their ids from item_map and
# category_map.
train_data['ItemCode'] = train_data['ItemCode'].apply(get_item_code_id)
train_data['CategoryCode'] = train_data['CategoryCode'].apply(get_category_id)

#### Preprocess validation data

In [None]:
# Replace the raw item / category codes of the validation frame with their
# ids, using the same lookup helpers as the other frames.
validation_data['ItemCode'] = validation_data['ItemCode'].map(get_item_code_id)
validation_data['CategoryCode'] = validation_data['CategoryCode'].map(get_category_id)