In [2]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import dotenv
import os
import sys
import numpy as np
from tqdm import tqdm

In [3]:
# add root project directory
sys.path.append("../")
# get environment path file
dotenv_path = dotenv.find_dotenv()
# load environment variables
dotenv.load_dotenv(dotenv_path)

CALENDAR_FILE_PATH = os.environ.get("CALENDAR_FILE_PATH")
SALES_TRAIN_EVALUATION_FILE_PATH = os.environ.get("SALES_TRAIN_EVALUATION_FILE_PATH")
SALES_TRAIN_VALIDATION_FILE_PATH = os.environ.get("SALES_TRAIN_VALIDATION_FILE_PATH")
SAMPLE_SUBMISSION_FILE_PATH = os.environ.get("SAMPLE_SUBMISSION_FILE_PATH")
SELL_PRICES_FILE_PATH = os.environ.get("SELL_PRICES_FILE_PATH")

In [4]:
# load dataset
cal_df = pd.read_csv(CALENDAR_FILE_PATH)
sell_price_df = pd.read_csv(SELL_PRICES_FILE_PATH)
df = pd.read_csv(SALES_TRAIN_VALIDATION_FILE_PATH)

# Change Data Type
Change the data type that requires lower memory usage. float64 and int64 is a lot of memory.

In [5]:
display(cal_df.info())
display(sell_price_df.info())
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1969 non-null   object
 1   wm_yr_wk      1969 non-null   int64 
 2   weekday       1969 non-null   object
 3   wday          1969 non-null   int64 
 4   month         1969 non-null   int64 
 5   year          1969 non-null   int64 
 6   d             1969 non-null   object
 7   event_name_1  162 non-null    object
 8   event_type_1  162 non-null    object
 9   event_name_2  5 non-null      object
 10  event_type_2  5 non-null      object
 11  snap_CA       1969 non-null   int64 
 12  snap_TX       1969 non-null   int64 
 13  snap_WI       1969 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 215.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   store_id    object 
 1   item_id     object 
 2   wm_yr_wk    int64  
 3   sell_price  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 208.8+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


None

In [6]:
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [7]:
display(cal_df.select_dtypes(include=[int, float]).max(axis=0))
print("Maximum Value Int=", np.iinfo(np.int16).max)
print("Minimum Value Int=", np.iinfo(np.int16).min)
print("Maximum Value Float=", np.finfo(np.float16).max)
print("Minimum Value Float=", np.finfo(np.float16).min)

wm_yr_wk    11621
wday            7
month          12
year         2016
snap_CA         1
snap_TX         1
snap_WI         1
dtype: int64

Maximum Value Int= 32767
Minimum Value Int= -32768
Maximum Value Float= 65500.0
Minimum Value Float= -65500.0


In [8]:
display(sell_price_df.select_dtypes(include=[int, float]).max(axis=0))
print("Maximum Value Int=", np.iinfo(np.int16).max)
print("Minimum Value Int=", np.iinfo(np.int16).min)
print("Maximum Value Float=", np.finfo(np.float16).max)
print("Minimum Value Float=", np.finfo(np.float16).min)

wm_yr_wk      11621.00
sell_price      107.32
dtype: float64

Maximum Value Int= 32767
Minimum Value Int= -32768
Maximum Value Float= 65500.0
Minimum Value Float= -65500.0


In [9]:
display(df.select_dtypes(include=[int, float]).max(axis=0))
print("Maximum Value Int=", np.iinfo(np.int16).max)
print("Minimum Value Int=", np.iinfo(np.int16).min)
print("Maximum Value Float=", np.finfo(np.float16).max)
print("Minimum Value Float=", np.finfo(np.float16).min)

d_1       360
d_2       436
d_3       207
d_4       323
d_5       296
         ... 
d_1909     88
d_1910     77
d_1911    141
d_1912    171
d_1913    130
Length: 1913, dtype: int64

Maximum Value Int= 32767
Minimum Value Int= -32768
Maximum Value Float= 65500.0
Minimum Value Float= -65500.0


In [10]:
# add more days for the predictions
for d in range(1914, 1942):
    df['d_'+str(d)] = np.nan

In [11]:
# filter to fewer recent dates 
temp = df.drop(columns=[c for c in df.columns if c.find('d_')==0 and int(c.split('_')[1]) < 1100])

In [12]:
temp_df = temp.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    value_vars=[c for c in temp.columns if c.find('d_')==0],
    var_name = 'day', value_name = 'sales'
)
del temp

In [18]:
df_merged = temp_df.merge(
    cal_df.drop(columns=['date', 'wm_yr_wk', 'weekday']), 
    left_on='day', 
    right_on='d')\
    .drop(columns=['d'])