In [1]:
import gc
import numpy as np 
import pandas as pd 
from category_encoders.ordinal import OrdinalEncoder

In [17]:


from category_encoders.ordinal import OrdinalEncoder
def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype that holds every value in [lo, hi], or None."""
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(props, verbose=True):
    """Downcast the numeric columns of ``props`` in place to smaller dtypes.

    Integer columns, and float columns whose values are all whole numbers,
    are cast to the narrowest (un)signed integer dtype that holds their
    range. Other float columns wider than 32 bits are cast to ``float32``
    (which is lossy for values that need more precision). Columns that are
    neither NumPy integer nor NumPy float dtype are left untouched.

    WARNING: missing values in a float column are replaced by the sentinel
    ``column_min - 1`` -- also when the column stays floating point. The
    affected columns are reported in the returned list so the caller can
    undo or flag the sentinel. A column that is entirely NaN is not filled.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the per-column and overall memory report.

    Returns
    -------
    props : pandas.DataFrame
        The same object that was passed in.
    NAlist : list
        Names of the columns whose missing values were filled with the sentinel.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    if verbose:
        print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.

    for col in props.columns:
        values = props[col]
        dtype = values.dtype

        # pandas extension dtypes (category, tz-aware datetimes, nullable ints,
        # ...) are not NumPy dtypes and would make np.issubdtype raise.
        if not isinstance(dtype, np.dtype):
            continue
        is_float = np.issubdtype(dtype, np.floating)
        if not (is_float or np.issubdtype(dtype, np.integer)):
            continue

        if verbose:
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",dtype)

        n_rows = len(values)
        n_missing = int(values.isna().sum()) if is_float else 0
        target = None
        filled = False

        if n_missing == n_rows:
            # Empty or all-NaN column: there is no value range to inspect.
            if is_float and dtype.itemsize > 4:
                target = np.float32
        else:
            lo, hi = values.min(), values.max()

            # Integer dtypes cannot represent NaN, so missing values get a sentinel.
            if n_missing:
                # The sentinel becomes the new minimum; tracking it here keeps
                # the signed/unsigned decision below from using a stale bound.
                lo = lo - 1
                values = values.fillna(lo)
                NAlist.append(col)
                filled = True

            # A float column is integral only if EVERY value is a whole number.
            # (Summing signed differences lets +0.5 and -0.5 cancel out.)
            # Infinite values yield NaN under `% 1` and therefore fail the test.
            if not is_float or bool((values % 1 == 0).all()):
                target = _smallest_int_dtype(int(lo), int(hi))
            elif dtype.itemsize > 4:
                # Only ever shrink floats; float16/float32 stay as they are.
                target = np.float32

        if target is not None:
            props[col] = values.astype(target)
        elif filled:
            props[col] = values

        if verbose:
            print("dtype after: ",props[col].dtype)
            print("******************************")

    mem_usg = props.memory_usage().sum() / 1024**2
    if verbose:
        print("___MEMORY USAGE AFTER COMPLETION:___")
        print("Memory usage is: ",mem_usg," MB")
        print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

***
## data loading

In [3]:
# Load the wide-format sales table: one row per series, with the daily
# quantities spread across the `d_N` columns.
sales_train = pd.read_csv("../input/sales_train_validation.csv")
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


In [4]:
# Load the calendar and parse `date` as datetime; its `d` column is the key
# used later to attach dates to the melted sales rows.
calendar = pd.read_csv("../input/calendar.csv", parse_dates=["date"])
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1969 non-null   datetime64[ns]
 1   wm_yr_wk      1969 non-null   int64         
 2   weekday       1969 non-null   object        
 3   wday          1969 non-null   int64         
 4   month         1969 non-null   int64         
 5   year          1969 non-null   int64         
 6   d             1969 non-null   object        
 7   event_name_1  162 non-null    object        
 8   event_type_1  162 non-null    object        
 9   event_name_2  5 non-null      object        
 10  event_type_2  5 non-null      object        
 11  snap_CA       1969 non-null   int64         
 12  snap_TX       1969 non-null   int64         
 13  snap_WI       1969 non-null   int64         
dtypes: datetime64[ns](1), int64(7), object(6)
memory usage: 215.5+ KB


In [5]:
# Load the price table, keyed by store, item and week id (`wm_yr_wk`).
sell_prices = pd.read_csv("../input/sell_prices.csv")
sell_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   store_id    object 
 1   item_id     object 
 2   wm_yr_wk    int64  
 3   sell_price  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 208.8+ MB


In [6]:
# Count the priced weeks available for every (store, item) pair. Pairs with
# fewer weeks than the maximum have no price for part of the period.
price_weeks = sell_prices.groupby(["store_id", "item_id"])["wm_yr_wk"].count()
price_weeks.rename("n_weeks").describe()

count    30490.000000
mean       224.372614
std         68.000808
min         19.000000
25%        173.000000
50%        260.000000
75%        282.000000
max        282.000000
Name: n_weeks, dtype: float64

***
## data wrangling

In [7]:
# Distinct combinations of the five hierarchy levels found in the sales table.
hierarchy_columns = ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
hierarchy = sales_train.loc[:, hierarchy_columns].drop_duplicates()

# Fit one ordinal encoder per hierarchy level, each on its own column.
hierarchy_encoders = {}
for level in hierarchy_columns:
    hierarchy_encoders[level] = OrdinalEncoder()
    hierarchy_encoders[level].fit(hierarchy.loc[:, [level]])

# Expose the encoders under the names the following cells rely on.
item_encoder = hierarchy_encoders["item_id"]
dept_encoder = hierarchy_encoders["dept_id"]
cat_encoder = hierarchy_encoders["cat_id"]
store_encoder = hierarchy_encoders["store_id"]
state_encoder = hierarchy_encoders["state_id"]

In [8]:
# Swap every hierarchy label column for its integer code.
# The column is REPLACED (`frame[col] = ...`) rather than written through
# `.loc[:, col] = ...`: since pandas 2.0 the `.loc` form sets values in place
# and keeps the original object dtype, so the codes would never become a
# numeric column.
# NOTE(review): re-running this cell feeds already-encoded values back into the
# encoders; presumably they are then treated as unknown categories -- confirm,
# and reload `sales_train` before re-running.
for column, encoder in [
    ("item_id", item_encoder),
    ("dept_id", dept_encoder),
    ("cat_id", cat_encoder),
    ("store_id", store_encoder),
    ("state_id", state_encoder),
]:
    sales_train[column] = encoder.transform(sales_train[[column]])[column]

In [9]:
# Encode the price table's keys with the same fitted encoders as the sales
# table so the two frames can be joined on the integer codes.
# The column is REPLACED rather than written through `.loc[:, col] = ...`,
# which on pandas >= 2.0 sets values in place and keeps the object dtype.
for column, encoder in [("store_id", store_encoder), ("item_id", item_encoder)]:
    sell_prices[column] = encoder.transform(sell_prices[[column]])[column]

In [10]:
# Reshape from wide (one column per day) to long (one row per series and day).
# The day columns are discovered from the frame instead of hard-coding
# d_1..d_1913, so the cell keeps working if the input covers more or fewer days.
day_columns = [column for column in sales_train.columns if column.startswith("d_")]

data = pd.melt(sales_train,
               id_vars=["item_id","dept_id","cat_id","store_id","state_id"],
               value_vars=day_columns,
               var_name="d",
               value_name="q")

In [11]:
# Attach the date, week id and SNAP flags to every row through the day key `d`.
calendar_columns = ["date", "wm_yr_wk", "d", "snap_CA", "snap_TX", "snap_WI"]
data = data.merge(calendar.loc[:, calendar_columns], on="d", how="left")

In [12]:
# Attach the weekly price to each row; a left join keeps rows that have no
# matching price, leaving `sell_price` missing there.
price_keys = ["store_id", "item_id", "wm_yr_wk"]
data = data.merge(sell_prices, how="left", on=price_keys)

In [16]:
# Inspect the merged `date` column (dtype and first/last values).
data.date

0          2011-01-29
1          2011-01-29
2          2011-01-29
3          2011-01-29
4          2011-01-29
              ...    
58327365   2016-04-24
58327366   2016-04-24
58327367   2016-04-24
58327368   2016-04-24
58327369   2016-04-24
Name: date, Length: 58327370, dtype: datetime64[ns]

In [18]:
# Downcast the numeric columns and keep the list of columns whose missing values
# were replaced by a sentinel. The list is bound to a real name instead of `_`,
# because assigning to `_` shadows IPython's "last output" variable.
data, na_filled_columns = reduce_mem_usage(data)
gc.collect()

Memory usage of properties dataframe is : 4005.022659301758  MB
******************************
Column:  item_id
dtype before:  uint16
dtype after:  uint16
******************************
******************************
Column:  dept_id
dtype before:  uint8
dtype after:  uint8
******************************
******************************
Column:  cat_id
dtype before:  uint8
dtype after:  uint8
******************************
******************************
Column:  store_id
dtype before:  uint8
dtype after:  uint8
******************************
******************************
Column:  state_id
dtype before:  uint8
dtype after:  uint8
******************************
******************************
Column:  q
dtype before:  uint16
dtype after:  uint16
******************************
******************************
Column:  wm_yr_wk
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  snap_CA
dtype before:  int64
dtype after:  uint8
******

48

In [19]:
# Check the resulting dtypes and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58327370 entries, 0 to 58327369
Data columns (total 13 columns):
 #   Column      Dtype         
---  ------      -----         
 0   item_id     uint16        
 1   dept_id     uint8         
 2   cat_id      uint8         
 3   store_id    uint8         
 4   state_id    uint8         
 5   d           object        
 6   q           uint16        
 7   date        datetime64[ns]
 8   wm_yr_wk    uint16        
 9   snap_CA     uint8         
 10  snap_TX     uint8         
 11  snap_WI     uint8         
 12  sell_price  float32       
dtypes: datetime64[ns](1), float32(1), object(1), uint16(3), uint8(7)
memory usage: 2.2+ GB


***

In [20]:
# Write the training frame to parquet without the join keys `d` and `wm_yr_wk`.
# Kept as a single chain so the trimmed copy is never bound to a name.
(data
 .drop(["d","wm_yr_wk"], axis=1)
 .to_parquet("../input/train_dataframe.parquet", index=False)
)

***

In [22]:
# Load the sample submission file.
submission = pd.read_csv("../input/sample_submission.csv")

In [23]:
# Display the submission layout: an `id` column followed by F1..F28.
submission

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60976,FOODS_3_824_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60977,FOODS_3_825_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60978,FOODS_3_826_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# NOTE(review): square root of an unexplained constant -- presumably turning a
# squared-error score into its root; confirm and record where 6.13218 comes from.
(6.13218)**0.5

2.4763238883474026

In [25]:
# NOTE(review): square root of an unexplained constant -- presumably turning a
# squared-error score into its root; confirm and record where 6.03533 comes from.
(6.03533)**0.5

2.4566908637433404