In [None]:
import numpy as np   
import pandas as pd  
import os 
import re
import warnings
import datetime as dt

In [None]:
# Load the three raw tables; paths are relative to the current working directory.
#   sales       - wide table: id columns plus one `d_<n>` column per day
#   calendar    - combined calendar table; its first CSV column becomes the index
#   sell_prices - one row per store / item / `wm_yr_wk` with a `sell_price`
sales = pd.read_csv('walmart_sales_data/sales_train_validation.csv')
calendar = pd.read_csv('walmart_sales_data/calenda_data_combined.csv',index_col=[0])
sell_prices = pd.read_csv('walmart_sales_data/sell_prices.csv')

In [None]:
# Report the deep (object-contents-included) memory footprint of each table in MiB.
for _label, _frame in (('sales', sales), ('calendar', calendar), ('sell_prices', sell_prices)):
    _size_mb = _frame.memory_usage(deep=True).sum()/(1024 * 1024)
    print(f'Size of {_label} is {_size_mb} MB')

Size of sales is 456.7783622741699 MB
Size of calendar is 1.3249435424804688 MB
Size of sell_prices is 957.5197134017944 MB


In [None]:
#Downcast in order to save memory
def _narrowest_dtype(series, candidates, limits_of, fallback):
    """Return the first dtype in ``candidates`` whose range holds ``series``.

    ``limits_of`` is ``np.iinfo`` or ``np.finfo``; ``fallback`` is returned when
    no candidate fits (including when min/max are NaN, e.g. an all-NaN column).
    """
    lo, hi = series.min(), series.max()  # computed once, reused for every candidate
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the type's min/max still fits.
        if lo >= limits.min and hi <= limits.max:
            return candidate
    return fallback


def downcast_dataframes(df, allow_float16=True):
    """Narrow the dtypes of ``df``'s columns in place to reduce memory use.

    Each column is handled according to the dtype it has on entry:

    * integer columns -> the smallest of int8 / int16 / int32 whose range holds
      the column's min and max, otherwise int64;
    * float columns -> float16 (only when ``allow_float16`` is true) or float32
      when the column's min and max lie within that type's finite range,
      otherwise float64;
    * object / pandas-string columns -> ``datetime64`` for a column named
      ``'date'`` (parsed with format ``%Y-%m-%d``), ``category`` otherwise;
    * every other dtype is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default True
        Whether float columns may be narrowed to float16. Only the value
        *range* is checked, not precision: float16 keeps roughly three
        significant decimal digits, and float16 aggregates (sum/mean) can
        overflow. Pass ``False`` to use float32 as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` if a ``'date'`` column holds values
        that do not match ``%Y-%m-%d``.
    """
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    # df.dtypes is a snapshot, so reassigning columns inside the loop is safe.
    for col, dtype in df.dtypes.items():
        type_name = str(dtype)
        if 'int' in type_name:
            target = _narrowest_dtype(df[col], int_candidates, np.iinfo, np.int64)
            df[col] = df[col].astype(target)
        elif 'float' in type_name:
            target = _narrowest_dtype(df[col], float_candidates, np.finfo, np.float64)
            df[col] = df[col].astype(target)
        # The builtin `object` replaces the `np.object` alias, which was
        # deprecated in NumPy 1.20 and later removed. Pandas' dedicated string
        # dtype is accepted too so text columns are handled either way.
        elif dtype == object or isinstance(dtype, pd.StringDtype):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

In [None]:
def basic_eda(df):
    """Print a plain-text exploratory summary of ``df`` to stdout.

    Sections, in order: first five rows, ``df.info()``, ``df.describe()``,
    column index, dtypes, per-column null counts (shown under both the
    "Missing Values" and "NULL values" headers), and the frame's shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
    """
    print("-------------------------------TOP 5 RECORDS-----------------------------")
    print(df.head(5))
    print()

    print("-------------------------------INFO--------------------------------------")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()
    print()

    print("-------------------------------Describe----------------------------------")
    print(df.describe())
    print()

    print("-------------------------------Columns-----------------------------------")
    print(df.columns)
    print()

    print("-------------------------------Data Types--------------------------------")
    print(df.dtypes)
    print()

    # isnull() and isna() are aliases; scan the frame once and reuse the counts
    # for both sections instead of making two full passes over the data.
    null_counts = df.isna().sum()

    print("----------------------------Missing Values-------------------------------")
    print(null_counts)
    print()

    print("----------------------------NULL values----------------------------------")
    print(null_counts)
    print()

    print("--------------------------Shape Of Data---------------------------------")
    print(df.shape)
    print()

    print("============================================================================ \n")

In [None]:
# Narrow the dtypes of all three tables, in the order sales, sell_prices, calendar.
# The helper returns the frame it was given, so each name is rebound to its own table.
sales, sell_prices, calendar = map(downcast_dataframes, (sales, sell_prices, calendar))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif t == np.object:


In [None]:
# Report the deep memory footprint of each table in MiB at this point in the notebook.
for _label, _frame in (('sales', sales), ('calendar', calendar), ('sell_prices', sell_prices)):
    _size_mb = _frame.memory_usage(deep=True).sum()/(1024 * 1024)
    print(f'Size of {_label} is {_size_mb} MB')

Size of sales is 97.85959911346436 MB
Size of calendar is 0.3480653762817383 MB
Size of sell_prices is 45.95134258270264 MB


In [None]:
# Print the text summary (head, info, describe, columns, dtypes, null counts, shape) of the sales table.
basic_eda(sales)

-------------------------------TOP 5 RECORDS-----------------------------
                              id        item_id    dept_id   cat_id store_id  \
0  HOBBIES_1_001_CA_1_validation  HOBBIES_1_001  HOBBIES_1  HOBBIES     CA_1   
1  HOBBIES_1_002_CA_1_validation  HOBBIES_1_002  HOBBIES_1  HOBBIES     CA_1   
2  HOBBIES_1_003_CA_1_validation  HOBBIES_1_003  HOBBIES_1  HOBBIES     CA_1   
3  HOBBIES_1_004_CA_1_validation  HOBBIES_1_004  HOBBIES_1  HOBBIES     CA_1   
4  HOBBIES_1_005_CA_1_validation  HOBBIES_1_005  HOBBIES_1  HOBBIES     CA_1   

  state_id  d_1  d_2  d_3  d_4  ...  d_1904  d_1905  d_1906  d_1907  d_1908  \
0       CA    0    0    0    0  ...       1       3       0       1       1   
1       CA    0    0    0    0  ...       0       0       0       0       0   
2       CA    0    0    0    0  ...       2       1       2       1       1   
3       CA    0    0    0    0  ...       1       0       5       4       1   
4       CA    0    0    0    0  ...       2       

In [None]:
# Print the same text summary for the sell-price table.
basic_eda(sell_prices)

-------------------------------TOP 5 RECORDS-----------------------------
  store_id        item_id  wm_yr_wk  sell_price
0     CA_1  HOBBIES_1_001     11325    9.578125
1     CA_1  HOBBIES_1_001     11326    9.578125
2     CA_1  HOBBIES_1_001     11327    8.257812
3     CA_1  HOBBIES_1_001     11328    8.257812
4     CA_1  HOBBIES_1_001     11329    8.257812

-------------------------------INFO--------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype   
---  ------      -----   
 0   store_id    category
 1   item_id     category
 2   wm_yr_wk    int16   
 3   sell_price  float16 
dtypes: category(2), float16(1), int16(1)
memory usage: 45.8 MB
None

-------------------------------Describe----------------------------------
           wm_yr_wk    sell_price
count  6.841121e+06  6.841121e+06
mean   1.138294e+04           NaN
std    1.486100e+02  0.000000e+00
min    1.110100

In [None]:
# Print the same text summary for the calendar table.
basic_eda(calendar)

-------------------------------TOP 5 RECORDS-----------------------------
        date  wm_yr_wk    weekday  wday  month  year    d event_name_1  \
0 2011-01-29     11101   Saturday     1      1  2011  d_1          NaN   
1 2011-01-30     11101     Sunday     2      1  2011  d_2          NaN   
2 2011-01-31     11101     Monday     3      1  2011  d_3          NaN   
3 2011-02-01     11101    Tuesday     4      2  2011  d_4          NaN   
4 2011-02-02     11101  Wednesday     5      2  2011  d_5          NaN   

  event_type_1 event_name_2  ... total_sales_WI_HOBBIES  \
0          NaN          NaN  ...                 1083.0   
1          NaN          NaN  ...                  926.0   
2          NaN          NaN  ...                  684.0   
3          NaN          NaN  ...                  455.0   
4          NaN          NaN  ...                  132.0   

   total_sales_CA_HOUSEHOLD  total_sales_TX_HOUSEHOLD  \
0                    2292.0                    1706.0   
1           

In [None]:
# Reshape sales from wide to long: the six identifier columns are kept, every
# other column name goes into `d` and its value into `sold`; rows containing
# any missing value are then dropped.
final = sales.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='sold',
).dropna()

In [None]:
# Preview the first rows of the long-format frame.
final.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


In [None]:
# Attach the calendar columns to every row through the shared `d` key.
# A left join keeps every row of `final` even where the calendar has no match.
final = final.merge(calendar, how='left', on='d')

In [None]:
# Look up `sell_price` for each row by store, item and `wm_yr_wk`.
# A left join keeps rows with no matching price; their `sell_price` is NaN.
final = final.merge(
    sell_prices,
    how='left',
    on=['store_id', 'item_id', 'wm_yr_wk'],
)

In [None]:
# Preview the first rows after both merges (calendar columns and sell_price now present).
final.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,date,wm_yr_wk,...,total_sales_CA_HOUSEHOLD,total_sales_TX_HOUSEHOLD,total_sales_WI_HOUSEHOLD,total_sales_CA_FOODS,total_sales_TX_FOODS,total_sales_WI_FOODS,Customer_price_index,Unemployement_Rate,gas_price,sell_price
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2292.0,1706.0,1691.0,6852.0,6852.0,6224.0,221.125,9.101562,3.123047,
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2292.0,1706.0,1691.0,6852.0,6852.0,6224.0,221.125,9.101562,3.123047,
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2292.0,1706.0,1691.0,6852.0,6852.0,6224.0,221.125,9.101562,3.123047,
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2292.0,1706.0,1691.0,6852.0,6852.0,6224.0,221.125,9.101562,3.123047,
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,2292.0,1706.0,1691.0,6852.0,6852.0,6224.0,221.125,9.101562,3.123047,


In [None]:
# Deep memory footprint of the merged frame, reported in units of 1024**3 bytes.
final_bytes = final.memory_usage(deep=True).sum()
print(f'Size of final is {final_bytes/(1024 ** 3)} GB')

Size of final is 8.283706583082676 GB


In [None]:
# Print the text summary for the merged frame. The size cell above reported it
# at roughly 8 GB, so the describe/null-count passes inside basic_eda scan a large frame.
basic_eda(final)

-------------------------------TOP 5 RECORDS-----------------------------
                              id        item_id    dept_id   cat_id store_id  \
0  HOBBIES_1_001_CA_1_validation  HOBBIES_1_001  HOBBIES_1  HOBBIES     CA_1   
1  HOBBIES_1_002_CA_1_validation  HOBBIES_1_002  HOBBIES_1  HOBBIES     CA_1   
2  HOBBIES_1_003_CA_1_validation  HOBBIES_1_003  HOBBIES_1  HOBBIES     CA_1   
3  HOBBIES_1_004_CA_1_validation  HOBBIES_1_004  HOBBIES_1  HOBBIES     CA_1   
4  HOBBIES_1_005_CA_1_validation  HOBBIES_1_005  HOBBIES_1  HOBBIES     CA_1   

  state_id    d  sold       date  wm_yr_wk  ... total_sales_CA_HOUSEHOLD  \
0       CA  d_1     0 2011-01-29     11101  ...                   2292.0   
1       CA  d_1     0 2011-01-29     11101  ...                   2292.0   
2       CA  d_1     0 2011-01-29     11101  ...                   2292.0   
3       CA  d_1     0 2011-01-29     11101  ...                   2292.0   
4       CA  d_1     0 2011-01-29     11101  ...                  

id                                 0
item_id                            0
dept_id                            0
cat_id                             0
store_id                           0
state_id                           0
d                                  0
sold                               0
date                               0
wm_yr_wk                           0
weekday                            0
wday                               0
month                              0
year                               0
event_name_1                  243920
event_type_1                  243920
event_name_2                 2591650
event_type_2                 2591650
snap_CA                            0
snap_TX                            0
snap_WI                            0
ca_walmart                         0
ca_hobbies                         0
ca_household                       0
ca_foods                           0
tx_walmart                         0
tx_hobbies                         0
t