### Import

In [1]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from matplotlib import pyplot as plt
import seaborn as sns

import os

In [10]:
# Reference: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of Dataframe is {:.3f} MB'.format(start_mem))
    
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        
        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif 'datetime' not in col_type.name:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

### Creating Features

In [4]:
# Downloading data (using wget)

file_path="favorita-grocery-sales-forecasting.zip"

if not os.path.exists(file_path):
    !wget --header="Host: storage.googleapis.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9" --header="Referer: https://www.kaggle.com/" "https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/7391/44328/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1593984946&Signature=TZ8WhKQzNyAp%2B8IRIjBE3f9IPhSdR%2B8izTu2DDZLt1ZJS9M5q5pZsNpMGYYOCFwROdvxHPUf%2FIVoPslSOiRMcBdkBhumDs6xiOt9A5dzgUh6QqH3%2BzX%2F%2Be2FVjW2dg3a%2B%2FmqIwQLD7y%2B8gfRP82VlEMdGcxLLbRliMfy2ZK0BlMZgRZJ7%2BNmsdbm3V6Y%2Fk7YnIiDGH3bBopFwLN02mOhiqb96GC4gD813iLV5DRoSzegViOZjddjSBtKeNlFu86bo9oj2cjI%2BQrxQV%2F2I6IU1lKqXxkkdAl0oFzzfNUwlLForPg0nd8GMaYgdlM6Ga1liBl2QFahMYkwJUM6Hvv%2F6w%3D%3D&response-content-disposition=attachment%3B+filename%3Dfavorita-grocery-sales-forecasting.zip" -c -O 'favorita-grocery-sales-forecasting.zip'
else:
    print("File Already Present")

--2020-07-02 23:18:52--  https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/7391/44328/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1593984946&Signature=TZ8WhKQzNyAp%2B8IRIjBE3f9IPhSdR%2B8izTu2DDZLt1ZJS9M5q5pZsNpMGYYOCFwROdvxHPUf%2FIVoPslSOiRMcBdkBhumDs6xiOt9A5dzgUh6QqH3%2BzX%2F%2Be2FVjW2dg3a%2B%2FmqIwQLD7y%2B8gfRP82VlEMdGcxLLbRliMfy2ZK0BlMZgRZJ7%2BNmsdbm3V6Y%2Fk7YnIiDGH3bBopFwLN02mOhiqb96GC4gD813iLV5DRoSzegViOZjddjSBtKeNlFu86bo9oj2cjI%2BQrxQV%2F2I6IU1lKqXxkkdAl0oFzzfNUwlLForPg0nd8GMaYgdlM6Ga1liBl2QFahMYkwJUM6Hvv%2F6w%3D%3D&response-content-disposition=attachment%3B+filename%3Dfavorita-grocery-sales-forecasting.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.124.128, 172.217.212.128, 172.217.214.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.124.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480014675 (458M) [application/zip]
Saving to: ‘favor

In [5]:
# unzipping favorita-grocery-sales-forecasting.zip

if os.path.exists('favorita-grocery-sales-forecasting.zip'):
    !unzip 'favorita-grocery-sales-forecasting.zip'
    print("File unzipped Successfully")
else:
    print("File Not Present to unzip")

Archive:  favorita-grocery-sales-forecasting.zip
  inflating: holidays_events.csv.7z  
  inflating: items.csv.7z            
  inflating: oil.csv.7z              
  inflating: sample_submission.csv.7z  
  inflating: stores.csv.7z           
  inflating: test.csv.7z             
  inflating: train.csv.7z            
  inflating: transactions.csv.7z     
File unzipped Successfully


In [6]:
#installing 7zip for extracting .7z files
!apt-get install p7zip-full

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-6).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 33 not upgraded.


In [7]:
#Extracting .7z files if they are not already extracted.

for file in os.listdir():
    if file[-3:]=='.7z':
        if os.path.exists(file[:-3]):
            print("="*50)
            print("'{}'Extracted File is Already Present".format(file[:-3]))
        elif file=='oil.csv.7z':
            !p7zip -d 'oil.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='train.csv.7z':
            !p7zip -d 'train.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='stores.csv.7z':
            !p7zip -d 'stores.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='transactions.csv.7z':
            !p7zip -d 'transactions.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='items.csv.7z':
            !p7zip -d 'items.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='holidays_events.csv.7z':
            !p7zip -d 'holidays_events.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='test.csv.7z':
            !p7zip -d 'test.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='sample_submission.csv.7z':
            !p7zip -d 'sample_submission.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        print("="*50)



7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 666528 bytes (651 KiB)

Extracting archive: sample_submission.csv.7z
--
Path = sample_submission.csv.7z
Type = 7z
Physical Size = 666528
Headers Size = 146
Method = LZMA2:24
Solid = -
Blocks = 1

  0%     82% - sample_submission.csv                            Everything is Ok

Size:       40445582
Compressed: 666528
'sample_submission.csv.7z' File Extracted Successfully

7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 47409

In [8]:
#Creating features by excecuting Pre_Processing Feature_engineering.py

exec(open('Pre_Processing Feature_engineering.py').read())

# Train Dataset Initial Date 28/6/2017
# 2 weeks

# Validation Dataset Initial Date 26/7/2017
# 1 week

# Test Dataset Initial Date 16/8/2017
# 1 week


Data Pre-Processing ...


HBox(children=(FloatProgress(value=0.0, max=23808261.0), HTML(value='')))


Enter the following for Train Data :

Starting Date (Day/Month/Year) --> 28/6/2017
No. of weeks --> 2

Creating Features for data between Dates --> 2017-06-28 - 2017-07-12 (i.e. 2 weeks) 



HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Saving 'X_train.csv' File ...
Saving 'y_train.csv' File ...

Enter the following for Validation Data :

Starting Date (Day/Month/Year) --> 26/7/2017
No. of weeks --> 1

Creating Features for data between Dates --> 2017-07-26 - 2017-08-02 (i.e. 1 weeks) 



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Saving 'X_val.csv' File ...
Saving 'y_val.csv' File ...

Enter the following for Test Data :

Starting Date (Day/Month/Year) --> 16/8/2017
No. of weeks --> 1

Creating Features for data between Dates --> 2017-08-16 - 2017-08-23 (i.e. 1 weeks) 



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Saving 'X_test.csv' File ...

Saving 'sales_2017.csv' File ...
Saving 'stores_items.csv' File ...


### Reading Data

In [None]:
# Reading X_train.csv and reducing memory usage
X_train=pd.read_csv("X_train.csv")
X_train=reduce_mem_usage(X_train)

# Reading y_train.csv and converting into numpy array
y_train = np.array(pd.read_csv( 'y_train.csv'))

Memory usage of Dataframe is 1617.996 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 351.461 MB
Decreased by 78.3%


In [None]:
# Reading X_val.csv and reducing memory usage
X_val=pd.read_csv("X_val.csv")
X_val=reduce_mem_usage(X_val)

# Reading y_val.csv and converting into numpy array
y_val = np.array(pd.read_csv( 'y_val.csv'))

Memory usage of Dataframe is 808.998 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 175.091 MB
Decreased by 78.4%


In [None]:
# Reading X_test.csv and reducing memory usage
X_test=pd.read_csv("X_test.csv")
X_test=reduce_mem_usage(X_test)


Memory usage of Dataframe is 808.998 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 175.730 MB
Decreased by 78.3%


In [11]:
# Reading stores_items.csv
stores_items = pd.read_csv('stores_items.csv', index_col=['store_nbr','item_nbr'])

# Reading items.csv and setting index as item_nbr
items = pd.read_csv( 'items.csv' ).set_index("item_nbr")

items = items.reindex( stores_items.index.get_level_values(1) )
items=reduce_mem_usage(items)

Memory usage of Dataframe is 5.112 MB


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Memory usage after optimization is: 1.919 MB
Decreased by 62.5%


### Defining Random Forest

In [14]:
def RandomForest_FeatureSelection(X_train,y_train,params,n_days,items):
    '''
    Trains 16 different Random forest models for predicting next 16 days sales .
    and Stores sorted feature importances of each model in a list. 

    Returns --> * feature_imp_all i.e.feature importances of each model in a list.
                
    '''

    #Standalone Random Forest With XGBoost API

    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params['tree_method'] = 'gpu_hist'

    #num_boost_round should be set to 1 to prevent XGBoost from boosting multiple random forests. 
    num_boost_rounds=1
    #eta (alias: learning_rate) must be set to 1 when training random forest regression
    params ['eta']= 1

    feature_imp_all = []

    #Training 16 different models for predicting next 16 days sales.
    for i in range(16):
        print("=" * 50)
        print("Step %d" % (i+1))
        print("=" * 50)

        #Creating Train Dmatrix (DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed.)
        dtrain = xgb.DMatrix( X_train, label=y_train[:, i],
                              weight=pd.concat([items["perishable"]] * n_days) * 0.25 + 1)
        
        # watchist is used to see the evaluation metrics of the datasets given while training
        watchlist = [ (dtrain,'train') ]

        #Training Random Forest using Xgboost
        model = xgb.train(params, dtrain, 1, watchlist, verbose_eval=1)

        #Fetching feature importance from model
        feature_imp = model.get_score(importance_type= 'gain')
        #Sorting features using their importance values
        feature_imp = sorted(feature_imp.items(), key=lambda x: x[1], reverse=True)
        #Appending sorted feature importance in a list 
        feature_imp_all.append(feature_imp)

        # Deleting unneccessary variables
        del model,dtrain,feature_imp

    return feature_imp_all

### Feature Selection Using Random forest

In [None]:
%%time

params= dict()
#keeping not too large max_depth,num_parallel_tree to maintain low time complexity
params['max_depth'] = 15
params["num_parallel_tree"] = 100

n_days=2

# Getting sorted feature importance for all the 16 models in a list
feature_imp = RandomForest_FeatureSelection(X_train,y_train,params,n_days,items)

Step 1
[0]	train-rmse:0.502756
Step 2
[0]	train-rmse:0.517825
Step 3
[0]	train-rmse:0.51848
Step 4
[0]	train-rmse:0.535173
Step 5
[0]	train-rmse:0.542377
Step 6
[0]	train-rmse:0.548207
Step 7
[0]	train-rmse:0.541883
Step 8
[0]	train-rmse:0.522936
Step 9
[0]	train-rmse:0.537185
Step 10
[0]	train-rmse:0.530408
Step 11
[0]	train-rmse:0.551563
Step 12
[0]	train-rmse:0.555245
Step 13
[0]	train-rmse:0.554349
Step 14
[0]	train-rmse:0.543721
Step 15
[0]	train-rmse:0.530702
Step 16
[0]	train-rmse:0.540752
CPU times: user 33min 35s, sys: 15min 46s, total: 49min 22s
Wall time: 49min 24s


In [2]:
#Using only top 300 features as to reduce time taken for Hyperparameter Tuning and final model training.

# Filtering top 300 feature names for all the 16 models seperately. 
top=300
filtered_features=[]
for i in range(16):
    temp=[]
    for feature in feature_imp[i][:top]:
        temp.append(feature[0])
    filtered_features.append(temp)
print("Filtered top {} features".format(len(filtered_features[0])))

Filtered top 300 features


In [3]:
import pickle
#Saving feature importance
with open('300_filtered_features.pkl','wb') as file:
    pickle.dump(filtered_features, file)