### Enable GPU

In [1]:
# Build LightGBM from source with GPU support (-DUSE_GPU=1) and install its Python package.
# NOTE(review): the clone is not pinned to a tag/commit, so re-running this cell later may
# build a different LightGBM version than the one that produced the outputs in this notebook.
!git clone -- https://github.com/microsoft/LightGBM.git
%cd /content/LightGBM
# NOTE(review): 'build' is created but never entered; cmake below runs from the repository root.
!mkdir build
# NOTE(review): no source path is passed to cmake (the '..' is deliberately commented out);
# confirm the installed CMake version still accepts an invocation without a path.
!cmake -DUSE_GPU=1 #avoid ..
# Compile using every available core.
!make -j$(nproc)
# NOTE(review): packages are upgraded (-U) without version pins — results may not be reproducible.
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
%cd /content/LightGBM/python-package
# --precompile: install the Python package using the library that was compiled above.
!sudo python setup.py install --precompile


Cloning into 'LightGBM'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 18303 (delta 13), reused 5 (delta 1), pack-reused 18262[K
Receiving objects: 100% (18303/18303), 12.34 MiB | 9.62 MiB/s, done.
Resolving deltas: 100% (13356/13356), done.
/content/LightGBM
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found OpenMP_C: -fopenmp (fou

### Import

In [1]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_squared_error

import lightgbm as lgb
import xgboost as xgb

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from matplotlib import pyplot as plt
import seaborn as sns

import os

In [2]:
# Reference: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df):
    """ Downcast the columns of a dataframe (in place) to reduce memory usage.

        Per column:
          * datetime columns are left untouched;
          * object / string / category columns are stored as pandas 'category';
          * signed integer columns are cast to the smallest of int8/16/32/64
            whose range contains the column's min and max (limits included);
          * float columns are cast to float16, then float32, when the min and
            max lie strictly inside that type's range, otherwise float64;
          * every other dtype (bool, unsigned int, timedelta, nullable
            extension types, ...) is left as it is.

        NOTE: float16 keeps only about 3 significant decimal digits, so the
        float downcast is lossy; that trade-off is kept from the original.

        df : pandas DataFrame, modified in place.
        Returns the same DataFrame object for convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of Dataframe is {:.3f} MB'.format(start_mem))

    for col in tqdm(df.columns):
        col_type = df[col].dtype

        # Datetime columns (naive or tz-aware) are never modified.
        if 'datetime' in col_type.name:
            continue

        # Text-like columns become categoricals (a no-op for existing categoricals).
        if col_type == object or col_type.name == 'category' or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed ints and floats are downcast. bool and
        # unsigned columns used to fall through to the float branch and
        # were turned into float16, which changed their meaning/precision.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

### Creating Features

In [3]:
# Downloading data (using wget)
# The competition archive is fetched only when it is not already in the working directory.

file_path="favorita-grocery-sales-forecasting.zip"

if not os.path.exists(file_path):
    # NOTE(review): this is a signed URL with an 'Expires=' query parameter, so it is presumably
    # time-limited — confirm it still works or obtain a fresh link before re-running.
    # NOTE(review): the -O target repeats the file name as a literal instead of using file_path.
    # wget flags: -c resumes a partial download, -O sets the output file name.
    !wget --header="Host: storage.googleapis.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9" --header="Referer: https://www.kaggle.com/" "https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/7391/44328/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1593984946&Signature=TZ8WhKQzNyAp%2B8IRIjBE3f9IPhSdR%2B8izTu2DDZLt1ZJS9M5q5pZsNpMGYYOCFwROdvxHPUf%2FIVoPslSOiRMcBdkBhumDs6xiOt9A5dzgUh6QqH3%2BzX%2F%2Be2FVjW2dg3a%2B%2FmqIwQLD7y%2B8gfRP82VlEMdGcxLLbRliMfy2ZK0BlMZgRZJ7%2BNmsdbm3V6Y%2Fk7YnIiDGH3bBopFwLN02mOhiqb96GC4gD813iLV5DRoSzegViOZjddjSBtKeNlFu86bo9oj2cjI%2BQrxQV%2F2I6IU1lKqXxkkdAl0oFzzfNUwlLForPg0nd8GMaYgdlM6Ga1liBl2QFahMYkwJUM6Hvv%2F6w%3D%3D&response-content-disposition=attachment%3B+filename%3Dfavorita-grocery-sales-forecasting.zip" -c -O 'favorita-grocery-sales-forecasting.zip'
else:
    print("File Already Present")

--2020-07-04 17:57:38--  https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/7391/44328/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1593984946&Signature=TZ8WhKQzNyAp%2B8IRIjBE3f9IPhSdR%2B8izTu2DDZLt1ZJS9M5q5pZsNpMGYYOCFwROdvxHPUf%2FIVoPslSOiRMcBdkBhumDs6xiOt9A5dzgUh6QqH3%2BzX%2F%2Be2FVjW2dg3a%2B%2FmqIwQLD7y%2B8gfRP82VlEMdGcxLLbRliMfy2ZK0BlMZgRZJ7%2BNmsdbm3V6Y%2Fk7YnIiDGH3bBopFwLN02mOhiqb96GC4gD813iLV5DRoSzegViOZjddjSBtKeNlFu86bo9oj2cjI%2BQrxQV%2F2I6IU1lKqXxkkdAl0oFzzfNUwlLForPg0nd8GMaYgdlM6Ga1liBl2QFahMYkwJUM6Hvv%2F6w%3D%3D&response-content-disposition=attachment%3B+filename%3Dfavorita-grocery-sales-forecasting.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.119.128, 108.177.126.128, 172.217.218.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.119.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480014675 (458M) [application/zip]
Saving to: ‘fav

In [4]:
# unzipping favorita-grocery-sales-forecasting.zip

if os.path.exists('favorita-grocery-sales-forecasting.zip'):
    !unzip 'favorita-grocery-sales-forecasting.zip'
    print("File unzipped Successfully")
else:
    print("File Not Present to unzip")

Archive:  favorita-grocery-sales-forecasting.zip
  inflating: holidays_events.csv.7z  
  inflating: items.csv.7z            
  inflating: oil.csv.7z              
  inflating: sample_submission.csv.7z  
  inflating: stores.csv.7z           
  inflating: test.csv.7z             
  inflating: train.csv.7z            
  inflating: transactions.csv.7z     
File unzipped Successfully


In [5]:
#installing 7zip for extracting .7z files
!apt-get install p7zip-full

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 6%Reading package lists... 6%Reading package lists... 6%Reading package lists... 6%Reading package lists... 62%Reading package lists... 62%Reading package lists... 63%Reading package lists... 63%Reading package lists... 70%Reading package lists... 70%Reading package lists... 71%Reading package lists... 71%Reading package lists... 77%Reading package lists... 80%Reading package lists... 80%Reading package lists... 80%Reading package lists... 80%Reading package lists... 80%Reading package lists... 80%Reading package lists... 80%Reading package lists... 80%Reading package lists... 86%Reading package lists... 86%Reading package lists... 87%Reading package lists... 87%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package 

In [6]:
#Extracting .7z files if they are not already extracted.

for file in os.listdir():
    if file[-3:]=='.7z':
        if os.path.exists(file[:-3]):
            print("="*50)
            print("'{}'Extracted File is Already Present".format(file[:-3]))
        elif file=='oil.csv.7z':
            !p7zip -d 'oil.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='train.csv.7z':
            !p7zip -d 'train.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='stores.csv.7z':
            !p7zip -d 'stores.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='transactions.csv.7z':
            !p7zip -d 'transactions.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='items.csv.7z':
            !p7zip -d 'items.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='holidays_events.csv.7z':
            !p7zip -d 'holidays_events.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='test.csv.7z':
            !p7zip -d 'test.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        elif file=='sample_submission.csv.7z':
            !p7zip -d 'sample_submission.csv.7z'
            print("="*50)
            print("'{}' File Extracted Successfully".format(file))

        print("="*50)






7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 666528 bytes (651 KiB)

Extracting archive: sample_submission.csv.7z
--
Path = sample_submission.csv.7z
Type = 7z
Physical Size = 666528
Headers Size = 146
Method = LZMA2:24
Solid = -
Blocks = 1

  0%     93% - sample_submission.csv                            Everything is Ok

Size:       40445582
Compressed: 666528
'sample_submission.csv.7z' File Extracted Successfully

7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 47409

In [None]:
# Creating features by executing 'Pre_Processing Feature_engineering.py' in this notebook's namespace.
# The file is opened with a context manager so the handle is closed, and compiled with its
# file name so that tracebacks point at the script instead of '<string>'.
# NOTE(review): judging by the recorded output the script asks for dates/weeks interactively;
# the values to enter are listed in the comments below.
with open('Pre_Processing Feature_engineering.py') as script_file:
    exec(compile(script_file.read(), script_file.name, 'exec'))


# Train Dataset Initial Date 31/5/2017
# 6 weeks

# Validation Dataset Initial Date 26/7/2017
# 1 week

# Test Dataset Initial Date 16/8/2017
# 1 week

Data Pre-Processing ...


HBox(children=(FloatProgress(value=0.0, max=23808261.0), HTML(value='')))


Enter the following for Train Data :

Starting Date (Day/Month/Year) --> 31/5/2017
No. of weeks --> 6

Creating Features for data between Dates --> 2017-05-31 - 2017-07-12 (i.e. 6 weeks) 



HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


Saving 'X_train.csv' File ...
Saving 'y_train.csv' File ...

Enter the following for Validation Data :

Starting Date (Day/Month/Year) --> 26/7/2017
No. of weeks --> 1

Creating Features for data between Dates --> 2017-07-26 - 2017-08-02 (i.e. 1 weeks) 



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Saving 'X_val.csv' File ...
Saving 'y_val.csv' File ...

Enter the following for Test Data :

Starting Date (Day/Month/Year) --> 16/8/2017
No. of weeks --> 1

Creating Features for data between Dates --> 2017-08-16 - 2017-08-23 (i.e. 1 weeks) 



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Saving 'X_test.csv' File ...

Saving 'sales_2017.csv' File ...
Saving 'stores_items.csv' File ...


### Reading Data

In [3]:
# Load the training features and downcast their dtypes to save memory
X_train = reduce_mem_usage(pd.read_csv("X_train.csv"))

# Load the training targets as a plain numpy array
y_train = pd.read_csv('y_train.csv').to_numpy()

Memory usage of Dataframe is 5662.987 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 1337.467 MB
Decreased by 76.4%


In [None]:
# Load the validation features and downcast their dtypes to save memory
X_val = reduce_mem_usage(pd.read_csv("X_val.csv"))

# Load the validation targets as a plain numpy array
y_val = pd.read_csv('y_val.csv').to_numpy()

In [4]:
# Load the test features and downcast their dtypes to save memory
X_test = reduce_mem_usage(pd.read_csv("X_test.csv"))


Memory usage of Dataframe is 808.998 MB


HBox(children=(FloatProgress(value=0.0, max=633.0), HTML(value='')))


Memory usage after optimization is: 191.067 MB
Decreased by 76.4%


In [5]:
# stores_items.csv, indexed by the (store_nbr, item_nbr) pair
stores_items = pd.read_csv('stores_items.csv', index_col=['store_nbr','item_nbr'])

# items.csv indexed by item_nbr, re-ordered/repeated to follow the item_nbr level
# of stores_items (one row per store-item pair), then downcast to save memory
items = reduce_mem_usage(
    pd.read_csv( 'items.csv' )
      .set_index("item_nbr")
      .reindex( stores_items.index.get_level_values(1) )
)



Memory usage of Dataframe is 5.112 MB


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Memory usage after optimization is: 1.919 MB
Decreased by 62.5%


### Feature Selection

In [6]:
# Loading Top 300 Feature Names (got by training random forest)
# NOTE(review): train_lgb_model indexes this object as features[i] for i in 0..15, so it is
# presumably a list with one feature-name list per forecast day — confirm against the pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
import pickle
with open('300_filtered_features.pkl','rb') as file:
    filtered_features = pickle.load( file)

### Defining LGB

In [7]:
def train_lgb_model(X_train,y_train,X_val,y_val,params,num_boost_rounds,n_days,items,features,verbose,X_test=None):
    '''
    Trains 16 LightGBM regressors, one per forecast day, each on its own
    subset of feature columns, and appends every trained model to the
    module-level list `models` (which must exist before calling).

    X_train, y_train : training features (DataFrame) and targets (2-D array,
                       column i is the target of forecast day i).
    X_val, y_val     : validation features/targets, or None. When given, the
                       validation set is added to `valid_sets` and predicted.
    params           : LightGBM parameters. The dict is copied; the caller's
                       dict is NOT modified. device_type, objective, metric
                       and num_threads are always overridden here.
    num_boost_rounds : int (same value for every model) or a list/tuple with
                       one value per model.
    n_days           : how many times the `items` rows are repeated in
                       X_train (the notebook passes the number of training
                       weeks); used to build the per-row training weights.
    items            : DataFrame with a "perishable" column; rows get weight
                       1.25 when perishable and 1.0 otherwise.
    features         : features[i] is the list of columns used by model i.
    verbose          : forwarded to lgb.train as `verbose_eval`.
    X_test           : optional test features to predict with every model.

    Returns --> * test_pred (list of 16 prediction arrays) when X_test is given
                * otherwise (val_pred, boost_rounds): the list of validation
                  predictions and the list of iterations used per model
                  (model.best_iteration, or the requested rounds when no
                  best iteration was recorded).
    '''
    global models

    n_steps = 16  # one model per forecast day

    # Copy so that the settings forced below do not leak into the caller's dict.
    params = dict(params)
    params['device_type']= 'gpu'
    params['objective'] = 'regression'
    params['metric'] = 'l2'
    params['num_threads']= 16

    # A single number means "use the same number of rounds for every model".
    if isinstance(num_boost_rounds, (list, tuple)):
        num_boost_rounds = list(num_boost_rounds)
    else:
        num_boost_rounds = [num_boost_rounds] * n_steps

    # Decide explicitly which optional datasets are present instead of
    # probing them with try/except (which also hid genuine errors).
    has_val = X_val is not None
    has_val_labels = has_val and y_val is not None
    has_test = X_test is not None

    # As described on kaggle: items marked as perishable have a score weight
    # of 1.25; otherwise, the weight is 1.0. The weights do not depend on the
    # forecast day, so they are built once.
    train_weight = pd.concat([items["perishable"]] * n_days) * 0.25 + 1
    val_weight = items["perishable"] * 0.25 + 1

    val_pred = []
    test_pred = []
    boost_rounds=[]

    # Training 16 different models for predicting next 16 days sales.
    for i in range(n_steps):
        print("=" * 50)
        print("Step %d" % (i+1))
        print("=" * 50)

        step_features = features[i]
        x_train = X_train[step_features]

        dtrain = lgb.Dataset( x_train, label=y_train[:, i], weight=train_weight )
        valid_sets=[dtrain]

        x_val = None
        dval = None
        if has_val:
            x_val = X_val[step_features]
            if has_val_labels:
                dval = lgb.Dataset( x_val, label=y_val[:, i], reference=dtrain,
                                    weight=val_weight )
                valid_sets.append(dval)

        # NOTE(review): `verbose_eval` is kept as in the original notebook; newer
        # LightGBM releases may expect a logging callback instead — confirm for the
        # version that gets installed.
        model = lgb.train( params, dtrain, num_boost_rounds[i],
                        valid_sets=valid_sets, verbose_eval=verbose )

        # Storing each model
        models.append(model)

        # best_iteration is falsy when no early stopping took place.
        best_round = model.best_iteration or num_boost_rounds[i]

        if has_val:
            val_pred.append(model.predict(x_val, num_iteration=best_round))
        if has_test:
            test_pred.append(model.predict(X_test[step_features], num_iteration=best_round))

        boost_rounds.append(best_round)

        # Drop the per-step objects before the next (large) datasets are built.
        del model, dtrain, dval, x_train, x_val

    if has_test:
        return test_pred
    return val_pred,boost_rounds

### Performance Metric


**NWRMSLE** (Normalized Weighted Root Mean Squared Logarithmic Error)

In [8]:
def calculate_nwrmsle(true,pred,weight):
    '''
    Calculates Normalized Weighted Root Mean Squared Logarithmic Error (nwrmsle)

    true   = true labels, shape (n_rows, n_steps)
    pred   = predicted labels as a sequence of n_steps arrays of length n_rows
             (it is transposed to (n_rows, n_steps) before comparison)
    weight = weights of datapoints, length n_rows

    No logarithm is taken here: the squared differences are computed directly
    on the given values, so both inputs are presumably already on the log
    scale (assumption — confirm against the feature-engineering script).

    The number of forecast steps is taken from the data instead of being
    fixed to 16; for 16-column inputs the result is unchanged.

    returns nwrmsle '''

    # Plain arrays keep the arithmetic positional (no pandas index alignment).
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float).transpose()
    weight = np.asarray(weight, dtype=float)

    squared_error = (true - pred) ** 2
    n_steps = squared_error.shape[1]

    weighted_row_error = squared_error.sum(axis=1) * weight
    nwrmsle = np.sqrt(weighted_row_error.sum() / weight.sum() / n_steps)
    return nwrmsle

### LGBM Tuned Parameters

In [9]:
# Loading the tuned LightGBM parameters from a pickle and displaying them
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
import pickle
with open('lgbm_params.pkl','rb') as file:
    params = pickle.load(file)
params

{'bagging_fraction': 0.792127,
 'bagging_freq': 1,
 'feature_fraction': 0.614,
 'learning_rate': 0.020756,
 'min_data_in_leaf': 180,
 'num_leaves': 71}

### Training Model for 6 weeks 

Now using more historical data (6 weeks) to train the model with the best parameters, and increasing the number of boosting rounds to further improve the model performance.

In [None]:
%%time

# Incresed boost_rounds to 4000 to improve the model performance
num_boost_rounds = 4000
# Using 6_weeks data
n_days=6
verbose=50
models=[]

val_pred,boost_rounds = train_lgb_model(X_train,y_train,X_val,y_val,params,num_boost_rounds,n_days,items,filtered_features,verbose)

val_mse = mean_squared_error(y_val, np.array(val_pred).transpose())
print("val_mse --> ",val_mse)

weight = items["perishable"] * 0.25 + 1
nwrmsle = calculate_nwrmsle(true,pred,weight)
print("nwrmsle --> ",nwrmsle)

Step 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 61412
[LightGBM] [Info] Number of data points in the train set: 1005090, number of used features: 300
[LightGBM] [Info] Using GPU Device: Tesla K80, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 264 dense feature groups (253.05 MB) transferred to GPU in 0.379920 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 1.039801
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.415821	valid_1's l2: 0.397041
[100]	training's l2: 0.317249	valid_1's l2: 0.306564
[150]	training's l2: 0.298967	valid_1's l2: 0.291667
[200]	training's l2: 0.292422	valid_1's l2: 0.2872
[250]	training's l2: 0.288919	valid_1's l2: 0.285149
[300]	training's l2: 0.286851	valid_1's l2: 0.28428
[350]	training's l2: 0.285129	valid_1's l2: 

In [10]:
# Saving boost_rounds (the per-model iteration counts returned by train_lgb_model)
# so that the final models can be trained with them without a validation set
import pickle
with open('boost_rounds.pkl','wb') as file:
    pickle.dump(boost_rounds,file)


#### Observation
* The model performance improved by using more historical data and more boosting rounds.
* *Score (NWRMSLE)* = **0.5890**

### Final Model 

* Now Training the Final Model on Total Data (i.e. Combined Train Data 6weeks + Validation Data 1week ).
* Not using a fixed number of boosting rounds (4000), as there is no validation data left for early stopping.
* Using different boost rounds for all 16 models that performed best during the previous training.

#### Creating Features for 7 weeks

In [12]:
# Creating features by executing 'Pre_Processing Feature_engineering.py' in this notebook's namespace.
# The file is opened with a context manager so the handle is closed, and compiled with its
# file name so that tracebacks point at the script instead of '<string>'.
# NOTE(review): judging by the recorded output the script asks for dates/weeks interactively;
# the values to enter are listed in the comments below (7 training weeks this time).
with open('Pre_Processing Feature_engineering.py') as script_file:
    exec(compile(script_file.read(), script_file.name, 'exec'))


# Train Dataset Initial Date 31/5/2017
# 7 weeks

# Validation Dataset Initial Date 26/7/2017
# 1 week

# Test Dataset Initial Date 16/8/2017
# 1 week

Data Pre-Processing ...


HBox(children=(FloatProgress(value=0.0, max=23808261.0), HTML(value='')))


Enter the following for Train Data :

Starting Date (Day/Month/Year) --> 31/5/2017
No. of weeks --> 7

Creating Features for data between Dates --> 2017-05-31 - 2017-07-19 (i.e. 7 weeks) 



HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Saving 'X_train.csv' File ...
Saving 'y_train.csv' File ...

Enter the following for Validation Data :

Starting Date (Day/Month/Year) --> 26/7/2017
No. of weeks --> 1

Creating Features for data between Dates --> 2017-07-26 - 2017-08-02 (i.e. 1 weeks) 



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Saving 'X_val.csv' File ...
Saving 'y_val.csv' File ...

Enter the following for Test Data :

Starting Date (Day/Month/Year) --> 16/8/2017
No. of weeks --> 1

Creating Features for data between Dates --> 2017-08-16 - 2017-08-23 (i.e. 1 weeks) 



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Saving 'X_test.csv' File ...

Saving 'sales_2017.csv' File ...
Saving 'stores_items.csv' File ...


#### Training Final Lgbm Model

In [11]:
%%time
# Using 6(train)+1(val) weeks data
n_days = 7
verbose = 200
models=[]

test_pred = train_lgb_model(X_train,y_train,None,None,params,boost_rounds,n_days,items,filtered_features,verbose,X_test)



Step 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 61419
[LightGBM] [Info] Number of data points in the train set: 1172605, number of used features: 300
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 264 dense feature groups (295.23 MB) transferred to GPU in 0.413887 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 1.034830
[200]	training's l2: 0.291423
[400]	training's l2: 0.283176
[600]	training's l2: 0.278751
[800]	training's l2: 0.275346
[1000]	training's l2: 0.272441
[1200]	training's l2: 0.269792
[1400]	training's l2: 0.267339
[1600]	training's l2: 0.264971
[1800]	training's l2: 0.262731
[2000]	training's l2: 0.260576
Step 2
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 61203
[LightGBM] [Info]

In [12]:
# Saving the list of final models (filled by train_lgb_model through the global `models`)
import pickle
with open('final_models.pkl','wb') as file:
    pickle.dump(models,file)


### Predicting for Test Data

In [15]:
# test.csv with parsed dates, indexed by (store_nbr, item_nbr, date)
df_test = (
    pd.read_csv("test.csv",parse_dates=["date"])
      .set_index(["store_nbr", "item_nbr",'date'])
)

# sales_2017.csv indexed by (store_nbr, item_nbr)
sales_2017 = pd.read_csv("sales_2017.csv").set_index(["store_nbr", "item_nbr"])

In [16]:
# Converting the list of per-day test predictions to a numpy array and transposing it,
# so that rows are store-item pairs and columns are forecast days.
y_test = np.array(test_pred).transpose()

# Creating a dataframe with the test predictions, using the same index as the sales_2017 dataframe (i.e. store_nbr, item_nbr)
df_preds = pd.DataFrame(y_test, index=sales_2017.index,
                        columns=pd.date_range("2017-08-16", periods=16) # one column per forecast date: 16 days starting 16/8/2017
                        ).stack().to_frame("unit_sales")                # stacking the date columns into the index

# Setting names of the index levels
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)


# Joining the given test dataset (df_test), for which predictions are required,
# with the dataframe that holds the predicted values
final_lgb_predicitons = df_test[["id"]].join(df_preds, how="left").fillna(0) # rows of df_test without a prediction (e.g. items not present in the train data) are filled with 0

# Converting predicted unit_sales back to the original scale with exp(unit_sales) - 1
# (presumably the target was transformed with log(unit_sales + 1) in the feature script — confirm),
# then clipping the result to the range [0, 1000].
final_lgb_predicitons["unit_sales"] = np.clip(np.expm1(final_lgb_predicitons["unit_sales"]), 0, 1000)

# Saving file for submission.
print("Saving 'final_lgb_predicitons.csv' File ...")
final_lgb_predicitons.to_csv('final_lgb_predicitons.csv', float_format='%.4f', index=None)

Saving 'final_lgb_predicitons.csv' File ...


### Results (After Submitting):


* Model --> **LGBM**
* Best Parameters :
    * *learning_rate* = **0.020756**
    * *num_leaves* = **71**
    * *min_data_in_leaf* = **180**
    * *feature_fraction* = **0.614000**
    * *bagging_fraction* = **0.792127**
    * *bagging_freq* = **1** 
* Private Score (NWRMSLE) = **0.51241** (***Rank-2***)
* Public Score (NWRMSLE) = **0.50822**