In [1]:
!pip install category_encoders
!pip install bayesian-optimization



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 

from sklearn.metrics import mean_squared_error,mean_squared_log_error
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV,cross_val_score,RepeatedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer,RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

import xgboost as xgb
import lightgbm as lgb
import sklearn.ensemble as ensemble
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier,RandomForestRegressor,BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso, Ridge,LogisticRegressionCV,RidgeCV,LassoCV,ElasticNetCV,OrthogonalMatchingPursuit,ElasticNet,LassoLarsCV,BayesianRidge
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC,SVR
from scipy import stats
from scipy.stats import norm, skew
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.kernel_ridge import KernelRidge


from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder
from scipy.special import boxcox1p
from bayes_opt import BayesianOptimization

  import pandas.util.testing as tm


In [3]:
warnings.filterwarnings('ignore')

In [5]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

In [6]:
train.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
0,1,17/01/11,8091,216418,99.0375,111.8625,0,0,20
1,2,17/01/11,8091,216419,99.0375,99.0375,0,0,28
2,3,17/01/11,8091,216425,133.95,133.95,0,0,19
3,4,17/01/11,8091,216233,133.95,133.95,0,0,44
4,5,17/01/11,8091,217390,141.075,141.075,0,0,52


In [7]:
test.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku
0,212645,16/07/13,8091,216418,108.3,108.3,0,0
1,212646,16/07/13,8091,216419,109.0125,109.0125,0,0
2,212647,16/07/13,8091,216425,133.95,133.95,0,0
3,212648,16/07/13,8091,216233,133.95,133.95,0,0
4,212649,16/07/13,8091,217390,176.7,176.7,0,0


In [8]:
submission.head()

Unnamed: 0,record_ID,units_sold
0,212645,0
1,212646,0
2,212647,0
3,212648,0
4,212649,0


In [9]:
train.describe()

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
count,150150.0,150150.0,150150.0,150149.0,150150.0,150150.0,150150.0,150150.0
mean,106271.555504,9199.422511,254761.132468,206.626751,219.425927,0.095611,0.1332,51.674206
std,61386.037861,615.591445,85547.306447,103.308516,110.961712,0.294058,0.339792,60.207904
min,1.0,8023.0,216233.0,41.325,61.275,0.0,0.0,1.0
25%,53111.25,8562.0,217217.0,130.3875,133.2375,0.0,0.0,20.0
50%,106226.5,9371.0,222087.0,198.075,205.9125,0.0,0.0,35.0
75%,159452.75,9731.0,245338.0,233.7,234.4125,0.0,0.0,62.0
max,212644.0,9984.0,679023.0,562.1625,562.1625,1.0,1.0,2876.0


In [10]:
train.dtypes

record_ID            int64
week                object
store_id             int64
sku_id               int64
total_price        float64
base_price         float64
is_featured_sku      int64
is_display_sku       int64
units_sold           int64
dtype: object

In [11]:
train.isna().sum()

record_ID          0
week               0
store_id           0
sku_id             0
total_price        1
base_price         0
is_featured_sku    0
is_display_sku     0
units_sold         0
dtype: int64

In [12]:
#Imputing missing value with the relevant total price
train.total_price=train.total_price.fillna(469.5375)

In [13]:
print(train.isna().sum().sum())
print(test.isna().sum().sum())

0
0


# Feature Engineering

In [14]:
train.columns

Index(['record_ID', 'week', 'store_id', 'sku_id', 'total_price', 'base_price',
       'is_featured_sku', 'is_display_sku', 'units_sold'],
      dtype='object')

In [15]:
#New Feature Creation functions

def gen_count_id(train,test,col,name):
    temp=train.groupby(col)['record_ID'].count().reset_index().rename(columns={'record_ID':name})
    train=pd.merge(train,temp,how='left',on=col)
    test=pd.merge(test,temp,how='left',on=col)
    train[name]=train[name].astype(float)
    test[name]=test[name].astype(float)
    train[name].fillna(np.median(temp[name]),inplace=True)
    test[name].fillna(np.median(temp[name]),inplace=True)
    return train,test

def gen_average_units(train,test,col,name):
    temp=train.groupby(col)['units_sold'].mean().reset_index().rename(columns={'units_sold':name})
    train=pd.merge(train,temp,how='left',on=col)
    test=pd.merge(test,temp,how='left',on=col)
    train[name].fillna(np.median(temp[name]),inplace=True)
    test[name].fillna(np.median(temp[name]),inplace=True)
    return train,test

def gen_average_price(train,test,col,price='base_price',name='name'):
    temp=train.groupby(col)[price].mean().reset_index().rename(columns={price:name})
    train=pd.merge(train,temp,how='left',on=col)
    test=pd.merge(test,temp,how='left',on=col)
    train[name].fillna(np.median(temp[name]),inplace=True)
    test[name].fillna(np.median(temp[name]),inplace=True)
    return train,test

In [16]:
train,test = gen_count_id(train,test,col=['sku_id','store_id'],name='count_id_sku_store') #Genearting count of records per 'sku-id & store-id' 
train,test = gen_count_id(train,test,col=['sku_id'],name='count_id_sku') #Genearting count of records per 'sku-id'
train,test = gen_count_id(train,test,col=['store_id'],name='count_id_store') #Genearting count of records per 'store-id'

train,test = gen_average_units(train,test,col=['sku_id','store_id'],name='count_sku_store_id') #Genearting average units sold per 'sku-id & store-id'
train,test = gen_average_units(train,test,col=['store_id'],name='count_store_id') #Genearting average units sold per 'store-id'
train,test = gen_average_units(train,test,col=['sku_id'],name='count_sku_id') #Genearting average units sold per 'sku-id'

train,test = gen_average_price(train,test,col=['sku_id','store_id'],price='base_price',name='price_sku_store') #Genearting average base price per 'sku-id & store-id'
train,test = gen_average_price(train,test,col=['sku_id','store_id'],price='total_price',name='price_to_sku_store') #Genearting average total price per 'sku-id & store-id'
train,test = gen_average_price(train,test,col=['store_id'],price='base_price',name='price_store_id') #Genearting average base price per 'store-id'
train,test = gen_average_price(train,test,col=['sku_id'],price='base_price',name='price_sku_id') #Genearting average base price per 'sku-id'
train,test = gen_average_price(train,test,col=['store_id'],price='total_price',name='price_to_store_id') #Genearting average total price per 'store-id'
train,test = gen_average_price(train,test,col=['sku_id'],price='total_price',name='price_to_sku_id') #Genearting average total price per 'sku-id'

In [17]:
#Converting week feature
le = OrdinalEncoder()
train['week_1']=le.fit_transform(train['week'])
le = OrdinalEncoder()
test['week_1']=le.fit_transform(test['week'])+130

#Creating week number feature
train['week_num']=train.week_1%52
test['week_num']=test.week_1%52

train['week_num1']=train.week_1%4
test['week_num1']=test.week_1%4

# Encoding 'week' it using sine and cosine transform; considering it as a cyclic feature 
train['week_sin'] = np.sin(2 * np.pi * train['week_1'] / 52.143)
train['week_cos'] = np.cos(2 * np.pi * train['week_1'] / 52.143)
test['week_sin'] = np.sin(2 * np.pi * test['week_1'] / 52.143)
test['week_cos'] = np.cos(2 * np.pi * test['week_1'] / 52.143)

#Creating feature: percent difference between base price and checkout price.
train['price_diff_percent'] = (train['base_price'] - train['total_price']) / train['base_price']
test['price_diff_percent'] = (test['base_price'] - test['total_price']) / test['base_price']

In [18]:
train.tail()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,count_id_sku_store,count_id_sku,count_id_store,count_sku_store_id,count_store_id,count_sku_id,price_sku_store,price_to_sku_store,price_store_id,price_sku_id,price_to_store_id,price_to_sku_id,week_1,week_num,week_num1,week_sin,week_cos,price_diff_percent
150145,212638,09/07/13,9984,223245,235.8375,235.8375,0,0,38,130.0,9490.0,2210.0,61.384615,37.853394,68.510537,221.423077,207.69375,197.030107,214.878438,186.580537,203.073612,130,26,2,0.043065,-0.999072,0.0
150146,212639,09/07/13,9984,223153,235.8375,235.8375,0,0,30,130.0,7540.0,2210.0,53.023077,37.853394,60.40756,226.465385,211.892019,197.030107,220.461485,186.580537,206.303241,130,26,2,0.043065,-0.999072,0.0
150147,212642,09/07/13,9984,245338,357.675,483.7875,1,1,31,130.0,8320.0,2210.0,21.553846,37.853394,33.212139,467.838462,425.066538,197.030107,476.750449,186.580537,432.766248,130,26,2,0.043065,-0.999072,0.260677
150148,212643,09/07/13,9984,547934,141.7875,191.6625,0,1,12,130.0,4030.0,2210.0,14.1,37.853394,21.838213,172.644231,162.954231,197.030107,174.616247,186.580537,166.385369,130,26,2,0.043065,-0.999072,0.260223
150149,212644,09/07/13,9984,679023,234.4125,234.4125,0,0,15,130.0,1170.0,2210.0,13.861538,37.853394,16.617094,208.154135,197.033654,197.030107,209.117532,186.580537,198.309455,130,26,2,0.043065,-0.999072,0.0


In [19]:
test.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,count_id_sku_store,count_id_sku,count_id_store,count_sku_store_id,count_store_id,count_sku_id,price_sku_store,price_to_sku_store,price_store_id,price_sku_id,price_to_store_id,price_to_sku_id,week_1,week_num,week_num1,week_sin,week_cos,price_diff_percent
0,212645,16/07/13,8091,216418,108.3,108.3,0,0,130.0,8840.0,1170.0,26.376923,32.805983,88.923869,105.800769,102.117692,181.312372,94.688268,172.272756,91.982702,131,27,3,-0.077343,-0.997005,0.0
1,212646,16/07/13,8091,216419,109.0125,109.0125,0,0,130.0,8710.0,1170.0,28.307692,32.805983,72.182664,105.839135,102.473942,181.312372,94.908763,172.272756,92.349162,131,27,3,-0.077343,-0.997005,0.0
2,212647,16/07/13,8091,216425,133.95,133.95,0,0,130.0,8580.0,1170.0,25.669231,32.805983,34.019231,131.204135,126.671538,181.312372,128.28521,172.272756,125.156355,131,27,3,-0.077343,-0.997005,0.0
3,212648,16/07/13,8091,216233,133.95,133.95,0,0,130.0,9620.0,1170.0,29.107692,32.805983,46.821206,129.916154,126.167308,181.312372,128.101871,172.272756,124.942208,131,27,3,-0.077343,-0.997005,0.0
4,212649,16/07/13,8091,217390,176.7,176.7,0,0,130.0,9100.0,1170.0,29.469231,32.805983,62.312747,164.439519,152.968269,181.312372,158.990538,172.272756,150.867334,131,27,3,-0.077343,-0.997005,0.0


In [20]:
X=train[list(set(train.columns)-set(['record_ID','units_sold','week']))]
Y= np.log1p(train['units_sold'])
X_test=test[list(set(test.columns)-set(['record_ID','week']))]

In [21]:
X.head()

Unnamed: 0,count_id_store,count_id_sku_store,week_1,is_featured_sku,price_store_id,price_to_store_id,count_sku_store_id,week_num1,price_diff_percent,week_sin,price_to_sku_id,count_sku_id,is_display_sku,base_price,count_id_sku,week_cos,total_price,price_to_sku_store,sku_id,price_sku_store,store_id,week_num,count_store_id,price_sku_id
0,1170.0,130.0,1,0,181.312372,172.272756,26.376923,1,0.11465,0.120208,91.982702,88.923869,0,111.8625,8840.0,0.992749,99.0375,102.117692,216418,105.800769,8091,1,32.805983,94.688268
1,1170.0,130.0,1,0,181.312372,172.272756,28.307692,1,0.0,0.120208,92.349162,72.182664,0,99.0375,8710.0,0.992749,99.0375,102.473942,216419,105.839135,8091,1,32.805983,94.908763
2,1170.0,130.0,1,0,181.312372,172.272756,25.669231,1,0.0,0.120208,125.156355,34.019231,0,133.95,8580.0,0.992749,133.95,126.671538,216425,131.204135,8091,1,32.805983,128.28521
3,1170.0,130.0,1,0,181.312372,172.272756,29.107692,1,0.0,0.120208,124.942208,46.821206,0,133.95,9620.0,0.992749,133.95,126.167308,216233,129.916154,8091,1,32.805983,128.101871
4,1170.0,130.0,1,0,181.312372,172.272756,29.469231,1,0.0,0.120208,150.867334,62.312747,0,141.075,9100.0,0.992749,141.075,152.968269,217390,164.439519,8091,1,32.805983,158.990538


In [22]:
X.dtypes

count_id_store        float64
count_id_sku_store    float64
week_1                  int64
is_featured_sku         int64
price_store_id        float64
price_to_store_id     float64
count_sku_store_id    float64
week_num1               int64
price_diff_percent    float64
week_sin              float64
price_to_sku_id       float64
count_sku_id          float64
is_display_sku          int64
base_price            float64
count_id_sku          float64
week_cos              float64
total_price           float64
price_to_sku_store    float64
sku_id                  int64
price_sku_store       float64
store_id                int64
week_num                int64
count_store_id        float64
price_sku_id          float64
dtype: object

In [23]:
X=X.astype({'sku_id': 'category','store_id': 'category'})
X_test=X_test.astype({'sku_id': 'category','store_id': 'category'})

In [24]:
print(len(X_test.columns))
print(len(X.columns))

24
24


In [25]:
print(X_test.isna().sum().sum())
print(X.isna().sum().sum())

0
0


In [26]:
category_list=['store_id','sku_id']

In [27]:
encoder_final=MEstimateEncoder()
encoder_final.fit(X[category_list], Y)

cat_enc = encoder_final.transform(X[category_list], Y)
continuous_train = X.drop(columns= category_list)
X = pd.concat([cat_enc,continuous_train],axis=1)

test_enc=encoder_final.transform(X_test[category_list])
continuous_test=X_test.drop(columns= category_list)
X_test=pd.concat([test_enc,continuous_test],axis=1)

In [28]:
X.head()

Unnamed: 0,store_id,sku_id,count_id_store,count_id_sku_store,week_1,is_featured_sku,price_store_id,price_to_store_id,count_sku_store_id,week_num1,price_diff_percent,week_sin,price_to_sku_id,count_sku_id,is_display_sku,base_price,count_id_sku,week_cos,total_price,price_to_sku_store,price_sku_store,week_num,count_store_id,price_sku_id
0,3.250779,4.201821,1170.0,130.0,1,0,181.312372,172.272756,26.376923,1,0.11465,0.120208,91.982702,88.923869,0,111.8625,8840.0,0.992749,99.0375,102.117692,105.800769,1,32.805983,94.688268
1,3.250779,4.051982,1170.0,130.0,1,0,181.312372,172.272756,28.307692,1,0.0,0.120208,92.349162,72.182664,0,99.0375,8710.0,0.992749,99.0375,102.473942,105.839135,1,32.805983,94.908763
2,3.250779,3.303344,1170.0,130.0,1,0,181.312372,172.272756,25.669231,1,0.0,0.120208,125.156355,34.019231,0,133.95,8580.0,0.992749,133.95,126.671538,131.204135,1,32.805983,128.28521
3,3.250779,3.715659,1170.0,130.0,1,0,181.312372,172.272756,29.107692,1,0.0,0.120208,124.942208,46.821206,0,133.95,9620.0,0.992749,133.95,126.167308,129.916154,1,32.805983,128.101871
4,3.250779,3.850743,1170.0,130.0,1,0,181.312372,172.272756,29.469231,1,0.0,0.120208,150.867334,62.312747,0,141.075,9100.0,0.992749,141.075,152.968269,164.439519,1,32.805983,158.990538


In [29]:
X_test.head()

Unnamed: 0,store_id,sku_id,count_id_store,count_id_sku_store,week_1,is_featured_sku,price_store_id,price_to_store_id,count_sku_store_id,week_num1,price_diff_percent,week_sin,price_to_sku_id,count_sku_id,is_display_sku,base_price,count_id_sku,week_cos,total_price,price_to_sku_store,price_sku_store,week_num,count_store_id,price_sku_id
0,3.250779,4.201821,1170.0,130.0,131,0,181.312372,172.272756,26.376923,3,0.0,-0.077343,91.982702,88.923869,0,108.3,8840.0,-0.997005,108.3,102.117692,105.800769,27,32.805983,94.688268
1,3.250779,4.051982,1170.0,130.0,131,0,181.312372,172.272756,28.307692,3,0.0,-0.077343,92.349162,72.182664,0,109.0125,8710.0,-0.997005,109.0125,102.473942,105.839135,27,32.805983,94.908763
2,3.250779,3.303344,1170.0,130.0,131,0,181.312372,172.272756,25.669231,3,0.0,-0.077343,125.156355,34.019231,0,133.95,8580.0,-0.997005,133.95,126.671538,131.204135,27,32.805983,128.28521
3,3.250779,3.715659,1170.0,130.0,131,0,181.312372,172.272756,29.107692,3,0.0,-0.077343,124.942208,46.821206,0,133.95,9620.0,-0.997005,133.95,126.167308,129.916154,27,32.805983,128.101871
4,3.250779,3.850743,1170.0,130.0,131,0,181.312372,172.272756,29.469231,3,0.0,-0.077343,150.867334,62.312747,0,176.7,9100.0,-0.997005,176.7,152.968269,164.439519,27,32.805983,158.990538


# Model Building

In [30]:
X.columns

Index(['store_id', 'sku_id', 'count_id_store', 'count_id_sku_store', 'week_1',
       'is_featured_sku', 'price_store_id', 'price_to_store_id',
       'count_sku_store_id', 'week_num1', 'price_diff_percent', 'week_sin',
       'price_to_sku_id', 'count_sku_id', 'is_display_sku', 'base_price',
       'count_id_sku', 'week_cos', 'total_price', 'price_to_sku_store',
       'price_sku_store', 'week_num', 'count_store_id', 'price_sku_id'],
      dtype='object')

In [31]:
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size = 0.2,random_state=23)

In [32]:
len(x_train.columns)

24

In [33]:
rf_base = RandomForestRegressor()
rf_base.fit(x_train,y_train)


rf_tuned = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=600,
                      n_jobs=None, oob_score=True, random_state=None,
                      verbose=0, warm_start=False)
rf_tuned.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=600, n_jobs=None, oob_score=True,
                      random_state=None, verbose=0, warm_start=False)

In [34]:
model_lgb_base=lgb.LGBMRegressor(objective='regression')
model_lgb_base.fit(x_train,y_train)

model_lgb_tuned=lgb.LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.1, max_depth=30,
              min_child_samples=20, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, n_estimators=200, n_jobs=-1,
              num_leaves=1200, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

model_lgb_tuned.fit(x_train,y_train)

LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.1, max_depth=30,
              min_child_samples=20, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, n_estimators=200, n_jobs=-1,
              num_leaves=1200, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [35]:
prediction_rfb_valid=rf_base.predict(x_valid)
prediction_rft_valid=rf_tuned.predict(x_valid)
prediction_lgbmb_valid=model_lgb_base.predict(x_valid)
prediction_lgbmt_valid=model_lgb_tuned.predict(x_valid)

rf_base_msle=100*mean_squared_log_error(y_valid,prediction_rfb_valid)
rf_tuned_msle=100*mean_squared_log_error(y_valid,prediction_rft_valid)
lgbm_base_msle=100*mean_squared_log_error(y_valid,prediction_lgbmb_valid)
lgbm_tuned_msle=100*mean_squared_log_error(y_valid,prediction_lgbmt_valid)

prediction_ensemble_base=(((1-rf_base_msle)*prediction_rfb_valid)+((1-lgbm_base_msle)*prediction_lgbmb_valid))/(2-rf_base_msle-lgbm_base_msle)
prediction_ensemble_tuned=(((1-rf_tuned_msle)*prediction_rft_valid)+((1-lgbm_tuned_msle)*prediction_lgbmt_valid))/(2-rf_tuned_msle-lgbm_tuned_msle)

ensemble_base_msle=100*mean_squared_log_error(y_valid,prediction_ensemble_base)
ensemble_tuned_msle=100*mean_squared_log_error(y_valid,prediction_ensemble_tuned)


print("RF Base: {}; RF Tuned: {}".format(rf_base_msle,rf_tuned_msle))
print("LGBM Base: {}; LGBM Tuned: {}".format(lgbm_base_msle,lgbm_tuned_msle))
print("Ensemble Base: {}; Ensemble Tuned: {}".format(ensemble_base_msle,ensemble_tuned_msle))

RF Base: 0.8342987367398164; RF Tuned: 0.8484082177813135
LGBM Base: 0.9018330274446573; LGBM Tuned: 0.7321971492038865
Ensemble Base: 0.8151712987896794; Ensemble Tuned: 0.7477485765001536


In [36]:
model = lgb.LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.1, max_depth=30,
              min_child_samples=20, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, n_estimators=100, n_jobs=-1,
              num_leaves=1400, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

model.fit(X,Y)

LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.1, max_depth=30,
              min_child_samples=20, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, n_estimators=100, n_jobs=-1,
              num_leaves=1400, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

# Final Submission 

In [37]:
X_test.head()

Unnamed: 0,store_id,sku_id,count_id_store,count_id_sku_store,week_1,is_featured_sku,price_store_id,price_to_store_id,count_sku_store_id,week_num1,price_diff_percent,week_sin,price_to_sku_id,count_sku_id,is_display_sku,base_price,count_id_sku,week_cos,total_price,price_to_sku_store,price_sku_store,week_num,count_store_id,price_sku_id
0,3.250779,4.201821,1170.0,130.0,131,0,181.312372,172.272756,26.376923,3,0.0,-0.077343,91.982702,88.923869,0,108.3,8840.0,-0.997005,108.3,102.117692,105.800769,27,32.805983,94.688268
1,3.250779,4.051982,1170.0,130.0,131,0,181.312372,172.272756,28.307692,3,0.0,-0.077343,92.349162,72.182664,0,109.0125,8710.0,-0.997005,109.0125,102.473942,105.839135,27,32.805983,94.908763
2,3.250779,3.303344,1170.0,130.0,131,0,181.312372,172.272756,25.669231,3,0.0,-0.077343,125.156355,34.019231,0,133.95,8580.0,-0.997005,133.95,126.671538,131.204135,27,32.805983,128.28521
3,3.250779,3.715659,1170.0,130.0,131,0,181.312372,172.272756,29.107692,3,0.0,-0.077343,124.942208,46.821206,0,133.95,9620.0,-0.997005,133.95,126.167308,129.916154,27,32.805983,128.101871
4,3.250779,3.850743,1170.0,130.0,131,0,181.312372,172.272756,29.469231,3,0.0,-0.077343,150.867334,62.312747,0,176.7,9100.0,-0.997005,176.7,152.968269,164.439519,27,32.805983,158.990538


In [38]:
prediction=model.predict(X_test) 

In [39]:
final_prediction=np.round(np.expm1(prediction))
submission['units_sold']=final_prediction

In [40]:
submission.head()

Unnamed: 0,record_ID,units_sold
0,212645,17.0
1,212646,19.0
2,212647,28.0
3,212648,30.0
4,212649,21.0


In [42]:
submission.to_csv('final.csv',index=False)