# Sales Forecasting

This notebook trains machine learning models for sales forecasting. We consider XGBoost, LightGBM, and a blending model.

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
sns.set(style="darkgrid")

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd 
import re
from itertools import product
import time

In [3]:
# load data
# Read the competition CSV files from the local data directory: the item list,
# the shop list, the item categories, the sales history and the test set.
items=pd.read_csv("competitive-data-science-predict-future-sales/items.csv")
shops=pd.read_csv("competitive-data-science-predict-future-sales/shops.csv")
cats=pd.read_csv("competitive-data-science-predict-future-sales/item_categories.csv")
train=pd.read_csv("competitive-data-science-predict-future-sales/sales_train.csv")
test=pd.read_csv("competitive-data-science-predict-future-sales/test.csv")

We clean the data: we remove outliers and clip the daily sales counts to the range [0, 20].

In [4]:
#drop shops 9 and 20 from training set, they don't appear in the test set
train = train.drop(labels = train[train.shop_id == 9].index, axis = 0)
train = train.drop(labels = train[train.shop_id == 20].index, axis = 0)

#get rid of obvious outliers
# (only rows with a price below 300000 and a daily count below 1000 are kept)
train = train[(train.item_price < 300000 )& (train.item_cnt_day < 1000)]

# Negative daily counts are set to zero (the sign-flip alternative is left in the trailing comment).
train.loc[train.item_cnt_day < 0, "item_cnt_day"] = 0 # or -train.loc[train.item_cnt_day < 0, "item_cnt_day"]
# Every count below 1 is zeroed as well; this condition also covers the negative case above.
train.loc[train.item_cnt_day < 1, "item_cnt_day"] = 0

# Keep a copy of the counts before the target column is clipped below.
train['item_cnt_day_unclipped'] = train['item_cnt_day']

#We want to clip the target value before aggregating so that mean values are not distorted due to outliers. We retain the unclipped value for use in features that do not aggregate the sales data.
train['item_cnt_day'] = train['item_cnt_day'].clip(0, 20)   #early clipping helps get better results

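Check what we have in the test set. We output how many test rows fall into each of three groups: shop/item pairs with sales history, items with no history at all, and items that were sold before but not in that shop. This is discussed in Section 2.3 of the report.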

In [5]:
# Classify every test row by how much sales history the training data holds for it.
pair_keys = ['item_id','shop_id']
# The left join leaves NaN in the train columns for (item, shop) pairs without any
# matching training row, so dropna() keeps only the test rows that found a match.
good_sales = pd.merge(test, train, how='left', on=pair_keys).dropna()
has_pair_history = test['ID'].isin(good_sales['ID'])
good_pairs = test.loc[has_pair_history]   #so for these test sample we have training data
item_seen_in_train = test['item_id'].isin(train['item_id'])
no_data_items = test.loc[~item_seen_in_train]   #these items are not in the training set

n_good_pairs, n_no_data = len(good_pairs), len(no_data_items)
print('1. Number of good pairs:', n_good_pairs)
print('2. No Data Items:', n_no_data)
#for the remaining rows the item is in the training set but the item/shop combination is not
print('3. Only Item_id Info:', len(test)-n_no_data-n_good_pairs)

1. Number of good pairs: 111404
2. No Data Items: 16086
3. Only Item_id Info: 86710


We are handling the shop duplicates now

In [6]:
#there are duplicate shops

# old shop id -> shop id it is merged into
#    0 -> 57 : Yakutsk, Ordzhonikidze 56
#    1 -> 58 : Yakutsk, "Tsentralny" shopping centre
#   10 -> 11 : Zhukovsky, Chkalova St. 39m²
duplicate_shop_ids = {0: 57, 1: 58, 10: 11}
for old_shop_id, new_shop_id in duplicate_shop_ids.items():
    # the same relabelling is applied to train and test so that their ids stay aligned
    train.loc[train.shop_id == old_shop_id, 'shop_id'] = new_shop_id
    test.loc[test.shop_id == old_shop_id, 'shop_id'] = new_shop_id

We now extract properties like city, item/shop category and item names from the data

In [7]:
#define some properties of the shops (city, category, name)
# The first space-separated token of the shop name is used as the city and the second
# as the shop category, so the two-word city below is first collapsed into one token.
shops.loc[ shops.shop_name == 'Сергиев Посад ТЦ "7Я"',"shop_name" ] = 'СергиевПосад ТЦ "7Я"'
name_tokens = shops.shop_name.str.split(" ")
shops["city"] = name_tokens.map( lambda tokens: tokens[0] )
shops["category"] = name_tokens.map( lambda tokens: tokens[1] )
shops.loc[shops.city == "!Якутск", "city"] = "Якутск"   # remove the leading "!" from this city name

# A shop category is kept only if at least 5 shops share it; all others become "other".
category_counts = shops.category.value_counts()
category = [label for label in shops.category.unique() if category_counts[label] >= 5]
shops.category = shops.category.where( shops.category.isin(category), "other" )

# Integer-encode both text features and keep only the id plus the encoded columns.
shops["shop_category"] = LabelEncoder().fit_transform( shops.category )
shops["shop_city"] = LabelEncoder().fit_transform( shops.city )
shops = shops[["shop_id", "shop_category", "shop_city"]]

#define some properties of the items

# Coarse item type = first space-separated token of the category name.
cats["type_code"] = cats.item_category_name.apply( lambda x: x.split(" ")[0] ).astype(str)
# Fold the "Игровые" and "Аксессуары" types into "Игры". The relabel must be written to
# type_code itself: writing it to a separate "category" column (which is never read and is
# discarded by the final column selection below) leaves the encoded feature unchanged.
cats.loc[ (cats.type_code == "Игровые")| (cats.type_code == "Аксессуары"), "type_code" ] = "Игры"

# Keep a type only if at least 5 categories share it; rarer types are pooled as "etc".
category = []
for cat in cats.type_code.unique():
    if len(cats[cats.type_code == cat]) >= 5: 
        category.append( cat )
cats.type_code = cats.type_code.apply(lambda x: x if (x in category) else "etc")

cats.type_code = LabelEncoder().fit_transform(cats.type_code)
# Subtype = text after the first "-" of the category name (or the whole name when there is no "-").
cats["split"] = cats.item_category_name.apply(lambda x: x.split("-"))
cats["subtype"] = cats.split.apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats["subtype_code"] = LabelEncoder().fit_transform( cats["subtype"] )
cats = cats[["item_category_id", "subtype_code", "type_code"]]

We are cleaning the item data, there are some inconsistencies in the string variables describing the names

In [8]:
def name_correction(x):
    """Normalise an item name for duplicate detection.

    The name is lower-cased, truncated at the first '[' and at the first '(',
    every run of characters outside A-Z, a-z, 0-9 and А-Я/а-я is replaced by a
    single space, and surrounding whitespace is removed.
    """
    lowered = x.lower()
    # keep only the text in front of the first square bracket and the first parenthesis
    head = lowered.split('[', 1)[0].split('(', 1)[0]
    cleaned = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', head)
    return cleaned.replace('  ', ' ').strip()

In [9]:
# split item names by first bracket
# n=1 / expand=True give exactly two columns: the text before and after the first bracket
# (the second column is missing for names without that bracket). Passing n by keyword and
# expanding explicitly avoids unpacking the .str accessor, which recent pandas no longer supports.
bracket_split = items.item_name.str.split("[", n=1, expand=True)
items["name1"], items["name2"] = bracket_split[0], bracket_split[1]
paren_split = items.item_name.str.split("(", n=1, expand=True)
items["name1"], items["name3"] = paren_split[0], paren_split[1]

# replace special characters and turn to lower case
# regex=True is required: the pattern is a regular expression, and pandas versions whose
# default is regex=False would search for the pattern text literally and replace nothing.
items["name2"] = items.name2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
items["name3"] = items.name3.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()

# fill nulls with '0'
items = items.fillna('0')

items["item_name"] = items["item_name"].apply(lambda x: name_correction(x))

# return all characters except the last if name 2 is not "0" - the closing bracket
items.name2 = items.name2.apply( lambda x: x[:-1] if x !="0" else "0")



# Temporary "type" column: the first 8 characters of name2 when its first word is "xbox",
# otherwise the first word of name2. The spelling variants are then unified.
items["type"] = items.name2.apply(lambda x: x[0:8] if x.split(" ")[0] == "xbox" else x.split(" ")[0] )
items.loc[(items.type == "x360") | (items.type == "xbox360") | (items.type == "xbox 360") ,"type"] = "xbox 360"
items.loc[ items.type == "", "type"] = "mac"
items.type = items.type.apply( lambda x: x.replace(" ", "") )
items.loc[ (items.type == 'pc' )| (items.type == 'pс') | (items.type == "pc"), "type" ] = "pc"
items.loc[ items.type == 'рs3' , "type"] = "ps3"


# Collect the types that occur for fewer than 40 items ...
group_sum = items.groupby(["type"]).agg({"item_id": "count"})
group_sum = group_sum.reset_index()
drop_cols = []
for cat in group_sum.type.unique():
    if group_sum.loc[(group_sum.type == cat), "item_id"].values[0] <40:
        drop_cols.append(cat)
# ... and relabel name2 as "other" wherever name2 itself equals one of those rare types.
items.name2 = items.name2.apply( lambda x: "other" if (x in drop_cols) else x )
items = items.drop(["type"], axis = 1)

We now find that there are possible item duplicates in the data. For potential duplicates, we check whether their item IDs are present in the test set. If both IDs exist in the test
set, we leave the data unchanged. When only one of the IDs is in the test set, we assign
the other item to this ID. By doing so, we reduce the cases of items in the test set for which we
have no sales history to 7% (i.e., by 630 items; see Section 2.3 in the report). If neither of the IDs is in the test
set, we assign both items to the same ID.

In [10]:
#Duplicate rows exist in the item list. The following cell creates a dictionary that will allow us to reassign item id's where appropriate.

# .copy() makes dupes an independent frame; adding a column to a slice of `items`
# triggers pandas' SettingWithCopyWarning.
dupes = items[(items.duplicated(subset=['item_name','item_category_id'],keep=False))].copy()   #gets the duplicate rows


dupes['in_test'] = dupes.item_id.isin(test.item_id.unique())   #checks if one of the duplicate items is in the test set

# NOTE(review): duplicates are detected on (item_name, item_category_id) but grouped by
# item_name only, and just the first/last id of each group is used - confirm that is intended.
dupes = dupes.groupby('item_name').agg({'item_id':['first','last'],'in_test':['first','last']})  #puts the duplicates in the same row

#if both item id's are in the test set do nothing
dupes = dupes[(dupes[('in_test', 'first')]==False) | (dupes[('in_test', 'last')]==False)]  #we only consider cases where one is not in the test set

#if only the first id is in the test set assign this id to both
temp = dupes[dupes[('in_test', 'first')]==True]
keep_first = dict(zip(temp[('item_id', 'last')], temp[('item_id',  'first')]))
#if neither id or only the second id is in the test set, assign the second id to both
temp = dupes[dupes[('in_test', 'first')]==False]
keep_second = dict(zip(temp[('item_id', 'first')], temp[('item_id',  'last')]))
# old item id -> item id that replaces it in the training data
item_map = {**keep_first, **keep_second}

train = (train.replace({'item_id': item_map}))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dupes['in_test'] = dupes.item_id.isin(test.item_id.unique())   #checks if one of the dublicate items is in the test set


In [11]:
# Repeat the test-set coverage check on the current training data.
pair_keys = ['item_id','shop_id']
# The left join leaves NaN in the train columns for (item, shop) pairs without any
# matching training row, so dropna() keeps only the test rows that found a match.
good_sales = pd.merge(test, train, how='left', on=pair_keys).dropna()
has_pair_history = test['ID'].isin(good_sales['ID'])
good_pairs = test.loc[has_pair_history]   #so for these test sample we have training data
item_seen_in_train = test['item_id'].isin(train['item_id'])
no_data_items = test.loc[~item_seen_in_train]   #these items are not in the training set

n_good_pairs, n_no_data = len(good_pairs), len(no_data_items)
print('1. Number of good pairs:', n_good_pairs)
print('2. No Data Items:', n_no_data)
#for the remaining rows the item is in the training set but the item/shop combination is not
print('3. Only Item_id Info:', len(test)-n_no_data-n_good_pairs)

1. Number of good pairs: 113143
2. No Data Items: 15456
3. Only Item_id Info: 85601


In [12]:
# Integer-encode the two name-derived text features, then drop the text columns
# (item_name, name1) that are not used as features.
items.name2 = LabelEncoder().fit_transform(items.name2)
items.name3 = LabelEncoder().fit_transform(items.name3)
items.drop(["item_name", "name1"],axis = 1, inplace= True)

#### Feature engineering
We now perform the feature engineering process, in which we add properties to the data that are useful for the forecasting task. This is similar to what is done in the other notebooks. A description of all the features is given in Table A1 in the report.

In [13]:
ts = time.time()
# Build the base grid: for each of the 34 monthly blocks, every combination of a shop
# and an item that have at least one training row in that block.
matrix = []
cols  = ["date_block_num", "shop_id", "item_id"]
for i in range(34):
    sales = train[train.date_block_num == i]
    matrix.append( np.array(list( product( [i], sales.shop_id.unique(), sales.item_id.unique() ) ), dtype = np.int16) )

# Stack the per-month grids into one frame and downcast the key columns to save memory.
matrix = pd.DataFrame( np.vstack(matrix), columns = cols )
matrix["date_block_num"] = matrix["date_block_num"].astype(np.int8)
matrix["shop_id"] = matrix["shop_id"].astype(np.int8)
matrix["item_id"] = matrix["item_id"].astype(np.int16)
matrix.sort_values( cols, inplace = True )
time.time()- ts

# add revenue to train df
# (item_cnt_day has already been clipped to [0, 20] at this point)
train["revenue"] = train["item_cnt_day"] * train["item_price"]

In [14]:
ts = time.time()
# Monthly target: sum of the (clipped) daily counts per month / shop / item.
group = train.groupby( ["date_block_num", "shop_id", "item_id"] ).agg( {"item_cnt_day": ["sum"]} )
group.columns = ["item_cnt_month"]
group.reset_index( inplace = True)
matrix = pd.merge( matrix, group, on = cols, how = "left" )
# Grid rows without any sale in that month get a target of 0.
matrix["item_cnt_month"] = matrix["item_cnt_month"].fillna(0).astype(np.float16)
time.time() - ts


# The test set is treated as month 34, directly after the last training block (33).
test["date_block_num"] = 34
test["date_block_num"] = test["date_block_num"].astype(np.int8)
test["shop_id"] = test.shop_id.astype(np.int8)
test["item_id"] = test.item_id.astype(np.int16)


ts = time.time()
# Append the test rows below the training grid. `keys` is not passed: it labels the
# concatenated frames (one key per frame), is discarded when ignore_index=True, and the
# three column names formerly passed did not match the two frames being concatenated.
matrix = pd.concat([matrix, test.drop(["ID"],axis = 1)], ignore_index=True, sort=False)
# The appended test rows have no target yet; fill it (and any other gap) with 0.
matrix.fillna( 0, inplace = True )
time.time() - ts


ts = time.time()
# Attach the item-level and category-level features and downcast them.
matrix = pd.merge(matrix, items, on = ["item_id"], how = "left")
matrix = pd.merge( matrix, cats, on = ["item_category_id"], how = "left" )
matrix["item_category_id"] = matrix["item_category_id"].astype(np.int8)
matrix["subtype_code"] = matrix["subtype_code"].astype(np.int8)
# NOTE(review): int8 holds at most 127 - confirm the number of encoded name2 labels fits.
matrix["name2"] = matrix["name2"].astype(np.int8)
matrix["name3"] = matrix["name3"].astype(np.int16)
matrix["type_code"] = matrix["type_code"].astype(np.int8)
time.time() - ts

2.283644914627075

In [15]:
#clustering shops
# Each shop is described by the share of its total item_cnt_month that falls into each
# item category; KMeans then groups shops with similar category shares.
# Start from every (shop, category) combination so that missing combinations become 0 below.
shops_cats = pd.DataFrame(
    np.array(list(product(*[train['shop_id'].unique(), matrix['item_category_id'].unique()]))),
    columns =['shop_id', 'item_category_id']
)
# Sales per (category, shop), divided by the shop's total sales.
temp = matrix.groupby(['item_category_id', 'shop_id']).agg({'item_cnt_month':'sum'}).reset_index()
temp2 = temp.groupby('shop_id').agg({'item_cnt_month':'sum'}).rename(columns={'item_cnt_month':'shop_total'})
temp = temp.join(temp2, on='shop_id')
temp['category_proportion'] = temp['item_cnt_month']/temp['shop_total']
temp = temp[['shop_id', 'item_category_id', 'category_proportion']]
shops_cats = pd.merge(shops_cats, temp, on=['shop_id','item_category_id'], how='left')
shops_cats = shops_cats.fillna(0)

# One row per shop, one column per category; random_state fixes the clustering result.
shops_cats = shops_cats.pivot(index='shop_id', columns=['item_category_id'])
kmeans = KMeans(n_clusters=7, random_state=0).fit(shops_cats)
shops_cats['shop_cluster'] = kmeans.labels_.astype('int8')

#adding these clusters to the shops dataframe
shops = shops.join(shops_cats['shop_cluster'], on='shop_id')


# Attach all shop-level features (category, city, cluster) to the matrix.
matrix = pd.merge( matrix, shops, on = ["shop_id"], how = "left" )
matrix["shop_city"] = matrix["shop_city"].astype(np.int8)
matrix["shop_category"] = matrix["shop_category"].astype(np.int8)

In [16]:
# Define a lag feature function
def lag_feature( df,lags, cols ):
    """Append lagged copies of the given columns to ``df``.

    For every column in ``cols`` and every lag ``k`` in ``lags`` a new column
    ``<col>_lag_<k>`` is added. It holds the value the same (shop_id, item_id)
    pair had ``k`` month blocks earlier, or NaN when no such row exists. The name
    of each processed column is printed. The extended frame is returned.
    """
    keys = ["date_block_num", "shop_id", "item_id"]
    for col in cols:
        print(col)
        base = df[keys + [col]]
        for lag in lags:
            # Moving the month index forward by `lag` makes a row from month m line up
            # with month m + lag in the left join below.
            lagged = base.assign(date_block_num=base["date_block_num"] + lag)
            lagged.columns = keys + [col + "_lag_"+str(lag)]
            df = pd.merge(df, lagged, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [17]:
ts = time.time()
# Lags 1-3 of the target itself.
matrix = lag_feature( matrix, [1,2,3], ["item_cnt_month"] )
time.time() - ts


ts = time.time()
# Mean target per month over all shops and items, used as a 1-month lag only.
# The un-lagged column is dropped because it is computed from the current month's target.
group = matrix.groupby( ["date_block_num"] ).agg({"item_cnt_month" : ["mean"]})
group.columns = ["date_avg_item_cnt"]
group.reset_index(inplace = True)
matrix = pd.merge(matrix, group, on = ["date_block_num"], how = "left")
matrix.date_avg_item_cnt = matrix["date_avg_item_cnt"].astype(np.float16)
matrix = lag_feature( matrix, [1], ["date_avg_item_cnt"] )
matrix.drop( ["date_avg_item_cnt"], axis = 1, inplace = True )
time.time() - ts


ts = time.time()
# Mean target per month and item, lags 1-3.
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix.date_item_avg_item_cnt = matrix['date_item_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1,2,3], ['date_item_avg_item_cnt'])
matrix.drop(['date_item_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts


ts = time.time()
# Mean target per month and shop, lags 1-4.
group = matrix.groupby( ["date_block_num","shop_id"] ).agg({"item_cnt_month" : ["mean"]})
group.columns = ["date_shop_avg_item_cnt"]
group.reset_index(inplace = True)
matrix = pd.merge(matrix, group, on = ["date_block_num","shop_id"], how = "left")
# Downcast the column that was just merged. Assigning to `matrix.date_avg_item_cnt`
# (a column that no longer exists at this point) would only set a plain attribute on
# the frame and leave date_shop_avg_item_cnt un-cast.
matrix["date_shop_avg_item_cnt"] = matrix["date_shop_avg_item_cnt"].astype(np.float16)
matrix = lag_feature( matrix, [1,2,3,4], ["date_shop_avg_item_cnt"] )
# The un-lagged column is computed from the current month's target, so it is dropped.
matrix.drop( ["date_shop_avg_item_cnt"], axis = 1, inplace = True )
time.time() - ts


ts = time.time()
# Mean target per month, shop and item subtype, lags 1-2.
group = matrix.groupby(['date_block_num', 'shop_id', 'subtype_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_subtype_avg_item_cnt']
group.reset_index(inplace=True)
matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'subtype_code'], how='left')
matrix.date_shop_subtype_avg_item_cnt = matrix['date_shop_subtype_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1, 2], ['date_shop_subtype_avg_item_cnt'])
matrix.drop(['date_shop_subtype_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts



ts = time.time()
# Mean target per month and city, lag 1.
group = matrix.groupby(['date_block_num', 'shop_city']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_city_avg_item_cnt']
group.reset_index(inplace=True)
matrix = pd.merge(matrix, group, on=['date_block_num', "shop_city"], how='left')
matrix.date_city_avg_item_cnt = matrix['date_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], ['date_city_avg_item_cnt'])
matrix.drop(['date_city_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

ts = time.time()
# Mean target per month, item and city, lag 1.
group = matrix.groupby(['date_block_num', 'item_id', 'shop_city']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_city_avg_item_cnt' ]
group.reset_index(inplace=True)
matrix = pd.merge(matrix, group, on=['date_block_num', 'item_id', 'shop_city'], how='left')
matrix.date_item_city_avg_item_cnt = matrix['date_item_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], ['date_item_city_avg_item_cnt'])
matrix.drop(['date_item_city_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

item_cnt_month
date_avg_item_cnt
date_item_avg_item_cnt


  matrix.date_avg_item_cnt = matrix["date_shop_avg_item_cnt"].astype(np.float16)


date_shop_avg_item_cnt
date_shop_subtype_avg_item_cnt
date_city_avg_item_cnt
date_item_city_avg_item_cnt


11.41569995880127

In [18]:
ts = time.time()
# Price trend feature: relative deviation of last month's average item price from the
# item's average price over the whole training set.
group = train.groupby( ["item_id"] ).agg({"item_price": ["mean"]})
group.columns = ["item_avg_item_price"]
group.reset_index(inplace = True)

matrix = matrix.merge( group, on = ["item_id"], how = "left" )
# NOTE(review): float16 cannot represent values above 65504, while the outlier filter keeps
# prices up to 300000 - confirm that no average price overflows here.
matrix["item_avg_item_price"] = matrix.item_avg_item_price.astype(np.float16)


# Average price per item and month.
group = train.groupby( ["date_block_num","item_id"] ).agg( {"item_price": ["mean"]} )
group.columns = ["date_item_avg_item_price"]
group.reset_index(inplace = True)

matrix = matrix.merge(group, on = ["date_block_num","item_id"], how = "left")
matrix["date_item_avg_item_price"] = matrix.date_item_avg_item_price.astype(np.float16)
lags = [1]
matrix = lag_feature( matrix, lags, ["date_item_avg_item_price"] )
# delta_price_lag_k = (monthly average price k months ago - overall average) / overall average
for i in lags:
    matrix["delta_price_lag_" + str(i) ] = (matrix["date_item_avg_item_price_lag_" + str(i)]- matrix["item_avg_item_price"] )/ matrix["item_avg_item_price"]


# Only the delta columns are kept; the raw price columns and their lags are removed.
features_to_drop = ["item_avg_item_price", "date_item_avg_item_price"]
for i in lags:
    features_to_drop.append("date_item_avg_item_price_lag_" + str(i) )


matrix.drop(features_to_drop, axis = 1, inplace = True)
time.time() - ts

date_item_avg_item_price


10.502816915512085

In [19]:
ts = time.time()
# Revenue trend feature: relative deviation of a shop's monthly revenue from that
# shop's average monthly revenue, used with a one-month lag.
group = train.groupby( ["date_block_num","shop_id"] ).agg({"revenue": ["sum"] })
group.columns = ["date_shop_revenue"]
group.reset_index(inplace = True)

matrix = matrix.merge( group , on = ["date_block_num", "shop_id"], how = "left" )
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

# Average the monthly revenue per shop. Averaging date_block_num here would produce
# the shop's mean month index, not a revenue, and make delta_revenue meaningless.
group = group.groupby(["shop_id"]).agg({ "date_shop_revenue":["mean"] })
group.columns = ["shop_avg_revenue"]
group.reset_index(inplace = True )

matrix = matrix.merge( group, on = ["shop_id"], how = "left" )
matrix["shop_avg_revenue"] = matrix.shop_avg_revenue.astype(np.float32)
matrix["delta_revenue"] = (matrix['date_shop_revenue'] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']
matrix["delta_revenue"] = matrix["delta_revenue"]. astype(np.float32)

# Only the lagged delta is kept; the current-month columns are dropped.
matrix = lag_feature(matrix, [1], ["delta_revenue"])
matrix["delta_revenue_lag_1"] = matrix["delta_revenue_lag_1"].astype(np.float32)
matrix.drop( ["date_shop_revenue", "shop_avg_revenue", "delta_revenue"] ,axis = 1, inplace = True)
time.time() - ts

delta_revenue


10.62042498588562

In [20]:
# Calendar features: month index 0-11 derived from the block number, and the number of
# days for that index looked up in the 12-entry table below (28 days at index 1).
matrix["month"] = matrix["date_block_num"] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix["days"] = matrix["month"].map(days).astype(np.int8)

ts = time.time()
# Months elapsed since the first block in which the (item, shop) pair / the item
# appears in the matrix.
matrix["item_shop_first_sale"] = matrix["date_block_num"] - matrix.groupby(["item_id","shop_id"])["date_block_num"].transform('min')
matrix["item_first_sale"] = matrix["date_block_num"] - matrix.groupby(["item_id"])["date_block_num"].transform('min')
time.time() - ts


ts = time.time()
# Drop month 0 from the matrix.
matrix = matrix[matrix["date_block_num"] > 0] #> 0 gives best result
time.time() - ts

1.458292007446289

In [2]:
#matrix.to_csv('matrix_opt4.csv')

### Modelling (Starting to train Models begins here)

In [87]:
# Reload the cached feature matrix (the first CSV column becomes the index) and
# preview the first rows, transposed so that each feature is a row.
matrix = pd.read_csv('matrix_opt4.csv', index_col = 0)
matrix.head().T

Unnamed: 0,1445631,1445632,1445633,1445634,1445635
date_block_num,4.0,4.0,4.0,4.0,4.0
shop_id,2.0,2.0,2.0,2.0,2.0
item_id,27.0,28.0,29.0,30.0,31.0
item_cnt_month,0.0,0.0,0.0,0.0,0.0
item_category_id,19.0,30.0,23.0,40.0,37.0
name2,76.0,107.0,123.0,4.0,4.0
name3,42.0,42.0,42.0,42.0,562.0
subtype_code,10.0,55.0,16.0,4.0,1.0
type_code,3.0,3.0,3.0,5.0,5.0
shop_category,4.0,4.0,4.0,4.0,4.0


In [3]:
#get the training, validation and test set
import gc
data = matrix.copy()
del matrix
gc.collect()

# Chronological split on the month index: blocks before 33 are used for training,
# block 33 for validation, and block 34 holds the rows to be predicted.
in_train_months = data.date_block_num < 33
in_valid_month = data.date_block_num == 33
in_test_month = data.date_block_num == 34

# Features: every column except the target, with missing values replaced by 0.
X_train = data[in_train_months].drop(['item_cnt_month'], axis=1).fillna(0)
X_valid = data[in_valid_month].drop(['item_cnt_month'], axis=1).fillna(0)
X_test = data[in_test_month].drop(['item_cnt_month'], axis=1).fillna(0)

# Targets: missing values replaced by 0, then clipped to [0, 20].
Y_train = data[in_train_months]['item_cnt_month'].fillna(0).clip(0, 20)
Y_valid = data[in_valid_month]['item_cnt_month'].fillna(0).clip(0, 20)

#### XGBoost Model 

Here we train our XGBoost Model

In [4]:
import pickle
from xgboost import XGBRegressor
from matplotlib.pylab import rcParams

In [5]:
#ts = time.time()


# XGBoost regressor. The evaluation metric and early stopping are configured on the
# estimator, matching the combined cell at the end of the notebook, instead of being
# passed to fit().
xgb_model = XGBRegressor(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=0.5, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.1,
    seed=42,
    eval_metric="rmse",
    early_stopping_rounds=20)

# The blending cells refer to the fitted regressor as `xgb_model`, the prediction cell
# directly below as `model`; both names are bound to the same object.
model = xgb_model

# RMSE is reported on the training and the validation set after every round; training
# stops once the last evaluation set has not improved for 20 rounds.
xgb_model.fit(
    X_train, 
    Y_train, 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    verbose=True)

#time.time() - ts



[0]	validation_0-rmse:1.21494	validation_1-rmse:1.08808
[1]	validation_0-rmse:1.18236	validation_1-rmse:1.06828
[2]	validation_0-rmse:1.15125	validation_1-rmse:1.05010
[3]	validation_0-rmse:1.12148	validation_1-rmse:1.03303
[4]	validation_0-rmse:1.09306	validation_1-rmse:1.01696
[5]	validation_0-rmse:1.06586	validation_1-rmse:1.00148
[6]	validation_0-rmse:1.03982	validation_1-rmse:0.98725
[7]	validation_0-rmse:1.01487	validation_1-rmse:0.97433
[8]	validation_0-rmse:0.99108	validation_1-rmse:0.96235
[9]	validation_0-rmse:0.96823	validation_1-rmse:0.95113
[10]	validation_0-rmse:0.94624	validation_1-rmse:0.93971
[11]	validation_0-rmse:0.92527	validation_1-rmse:0.92912
[12]	validation_0-rmse:0.90528	validation_1-rmse:0.91915
[13]	validation_0-rmse:0.88600	validation_1-rmse:0.91003
[14]	validation_0-rmse:0.86761	validation_1-rmse:0.90191
[15]	validation_0-rmse:0.85004	validation_1-rmse:0.89414
[16]	validation_0-rmse:0.83328	validation_1-rmse:0.88744
[17]	validation_0-rmse:0.81724	validation

In [6]:
#performing the prediction on the test set
# Predictions are clipped to [0, 20], the same range the training targets were clipped to.
Y_test = model.predict(X_test).clip(0, 20)
Y_test

array([0.77816284, 1.4667418 , 1.8849636 , ..., 0.11341064, 0.09529134,
       0.1393329 ], dtype=float32)

In [9]:
#saving the prediction
# NOTE(review): the row index of `test` is used as the submission ID - this assumes `test`
# is loaded in this kernel and still has its default 0..n-1 index in the same order as X_test; confirm.
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
submission.to_csv('submission_xgb_new.csv', index=False)

#### LightGBM Model

Here we train our LightGBM Model

In [9]:
def build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features):
    """Train a LightGBM booster and return it.

    The training and validation data are wrapped in ``lgb.Dataset`` objects and
    both are passed as validation sets, so the metric is reported for each of
    them (every 10 rounds). ``cat_features`` is forwarded as the list of
    categorical feature names and ``params`` as the LightGBM parameter dict.
    """
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_val, y_val)
    return lgb.train(
        params=params,
        train_set=train_set,
        valid_sets=(train_set, valid_set),
        verbose_eval=10,
        categorical_feature=cat_features,
    )

In [None]:
import lightgbm as lgb
import shap

# Raise pandas' display limits (rows, columns, column width) for inspecting wide frames.
pd.set_option('display.max_rows', 160)
pd.set_option('display.max_columns', 160)
pd.set_option('display.max_colwidth', 30)


In [None]:
# LightGBM settings: RMSE is both the training objective and the reported metric;
# the number of rounds and the early-stopping patience are supplied through the dict.
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'num_leaves': 1100,
    'min_data_in_leaf':10,
    'feature_fraction':0.7,
    'learning_rate': 0.04, 
    'num_rounds': 1000,
    'early_stopping_rounds': 20,
    'seed': 1
}

#designating the categorical features which should be focused on
cat_features = ['item_category_id','month','shop_id','shop_city']

# Train on the months before the validation block and evaluate on the validation block.
lgb_model = build_lgb_model(params, X_train, X_valid, Y_train, Y_valid, cat_features)


In [None]:
#perform the prediction
# Note: this overwrites Y_test, which previously held the XGBoost predictions.

Y_test = lgb_model.predict(X_test).clip(0,20)

In [None]:
#save the prediction
# The submission frame is built, but writing it to disk is currently commented out.

submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})
#submission.to_csv('submission_lightgbm_opt.csv', index=False)

### Blending model

Most of the code needed for blending is already in place. After running the XGBoost and LightGBM models, collect their predictions in lists so that they can be stacked column-wise, i.e. append the LightGBM predictions and then the XGBoost predictions (or vice versa, as long as the order is the same for the training, validation and test predictions).

In [None]:
# Results for blending
# Each list receives one prediction array per base model, in the same model order
# (LightGBM first, then XGBoost), so that they can later be stacked column-wise.
Y_train_pred = []
Y_test_pred  = []
Y_val_pred   = []

# ------ Run LightGBM Model ------

# LightGBM predictions
Y_train_lgb = lgb_model.predict(X_train).clip(0,20)
Y_test_lgb  = lgb_model.predict(X_test).clip(0,20)
Y_val_lgb   = lgb_model.predict(X_valid).clip(0,20)

# Add predictions to results list
Y_train_pred.append(Y_train_lgb)
Y_test_pred.append(Y_test_lgb)
Y_val_pred.append(Y_val_lgb)

# ------ Run XGBoost Model ------

# XGBoost predictions
# (requires a fitted XGBoost regressor bound to the name `xgb_model`)
Y_train_xgb = xgb_model.predict(X_train).clip(0, 20)
Y_test_xgb  = xgb_model.predict(X_test).clip(0, 20)
Y_val_xgb   = xgb_model.predict(X_valid).clip(0, 20)

# Add predictions to results list
Y_train_pred.append(Y_train_xgb)
Y_test_pred.append(Y_test_xgb)
Y_val_pred.append(Y_val_xgb)

### Blending
This consists of combining different models using a so-called "meta-model". The meta-model is trained on the outputs of our XGBoost and LightGBM models to obtain a final prediction that may improve upon the individual predictions made by our  models.

The general procedure when creating a blending model is to extract a holdout sample from the data to use as the validation set. It is the predictions made on the validation sets that are fed into the meta-model as input for training (i.e. holdout predictions are used as training data for the meta-model). The final predictions are then computed using the test predictions made by the XGBoost and LightGBM models as input for the meta-model.

However, in the following we train the meta-model on the entire training set rather than the holdout set only as the best results were found in this way.

In [None]:
from sklearn.linear_model import LinearRegression
# Linear meta-model that combines the base-model predictions.
meta_model = LinearRegression() # this will be the regressor for our meta-model (worked best out of those tried)

def blending_model(meta_model, train_preds, val_preds, test_preds, Y_train, Y_val):
    """Fit a meta-model on stacked base-model predictions and blend the test predictions.

    Parameters
    ----------
    meta_model : estimator with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    train_preds, val_preds, test_preds : sequences of 1-D prediction arrays, one array
        per base model, in the same model order for all three sequences.
    Y_train : targets for the rows behind ``train_preds``.
    Y_val : targets for the rows behind ``val_preds``; used only for the printed RMSE.

    Returns
    -------
    The meta-model's predictions for the test rows, clipped to [0, 20].
    """
    # One column per base model.
    stacked_train_pred = np.column_stack(train_preds)
    stacked_test_pred  = np.column_stack(test_preds)
    stacked_val_pred   = np.column_stack(val_preds)

    # Fit meta model on stacked predictions
    meta_model.fit(stacked_train_pred, Y_train)

    # Validation RMSE, computed with NumPy so the function does not depend on
    # scikit-learn's mean_squared_error being imported, nor on its `squared`
    # keyword, which is not available in every scikit-learn release.
    val_blend = np.asarray(meta_model.predict(stacked_val_pred)).clip(0, 20)
    val_rmse = float(np.sqrt(np.mean((val_blend - np.asarray(Y_val)) ** 2)))
    print('RMSE =', val_rmse)
    return meta_model.predict(stacked_test_pred).clip(0, 20)


# Fit the meta-model on the training-set predictions, print the validation RMSE and
# obtain the blended test predictions. Writing the submission file is commented out.
blending_Y_test = blending_model(meta_model, Y_train_pred, Y_val_pred, Y_test_pred, Y_train, Y_valid)

# submission = pd.DataFrame({
#     "ID": test.index, 
#     "item_cnt_month": blending_Y_test
# })
# submission.to_csv('blending_model.csv', index=False)

I'll put everything in the cell below (including the models) in case there's any confusion:

In [None]:
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


# ----------------------------------------------------


# Features
# Chronological split on the month index: blocks before 33 are used for training,
# block 33 for validation, and block 34 holds the rows to be predicted.
in_train_months = data.date_block_num < 33
in_valid_month = data.date_block_num == 33
in_test_month = data.date_block_num == 34

# Features (all columns except the target, missing values replaced by 0)
X_train = data[in_train_months].drop(['item_cnt_month'], axis=1).fillna(0)
X_valid = data[in_valid_month].drop(['item_cnt_month'], axis=1).fillna(0)

# Targets, clipped to [0, 20] to keep outliers out of computations
Y_train = data[in_train_months]['item_cnt_month'].fillna(0).clip(0, 20)
Y_valid = data[in_valid_month]['item_cnt_month'].fillna(0).clip(0, 20)

# Test data
X_test  = data[in_test_month].drop(['item_cnt_month'], axis=1).fillna(0)

# Results for blending: one prediction array per base model is appended to each list
Y_train_pred, Y_test_pred, Y_val_pred = [], [], []


# ----------------------------------------------------


# LightGBM Model
# LightGBM settings: RMSE is both the training objective and the reported metric;
# the number of rounds and the early-stopping patience are supplied through the dict.
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'num_leaves': 1100,
    'min_data_in_leaf':10,
    'feature_fraction':0.7,
    'learning_rate': 0.04, 
    'num_rounds': 1000,
    'early_stopping_rounds': 20,
    'seed': 1
}


# Categorical features to focus on
cat_features = ['item_category_id', 'month', 'shop_id', 'shop_city']

# LGB train and valid dataset
lgb_train = lgb.Dataset(X_train, Y_train)
lgb_val   = lgb.Dataset(X_valid, Y_valid)
 
# Train LightGBM model
# (the metric is reported for both the training and the validation set every 10 rounds)
lgb_model = lgb.train(params=params,
                      train_set=lgb_train,
                      valid_sets=(lgb_train, lgb_val),
                      verbose_eval=10,
                      categorical_feature=cat_features)  

# LightGBM predictions
# (clipped to [0, 20], the same range as the targets)
Y_train_lgb = lgb_model.predict(X_train).clip(0,20)
Y_test_lgb  = lgb_model.predict(X_test).clip(0,20)
Y_val_lgb   = lgb_model.predict(X_valid).clip(0,20)

# Add predictions to results list
Y_train_pred.append(Y_train_lgb)
Y_test_pred.append(Y_test_lgb)
Y_val_pred.append(Y_val_lgb)


# ----------------------------------------------------


# XGBoost Model
# XGBoost regressor with the evaluation metric and early stopping set on the estimator.
xgb_model = XGBRegressor(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=0.5, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.1,
    seed=42,
    eval_metric='rmse',
    early_stopping_rounds=20)

# RMSE is reported on the training and the validation set after every round.
xgb_model.fit(
    X_train, 
    Y_train, 
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
    verbose=True)

# XGBoost predictions
# (clipped to [0, 20], the same range as the targets)
Y_train_xgb = xgb_model.predict(X_train).clip(0, 20)
Y_test_xgb  = xgb_model.predict(X_test).clip(0, 20)
Y_val_xgb   = xgb_model.predict(X_valid).clip(0, 20)

# Add predictions to results list
# (appended after the LightGBM arrays, so XGBoost becomes the second stacked column)
Y_train_pred.append(Y_train_xgb)
Y_test_pred.append(Y_test_xgb)
Y_val_pred.append(Y_val_xgb)


# ----------------------------------------------------


# Blending model
# A linear regression combines the base-model predictions.
meta_model = LinearRegression() # this will be the regressor for our meta-model

def blending_model(meta_model, train_preds, val_preds, test_preds, Y_train, Y_val):
    """Fit a meta-model on stacked base-model predictions and blend the test predictions.

    Parameters
    ----------
    meta_model : estimator with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    train_preds, val_preds, test_preds : sequences of 1-D prediction arrays, one array
        per base model, in the same model order for all three sequences.
    Y_train : targets for the rows behind ``train_preds``.
    Y_val : targets for the rows behind ``val_preds``; used only for the printed RMSE.

    Returns
    -------
    The meta-model's predictions for the test rows, clipped to [0, 20].
    """
    # One column per base model.
    stacked_train_pred = np.column_stack(train_preds)
    stacked_test_pred  = np.column_stack(test_preds)
    stacked_val_pred   = np.column_stack(val_preds)

    # Fit meta model on stacked predictions
    meta_model.fit(stacked_train_pred, Y_train)

    # Validation RMSE, computed with NumPy so the function does not depend on
    # scikit-learn's mean_squared_error or on its `squared` keyword, which is
    # not available in every scikit-learn release.
    val_blend = np.asarray(meta_model.predict(stacked_val_pred)).clip(0, 20)
    val_rmse = float(np.sqrt(np.mean((val_blend - np.asarray(Y_val)) ** 2)))
    print('RMSE =', val_rmse)
    return meta_model.predict(stacked_test_pred).clip(0, 20)


# Fit the meta-model on the training-set predictions, print the validation RMSE and
# obtain the blended test predictions. Writing the submission file is commented out.
blending_Y_test = blending_model(meta_model, Y_train_pred, Y_val_pred, Y_test_pred, Y_train, Y_valid)

# submission = pd.DataFrame({
#     "ID": test.index, 
#     "item_cnt_month": blending_Y_test
# })
# submission.to_csv('blending_model.csv', index=False)