# Sales Prediction: Training Models

## LSTM

In [None]:
from tensorflow import keras
import tensorflow as tf

from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler,ReduceLROnPlateau

from sklearn.model_selection import KFold,GroupKFold
from tensorflow.keras import layers

In [None]:
# Instantiate one scaler of each kind from scikit-learn.
# NOTE(review): neither `rb` nor `sc` is referenced again in the visible part
# of this notebook — confirm before relying on or removing them.
from sklearn.preprocessing import RobustScaler,StandardScaler
rb = RobustScaler()
sc = StandardScaler()

In [18]:
# Load the raw competition tables.
# pandas is imported here explicitly: no earlier visible cell binds `pd`, so
# running the notebook top to bottom would otherwise fail on this cell.
import pandas as pd

# Directory holding the competition CSV files (shared by all five reads).
_RAW_DIR = 'data/competitive-data-science-predict-future-sales/'

train = pd.read_csv(_RAW_DIR + 'sales_train.csv')
test = pd.read_csv(_RAW_DIR + 'test.csv')
shops = pd.read_csv(_RAW_DIR + 'shops.csv')
items = pd.read_csv(_RAW_DIR + 'items.csv')
categories = pd.read_csv(_RAW_DIR + 'item_categories.csv')

In [10]:
# Wide sales matrix: one row per (shop_id, item_id) pair, one column per
# date_block_num, each cell the summed item_cnt_day (0 where nothing was recorded).
dataset = train.pivot_table(
    values=['item_cnt_day'],
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    aggfunc='sum',
    fill_value=0,
)

In [11]:
# Turn the (shop_id, item_id) index back into ordinary columns.
dataset = dataset.reset_index()

In [12]:
# Keep exactly the (shop, item) pairs listed in the test set, in test order;
# pairs with no sales history get NaN in every month column.
dataset = test.merge(dataset, how='left', on=['item_id', 'shop_id'])

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [13]:
# Pairs without any recorded sales are treated as zero sales.
dataset = dataset.fillna(0)

In [14]:
# Drop the identifier columns so that only the monthly sales columns remain.
dataset = dataset.drop(columns=['shop_id', 'item_id', 'ID'])

In [15]:
dataset.shape

(214200, 34)

In [None]:
# NumPy is imported here explicitly: no earlier visible cell binds `np`.
import numpy as np

# Monthly sales as a 2-D array: one row per test (shop, item) pair, one column per month.
monthly_sales = dataset.values

# X: keep all columns except the last one, with a trailing feature axis for the LSTM
X_train = np.expand_dims(monthly_sales[:, :-1], axis=2)
# the last column is our label
y_train = monthly_sales[:, -1:]

# for test we keep all the columns except the first one, so the window has the
# same length as X_train but is shifted one month forward
X_test = np.expand_dims(monthly_sales[:, 1:], axis=2)

# let's have a look at the shapes
print(X_train.shape, y_train.shape, X_test.shape)

In [None]:
# Checkpoint callback: write Model.h5 only when the monitored val_loss improves.
save_best = tf.keras.callbacks.ModelCheckpoint(
    filepath="Model.h5",
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

In [None]:
def build_model(timesteps=33, n_features=1, learning_rate=0.002):
    """Build, compile and summarise the bidirectional-LSTM sales regressor.

    The defaults reproduce the previously hard-coded configuration, so a bare
    ``build_model()`` call behaves exactly as before.

    Args:
        timesteps: Number of time steps in each input sequence.
        n_features: Number of values observed at each time step.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``tf.keras`` Sequential model. ``model.summary()`` is
        printed as a side effect.
    """
    kl = tf.keras.layers

    model = tf.keras.models.Sequential()

    # Two stacked bidirectional LSTMs. Both return the full sequence so that
    # the Flatten layer below receives every time step.
    model.add(kl.Bidirectional(kl.LSTM(128, return_sequences=True),
                               input_shape=(timesteps, n_features)))
    model.add(kl.Bidirectional(kl.LSTM(64, return_sequences=True)))
    model.add(kl.Dropout(0.2))

    model.add(kl.Flatten())

    # Small dense head ending in a single linear unit (one predicted value).
    model.add(kl.Dense(32, activation='relu', kernel_initializer='uniform'))
    model.add(kl.Dense(1))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mse'],
    )

    model.summary()

    return model

In [None]:
model = build_model()

In [None]:
# Train with a 20% validation split; `save_best` checkpoints on val_loss.
fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=512,
    verbose=1,
    callbacks=[save_best],
)
model.fit(X_train, y_train, **fit_options)

In [None]:
# Reload the model from Model.h5, the same path the ModelCheckpoint callback writes to.
model = tf.keras.models.load_model('./Model.h5')

## FastAI

In [1]:
from fastai.tabular.all import *

In [2]:
import pandas as pd

In [3]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from math import ceil

from itertools import product
import gc
from dateutil import parser

In [4]:
PATH = 'data/'

In [5]:
# Columns that are later cast to pandas `category` dtype.
cat_vars = [
    'date_block_num',
    'item_id',
    'month',
    'shop_id',
    'year',
    'item_category_id',
    'item_category_name'
]

# Name of the dependent (label) column.
# NOTE(review): `dep` is not read anywhere in the visible notebook; later
# cells spell out 'target' literally.
dep = 'target'

In [19]:
display(categories.head())
display(items.head())
display(shops.head())
display(train.head())
display(test.head())

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [20]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For each month, take the cross product of every shop and every item that
# appears in `train` for that month.
grid = []
for block_num in train['date_block_num'].unique():
    month_rows = train[train['date_block_num'] == block_num]
    cur_shops = month_rows['shop_id'].unique()
    cur_items = month_rows['item_id'].unique()
    month_grid = list(product(cur_shops, cur_items, [block_num]))
    grid.append(np.array(month_grid, dtype='int32'))


In [21]:
# Stack the per-month grids into a single int32 DataFrame.
grid = pd.DataFrame(
    np.concatenate(grid, axis=0),
    columns=index_cols,
    dtype=np.int32,
)



In [22]:
#get aggregated values for (shop_id, item_id, month)
# The aggregations are given as a list, not a set: a set has no defined
# order, while the following cell renames the result columns positionally
# (sum first, then mean).
gb = train.groupby(['shop_id', 'item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':['sum', 'mean']})


In [23]:
#fix column names
# Flattens the aggregated columns positionally: the fourth name labels the
# first aggregate and the fifth the second, so this depends on the order in
# which the aggregations were produced.
gb.columns = ['shop_id', 'item_id', 'date_block_num','target','mean']


In [24]:
gb.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,mean
0,0,30,1,31.0,3.444444
1,0,31,1,11.0,1.571429
2,0,32,0,6.0,1.5
3,0,32,1,10.0,1.428571
4,0,33,0,3.0,1.0


In [25]:
# Attach the aggregates to the grid; grid rows with no sales get 0.
all_data = grid.merge(gb, on=index_cols, how='left').fillna(0)


In [26]:
# Order rows by month, then shop, then item.
all_data = all_data.sort_values(by=['date_block_num', 'shop_id', 'item_id'])

# grid, gb and train are deliberately kept alive: gb is displayed again in a
# later cell.
gc.collect()

64

In [27]:
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,mean
139255,0,19,0,0.0,0.0
141495,0,27,0,0.0,0.0
144968,0,28,0,0.0,0.0
142661,0,29,0,0.0,0.0
138947,0,32,0,6.0,1.5


In [28]:
gb.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,mean
0,0,30,1,31.0,3.444444
1,0,31,1,11.0,1.571429
2,0,32,0,6.0,1.5
3,0,32,1,10.0,1.428571
4,0,33,0,3.0,1.0


In [29]:
all_data.columns

Index(['shop_id', 'item_id', 'date_block_num', 'target', 'mean'], dtype='object')

In [30]:
def downcast_dtypes(df):
    """Narrow 64-bit numeric columns of ``df`` to 32 bits, in place.

    ``float64`` columns become ``float32`` and ``int64`` columns become
    ``int32``; columns of any other dtype are left untouched.

    Returns the same DataFrame object that was passed in.
    """
    narrowing = (("float64", np.float32), ("int64", np.int32))

    # Resolve both column selections before anything is modified.
    selections = [
        ([name for name, dtype in df.dtypes.items() if dtype == wide], narrow)
        for wide, narrow in narrowing
    ]

    for names, narrow in selections:
        df[names] = df[names].astype(narrow)

    return df

# Zero-fill gaps, cap the monthly target to the range [0, 40], and split the
# running month index into a position within the year and a year offset.
all_data = all_data.fillna(0.0)
all_data['target'] = all_data['target'].clip(lower=0.0, upper=40.0)
all_data['month'] = all_data['date_block_num'].mod(12)
all_data['year'] = all_data['date_block_num'].floordiv(12)

In [31]:
# The test rows are assigned month block 34 and appended to the feature table
# so the lag joins below also cover them.
PREDICTION_BLOCK = 34
test = test.assign(
    date_block_num=PREDICTION_BLOCK,
    month=PREDICTION_BLOCK % 12,
    year=PREDICTION_BLOCK // 12,
)

all_data = pd.concat((all_data, test))

gc.collect()

16

In [32]:
# Bring in each item's columns from `items`, discard the free-text name,
# restore the month/shop/item ordering, then shrink numeric dtypes.
all_data = (
    all_data.merge(items, on="item_id")
    .drop(columns='item_name')
    .sort_values(by=['date_block_num', 'shop_id', 'item_id'])
)

all_data = downcast_dtypes(all_data)

gc.collect()

16

In [33]:
lag_variable = ['target']
lags = [1, 2, 3, 4, 5, 12]
join_keys = ['date_block_num', 'shop_id', 'item_id']
for lag in lags:
    # Relabel every row with the month `lag` blocks later, so that the left
    # join attaches the value from month (t - lag) to the row for month t.
    shifted = all_data[join_keys + lag_variable].copy()
    shifted['date_block_num'] = shifted['date_block_num'] + lag
    shifted = shifted.rename(
        columns={name: name + '_lag_' + str(lag) for name in lag_variable}
    )
    all_data = all_data.merge(shifted, how='left', on=join_keys)
    del shifted
    gc.collect()

# Rows without a match get 0 in the target and in every target lag column.
for feat in [col for col in all_data.columns if 'target' in col]:
    all_data[feat] = all_data[feat].fillna(0)

In [34]:
# Placeholder columns, initialised to 0.0.
# NOTE(review): nothing in the visible notebook assigns these three columns
# again, so they appear to stay constant zero — confirm whether they were
# meant to be filled.
all_data['avg_sales_per_shop_id'] = 0.0
all_data['avg_sales_per_item_id'] = 0.0
all_data['avg_sales_per_item_cat_id'] = 0.0

# Mean target per (shop, month), per (item, month) and per (category, month).
avg_shop_sales_per_month = all_data.groupby(['shop_id','date_block_num'],as_index=False).agg({'target':'mean'})
avg_item_sales_per_month = all_data.groupby(['item_id','date_block_num'],as_index=False).agg({'target':'mean'})
avg_item_cat_sales_per_month = all_data.groupby(['item_category_id','date_block_num', ],as_index=False).agg({'target':'mean'})

# Join each of the three monthly means at six different lags.
lag_variable = ['item_id','shop_id','item_category_id']
lags = [1 ,2 ,3 ,4, 5, 12]
# `curr` tracks how far the three frames have already been shifted: they are
# shifted in place, so each pass only adds the difference to the previous lag.
curr = 0
for lag in lags:
    diff = lag - curr
    curr = lag
    avg_shop_sales_per_month.date_block_num+=diff
    avg_item_cat_sales_per_month.date_block_num+=diff
    avg_item_sales_per_month.date_block_num+=diff

    # Rename positionally: the third column of each frame is the mean, and it
    # is relabelled '<key>_lag_<lag>' for the current lag.
    avg_shop_sales_per_month.columns = ['shop_id','date_block_num']+ [lag_variable[1]+'_lag_'+str(lag)]
    avg_item_sales_per_month.columns = ['item_id','date_block_num']+ [lag_variable[0]+'_lag_'+str(lag)]
    avg_item_cat_sales_per_month.columns = ['item_category_id','date_block_num']+ [lag_variable[2]+'_lag_'+str(lag)]

    # Left joins: rows with no mean available at this lag get NaN here.
    all_data = pd.merge(all_data, avg_shop_sales_per_month,on=['date_block_num','shop_id'] ,how='left')
    all_data = pd.merge(all_data, avg_item_sales_per_month,on=['date_block_num','item_id'] ,how='left')
    all_data = pd.merge(all_data, avg_item_cat_sales_per_month,on=['date_block_num','item_category_id'] ,how='left')

    gc.collect()
del avg_item_cat_sales_per_month,avg_item_sales_per_month,avg_shop_sales_per_month

In [35]:
# The per-row daily mean is not used as a feature; drop it and restore the
# month/shop/item ordering.
all_data = all_data.drop(columns=['mean']).sort_values(
    by=['date_block_num', 'shop_id', 'item_id']
)

In [36]:
# Any value still missing after the lag joins becomes 0.
all_data = all_data.fillna(0)
gc.collect()

32

In [37]:
all_data.to_pickle(PATH + 'all_data.pkl')

In [38]:
all_data = pd.read_pickle(PATH + 'all_data.pkl')

In [39]:
all_data.columns

Index(['shop_id', 'item_id', 'date_block_num', 'target', 'month', 'year', 'ID',
       'item_category_id', 'target_lag_1', 'target_lag_2', 'target_lag_3',
       'target_lag_4', 'target_lag_5', 'target_lag_12',
       'avg_sales_per_shop_id', 'avg_sales_per_item_id',
       'avg_sales_per_item_cat_id', 'shop_id_lag_1', 'item_id_lag_1',
       'item_category_id_lag_1', 'shop_id_lag_2', 'item_id_lag_2',
       'item_category_id_lag_2', 'shop_id_lag_3', 'item_id_lag_3',
       'item_category_id_lag_3', 'shop_id_lag_4', 'item_id_lag_4',
       'item_category_id_lag_4', 'shop_id_lag_5', 'item_id_lag_5',
       'item_category_id_lag_5', 'shop_id_lag_12', 'item_id_lag_12',
       'item_category_id_lag_12'],
      dtype='object')

### Category

In [40]:
# Coarse category label for each block of consecutive category indices:
# (first index, one past the last index, label).
coarse_labels = (
    (0, 1, 'PC Headsets / Headphones'),
    (1, 8, 'Access'),
    (8, 9, 'Tickets (figure)'),
    (9, 10, 'Delivery of goods'),
    (10, 18, 'Consoles'),
    (18, 25, 'Consoles Games'),
    (25, 26, 'Accessories for games'),
    (26, 28, 'phone games'),
    (28, 32, 'CD games'),
    (32, 37, 'Card'),
    (37, 43, 'Movie'),
    (43, 55, 'Books'),
    (55, 61, 'Music'),
    (61, 73, 'Gifts'),
    (73, 79, 'Soft'),
    (79, 81, 'Office'),
    (81, 83, 'Clean'),
    (83, 84, 'Elements of a food'),
)

# Start from the original names and overwrite them position by position.
l_cat = list(categories.item_category_name)
for first, stop, label in coarse_labels:
    for ind in range(first, stop):
        l_cat[ind] = label

In [41]:
# Look up the coarse label of every row by its category id.
all_data['item_category_name'] = all_data['item_category_id'].map(l_cat.__getitem__)

In [42]:
# Plain Python list of the per-row coarse labels.
cat_list = all_data['item_category_name'].tolist()

In [43]:
all_data.columns

Index(['shop_id', 'item_id', 'date_block_num', 'target', 'month', 'year', 'ID',
       'item_category_id', 'target_lag_1', 'target_lag_2', 'target_lag_3',
       'target_lag_4', 'target_lag_5', 'target_lag_12',
       'avg_sales_per_shop_id', 'avg_sales_per_item_id',
       'avg_sales_per_item_cat_id', 'shop_id_lag_1', 'item_id_lag_1',
       'item_category_id_lag_1', 'shop_id_lag_2', 'item_id_lag_2',
       'item_category_id_lag_2', 'shop_id_lag_3', 'item_id_lag_3',
       'item_category_id_lag_3', 'shop_id_lag_4', 'item_id_lag_4',
       'item_category_id_lag_4', 'shop_id_lag_5', 'item_id_lag_5',
       'item_category_id_lag_5', 'shop_id_lag_12', 'item_id_lag_12',
       'item_category_id_lag_12', 'item_category_name'],
      dtype='object')

In [44]:
# Cast every categorical feature to `category`; the three time-like ones are
# additionally marked as ordered so they can be compared with < and >.
ordered_cats = ('date_block_num', 'month', 'year')
for cat in cat_vars:
    all_data[cat] = all_data[cat].astype('category')
for cat in ordered_cats:
    all_data[cat] = all_data[cat].cat.as_ordered()

In [45]:
# Chronological split on the month index: blocks below 33 train, block 33
# validates, block 34 is the month to predict.
# (To train on blocks 13-32 only, additionally require month_index > 12 in
# the training mask.)
month_index = all_data['date_block_num']
in_train = month_index < 33
in_val = month_index == 33
in_test = month_index == 34

X_train = all_data.loc[in_train].drop(columns=['target', 'ID'])
y_train = all_data.loc[in_train, 'target']
X_val = all_data.loc[in_val].drop(columns=['target', 'ID'])
y_val = all_data.loc[in_val, 'target']
# ID is still present on X_test at this point.
X_test = all_data.loc[in_test].drop(columns=['target'])
del all_data, month_index, in_train, in_val, in_test

In [46]:
# Remove the submission ID from the test features.
X_test = X_test.drop(columns='ID')

In [47]:
gc.collect()

32

In [48]:
# Persist every split under PATH as '<name>.pkl'.
split_frames = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_val,
    'y_val': y_val,
    'X_test': X_test,
}
for split_name, split_frame in split_frames.items():
    split_frame.to_pickle(PATH + split_name + '.pkl')

In [49]:
X_train.head().columns

Index(['shop_id', 'item_id', 'date_block_num', 'month', 'year',
       'item_category_id', 'target_lag_1', 'target_lag_2', 'target_lag_3',
       'target_lag_4', 'target_lag_5', 'target_lag_12',
       'avg_sales_per_shop_id', 'avg_sales_per_item_id',
       'avg_sales_per_item_cat_id', 'shop_id_lag_1', 'item_id_lag_1',
       'item_category_id_lag_1', 'shop_id_lag_2', 'item_id_lag_2',
       'item_category_id_lag_2', 'shop_id_lag_3', 'item_id_lag_3',
       'item_category_id_lag_3', 'shop_id_lag_4', 'item_id_lag_4',
       'item_category_id_lag_4', 'shop_id_lag_5', 'item_id_lag_5',
       'item_category_id_lag_5', 'shop_id_lag_12', 'item_id_lag_12',
       'item_category_id_lag_12', 'item_category_name'],
      dtype='object')

In [50]:
X_train.tail()

Unnamed: 0,shop_id,item_id,date_block_num,month,year,item_category_id,target_lag_1,target_lag_2,target_lag_3,target_lag_4,...,shop_id_lag_4,item_id_lag_4,item_category_id_lag_4,shop_id_lag_5,item_id_lag_5,item_category_id_lag_5,shop_id_lag_12,item_id_lag_12,item_category_id_lag_12,item_category_name
10675673,59,22162,32,8,2,40,1.0,0.0,0.0,1.0,...,0.169411,0.795455,0.229217,0.176406,1.659575,0.231683,0.184978,0.0,0.227142,Movie
10675674,59,22163,32,8,2,40,0.0,0.0,0.0,0.0,...,0.169411,0.0,0.229217,0.176406,0.0,0.231683,0.184978,0.0,0.227142,Movie
10675675,59,22164,32,8,2,37,0.0,1.0,0.0,0.0,...,0.169411,0.522727,0.146812,0.176406,0.574468,0.194307,0.184978,0.0,0.129389,Movie
10675676,59,22166,32,8,2,54,0.0,0.0,0.0,0.0,...,0.169411,0.181818,0.188088,0.176406,0.085106,0.162737,0.184978,0.26,0.165143,Books
10675677,59,22167,32,8,2,49,0.0,0.0,0.0,0.0,...,0.169411,0.704545,0.162424,0.176406,0.808511,0.210402,0.184978,0.78,0.18619,Books


In [51]:
X_test.head()

Unnamed: 0,shop_id,item_id,date_block_num,month,year,item_category_id,target_lag_1,target_lag_2,target_lag_3,target_lag_4,...,shop_id_lag_4,item_id_lag_4,item_category_id_lag_4,shop_id_lag_5,item_id_lag_5,item_category_id_lag_5,shop_id_lag_12,item_id_lag_12,item_category_id_lag_12,item_category_name
10913850,2,30,34,10,2,40,0.0,0.0,0.0,0.0,...,0.147097,0.093023,0.249288,0.153183,0.093023,0.252182,0.17447,0.26,0.264651,Movie
10913851,2,31,34,10,2,37,1.0,0.0,0.0,0.0,...,0.147097,0.139535,0.156834,0.153183,0.232558,0.1697,0.17447,0.22,0.156944,Movie
10913852,2,32,34,10,2,40,0.0,0.0,1.0,0.0,...,0.147097,0.488372,0.249288,0.153183,0.604651,0.252182,0.17447,0.58,0.264651,Movie
10913853,2,33,34,10,2,37,0.0,1.0,0.0,1.0,...,0.147097,0.348837,0.156834,0.153183,0.255814,0.1697,0.17447,0.4,0.156944,Movie
10913854,2,38,34,10,2,41,0.0,0.0,0.0,0.0,...,0.147097,0.116279,0.143554,0.153183,0.046512,0.113528,0.17447,0.2,0.154545,Movie


### Training

In [52]:
# Reload the five splits written to PATH by the feature-engineering section.
X_train, y_train, X_val, y_val, X_test = (
    pd.read_pickle(PATH + split_name + '.pkl')
    for split_name in ('X_train', 'y_train', 'X_val', 'y_val', 'X_test')
)

In [53]:
# Re-attach the submission ID so the test rows can be put back into ID order,
# then discard the ID again.
# NOTE(review): `test` already carries date_block_num / month / year (added in
# an earlier cell), so this merge suffixes those columns as *_x / *_y — see
# the X_test.columns output further down. X_test then no longer shares those
# column names with X_train. Merging on test[['ID', 'shop_id', 'item_id']]
# would avoid this, but a later cell selects the *_x names, so both places
# would have to change together.
orig_len = len(X_test)
X_test = X_test.merge(test, on=['item_id', 'shop_id'], how='inner')
X_test = X_test.set_index('ID').sort_index().reset_index().drop('ID', axis=1)
assert len(X_test) == orig_len  # sanity check

In [55]:
def rmse(y_pred, targ):
    """Return the root-mean-square error between ``y_pred`` and ``targ``.

    The difference of the two arguments must support ``** 2`` and expose a
    ``.mean()`` method (NumPy arrays and pandas Series both do).
    """
    residual = targ - y_pred
    mean_squared = (residual ** 2).mean()
    return np.sqrt(mean_squared)

def rmse_expm1(y_pred, targ):
    """Return the RMSE after mapping both inputs through ``np.expm1``.

    NOTE(review): presumably meant for values stored on a log1p scale; no
    caller is visible in this notebook.
    """
    predicted = np.expm1(y_pred)
    actual = np.expm1(targ)
    return rmse(y_pred=predicted, targ=actual)

### Deep Learning

In [56]:
# Put the label back next to the features; the test frame gets a dummy 0
# label so all three frames expose a 'target' column.
for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, 0)):
    features['target'] = labels

In [57]:
X_test.columns

Index(['shop_id', 'item_id', 'date_block_num_x', 'month_x', 'year_x',
       'item_category_id', 'target_lag_1', 'target_lag_2', 'target_lag_3',
       'target_lag_4', 'target_lag_5', 'target_lag_12',
       'avg_sales_per_shop_id', 'avg_sales_per_item_id',
       'avg_sales_per_item_cat_id', 'shop_id_lag_1', 'item_id_lag_1',
       'item_category_id_lag_1', 'shop_id_lag_2', 'item_id_lag_2',
       'item_category_id_lag_2', 'shop_id_lag_3', 'item_id_lag_3',
       'item_category_id_lag_3', 'shop_id_lag_4', 'item_id_lag_4',
       'item_category_id_lag_4', 'shop_id_lag_5', 'item_id_lag_5',
       'item_category_id_lag_5', 'shop_id_lag_12', 'item_id_lag_12',
       'item_category_id_lag_12', 'item_category_name', 'date_block_num_y',
       'month_y', 'year_y', 'target'],
      dtype='object')

In [58]:
X_test.head()[['item_id', 'shop_id', 'date_block_num_x', 'month_x', 'year_x', 'target', 'target_lag_1', 'target_lag_2', 'target_lag_3', 'target_lag_12']]

Unnamed: 0,item_id,shop_id,date_block_num_x,month_x,year_x,target,target_lag_1,target_lag_2,target_lag_3,target_lag_12
0,5037,5,34,10,2,0,0.0,1.0,3.0,1.0
1,5320,5,34,10,2,0,0.0,0.0,0.0,0.0
2,5233,5,34,10,2,0,1.0,3.0,1.0,0.0
3,5232,5,34,10,2,0,0.0,0.0,1.0,0.0
4,5268,5,34,10,2,0,0.0,0.0,0.0,0.0


In [59]:
# For faster prototyping: keep only the rows whose month block is above 12.
is_recent = X_train['date_block_num'] > 12
X_train = X_train.loc[is_recent]

In [60]:
X_train.columns

Index(['shop_id', 'item_id', 'date_block_num', 'month', 'year',
       'item_category_id', 'target_lag_1', 'target_lag_2', 'target_lag_3',
       'target_lag_4', 'target_lag_5', 'target_lag_12',
       'avg_sales_per_shop_id', 'avg_sales_per_item_id',
       'avg_sales_per_item_cat_id', 'shop_id_lag_1', 'item_id_lag_1',
       'item_category_id_lag_1', 'shop_id_lag_2', 'item_id_lag_2',
       'item_category_id_lag_2', 'shop_id_lag_3', 'item_id_lag_3',
       'item_category_id_lag_3', 'shop_id_lag_4', 'item_id_lag_4',
       'item_category_id_lag_4', 'shop_id_lag_5', 'item_id_lag_5',
       'item_category_id_lag_5', 'shop_id_lag_12', 'item_id_lag_12',
       'item_category_id_lag_12', 'item_category_name', 'target'],
      dtype='object')

In [62]:
# Hold out a random 20% of the row positions of X_train for validation.
# NOTE(review): the rows come from different months, so a random split mixes
# earlier and later months between the two sides; the month-33 frame X_val
# prepared earlier is not used here — confirm this is intended.
splits = RandomSplitter(valid_pct=0.2)(range_of(X_train))

In [67]:
# Full feature set: every categorical key plus all lag and average columns.
lag_steps = (1, 2, 3, 4, 5, 12)
categorical_features = [
    'shop_id', 'item_id', 'date_block_num', 'month', 'year',
    'item_category_id', 'item_category_name',
]
continuous_features = (
    ['target_lag_' + str(step) for step in lag_steps]
    + ['avg_sales_per_shop_id', 'avg_sales_per_item_id',
       'avg_sales_per_item_cat_id']
    + [key + '_lag_' + str(step)
       for step in lag_steps
       for key in ('shop_id', 'item_id', 'item_category_id')]
)
to_train = TabularPandas(
    X_train,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=categorical_features,
    cont_names=continuous_features,
    y_names='target',
    splits=splits,
)

In [83]:
# Reduced feature set: the id/time categoricals and five target lags only.
to_train = TabularPandas(
    X_train,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=[
        'shop_id', 'item_id', 'date_block_num', 'month', 'year',
        'item_category_id',
    ],
    cont_names=[
        'target_lag_1', 'target_lag_2', 'target_lag_3',
        'target_lag_4', 'target_lag_12',
    ],
    y_names='target',
    splits=splits,
)

In [84]:
to_train.xs.iloc[:2]

Unnamed: 0,shop_id,item_id,date_block_num,month,year,item_category_id,target_lag_1,target_lag_2,target_lag_3,target_lag_4,target_lag_12
7205352,16,6348,21,9,2,56,-0.207108,-0.207062,-0.203672,-0.201013,-0.176707
9198530,43,9414,27,3,3,71,-0.207108,0.468019,-0.203672,0.450175,-0.176707


In [85]:
dls = to_train.dataloaders(bs=64)

In [86]:
dls.show_batch()

Unnamed: 0,shop_id,item_id,date_block_num,month,year,item_category_id,target_lag_1,target_lag_2,target_lag_3,target_lag_4,target_lag_12,target
0,52,17792,19,7,1,37,-5.819789e-09,8.763465e-09,3.163585e-10,2.593472e-09,1.0,0.0
1,24,15430,20,8,1,63,-5.819789e-09,8.763465e-09,3.163585e-10,2.593472e-09,6.299297e-09,0.0
2,21,21787,18,6,1,43,-5.819789e-09,8.763465e-09,3.163585e-10,2.593472e-09,6.299297e-09,0.0
3,44,16447,20,8,1,37,-5.819789e-09,8.763465e-09,3.163585e-10,2.593472e-09,6.299297e-09,0.0
4,29,5637,13,1,1,2,3.0,1.0,1.0,1.0,6.299297e-09,1.0
5,40,8089,21,9,1,55,-5.819789e-09,8.763465e-09,3.163585e-10,2.593472e-09,6.299297e-09,0.0
6,24,4190,13,1,1,75,1.0,1.0,2.0,1.0,6.299297e-09,1.0
7,46,6225,30,6,2,58,-5.819789e-09,8.763465e-09,3.163585e-10,2.593472e-09,6.299297e-09,0.0
8,50,17322,22,10,1,40,-5.819789e-09,1.0,3.163585e-10,2.593472e-09,2.0,0.0
9,40,19044,20,8,1,58,-5.819789e-09,8.763465e-09,3.163585e-10,2.593472e-09,6.299297e-09,0.0


In [73]:
# The target is a continuous sales count, so report RMSE; `accuracy` is a
# classification metric and is not meaningful here.
# fastai's metric is imported under an alias because this notebook defines
# its own NumPy-based `rmse`, which shadows the name from the star import.
from fastai.metrics import rmse as fastai_rmse

learn = tabular_learner(dls, metrics=fastai_rmse)

In [None]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,accuracy,time


In [None]:
learn.show_results()

In [None]:
row, clas, probs = learn.predict(X_train.iloc[0])

In [None]:
row.show()

In [None]:
clas, probs

In [None]:
# Build the inference frame from the prepared test features. (The previous
# version read `df` and dropped a 'salary' column, neither of which exists in
# this notebook.)
test_df = X_test.drop(columns=['target'], errors='ignore')
# If X_test carries duplicated columns suffixed _x / _y from a merge, keep the
# _x copy under its training-time name and discard the _y copy.
test_df = test_df.drop(columns=[c for c in test_df.columns if c.endswith('_y')])
test_df = test_df.rename(columns=lambda c: c[:-2] if c.endswith('_x') else c)
dl = learn.dls.test_dl(test_df)

In [None]:
learn.get_preds(dl=dl)

In [None]:
# Pull the processed features/labels out of the TabularPandas object. It is
# named `to_train` in this notebook; `to` is never defined.
# NOTE: this rebinds X_train / X_test — after this cell X_test and y_test hold
# the *validation* split of `to_train`, not the competition test set.
X_train, y_train = to_train.train.xs, to_train.train.ys.values.ravel()
X_test, y_test = to_train.valid.xs, to_train.valid.ys.values.ravel()