In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import sklearn
from sklearn.metrics import mean_squared_error

from itertools import product

import pickle


In [None]:
# Load every input CSV into its own DataFrame (paths are relative to the
# notebook's working directory). The reads happen in the order listed.
(
    df_items,
    df_sales_train,
    df_item_categories,
    df_test,
    df_shops,
    df_sample_submission,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "items.csv",
        "sales_train.csv",
        "item_categories.csv",
        "test.csv",
        "shops.csv",
        "sample_submission.csv",
    )
)

In [None]:
# Drop outlier rows: keep sales priced below 50000 with a daily count below 1000
price_in_range = df_sales_train['item_price'] < 50000
count_in_range = df_sales_train['item_cnt_day'] < 1000
df_sales_train = df_sales_train[price_in_range & count_in_range]

In [None]:
# Keep only rows whose price is strictly positive (drops zero and negative prices)
has_positive_price = df_sales_train['item_price'].gt(0)
df_sales_train = df_sales_train.loc[has_positive_price]

In [None]:
# Some shops appear under two ids (repeated names); fold each duplicate id into
# one canonical id in both the training and the test data.
duplicate_shop_ids = {0: 57, 1: 58, 10: 11}

# df_sales_train is the result of boolean filtering, so take an explicit copy
# before assigning a column; this avoids pandas' SettingWithCopyWarning.
df_sales_train = df_sales_train.copy()
df_sales_train['shop_id'] = df_sales_train['shop_id'].replace(duplicate_shop_ids)
df_test['shop_id'] = df_test['shop_id'].replace(duplicate_shop_ids)

In [None]:
# Attach the columns of df_items to every sales row (left join on item_id),
# then replace any missing values with 0
df_sales_train = df_sales_train.merge(df_items, how='left', on='item_id').fillna(0)

In [None]:
# Append the test (shop, item) pairs as a pseudo-month, block 34, so they pass
# through the same grid / lag-feature steps as the training months.
df_test['date_block_num'] = 34

# `keys` was removed from this call: it labels the concatenated *frames* (not
# columns), is discarded when ignore_index=True, and its length (3) did not
# match the two frames being concatenated.
df_sales_train = pd.concat([df_sales_train, df_test], ignore_index=True, sort=False)

# Columns that exist on only one side (e.g. item_cnt_day for test rows) become 0.
df_sales_train = df_sales_train.fillna(0)
df_sales_train.tail()

In [None]:
# Key columns identifying one row of the monthly grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For each month, enumerate every (shop, item) combination among the shops and
# items that appear in that month.
monthly_grids = []
for block_num in df_sales_train['date_block_num'].unique():
    month_rows = df_sales_train[df_sales_train['date_block_num'] == block_num]
    cur_shops = month_rows['shop_id'].unique()
    cur_items = month_rows['item_id'].unique()
    combos = list(product(cur_shops, cur_items, [block_num]))
    monthly_grids.append(np.array(combos, dtype='int16'))

# Stack the per-month arrays into a single int32 DataFrame
grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)

In [None]:
# Monthly sales count per (shop, item, month): sum of item_cnt_day, named "target".
# Grid rows with no sales get target = 0.
df_new = (
    df_sales_train
    .groupby(index_cols, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "target"})
)
all_data = grid.merge(df_new, how='left', on=index_cols).fillna(0)

In [None]:
# Total sales per (shop, month): sum of item_cnt_day, stored as "target_shop"
shop_month_keys = ['shop_id', 'date_block_num']
df_new = (
    df_sales_train
    .groupby(shop_month_keys, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "target_shop"})
)
all_data = all_data.merge(df_new, how='left', on=shop_month_keys).fillna(0)

In [None]:
# Total sales per (item, month): sum of item_cnt_day, stored as "target_item"
item_month_keys = ['item_id', 'date_block_num']
df_new = (
    df_sales_train
    .groupby(item_month_keys, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "target_item"})
)
all_data = all_data.merge(df_new, how='left', on=item_month_keys).fillna(0)

In [None]:
# Release the intermediate frames; everything needed now lives in all_data
del grid
del df_new

In [None]:
# Preview the first rows of the assembled monthly table
all_data.head()

In [None]:
# Names of all non-key columns, sorted by name
cols_to_rename = sorted(col for col in all_data.columns if col not in index_cols)
cols_to_rename

In [None]:
# Month offsets for which lagged copies of the value columns are added
shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in shift_range:
    # "<col>_lag_<k>" name for each value column at this offset
    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    # Move each row forward by `month_shift` months so that, after the join,
    # a row for month m carries the values observed in month m - month_shift.
    train_shift = all_data[index_cols + cols_to_rename].rename(columns=lag_names)
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    # Rows with no matching earlier month get 0 for the lag columns
    all_data = all_data.merge(train_shift, on=index_cols, how='left').fillna(0)

del train_shift


In [None]:
# Preview the first rows, transposed so the many lag columns read as rows
all_data.head().transpose()

In [None]:
# Discard blocks 0-11: the longest lag above is 12 months, so those rows
# cannot have a complete lag history.
all_data = all_data.loc[all_data['date_block_num'] >= 12]

# Columns removed from the feature matrix at fitting time
to_drop_cols = ['target', 'target_shop', 'target_item', 'date_block_num']

# One (item_id, item_category_id) row per distinct pair, joined onto every row
item_category_mapping = df_items[['item_id', 'item_category_id']].drop_duplicates()
all_data = all_data.merge(item_category_mapping, how='left', on='item_id')

# Persist the feature table and free the in-memory copy
all_data.to_pickle("data.pkl")
del all_data

In [None]:
# Train/test split
# For the sake of the programming assignment, let's artificially split the data into train and test. We will treat the last month's data as the test set.

In [None]:
# Reload the feature table from "data.pkl" and keep `date_block_num` aside:
# it is not used as a feature, but is needed to split the dataset into parts
all_data = pd.read_pickle("data.pkl")
dates = all_data['date_block_num']

In [None]:
# Row masks: every block before 34 is training data, block 34 is the test month
is_train = dates < 34
is_test = dates == 34

dates_train = dates[is_train]
dates_target = dates[is_test]

# Feature matrices (bookkeeping/target columns removed)
X_train = all_data.loc[is_train].drop(columns=to_drop_cols)
X_test = all_data.loc[is_test].drop(columns=to_drop_cols)

# Targets as plain numpy arrays
y_train = all_data.loc[is_train, 'target'].values
y_test = all_data.loc[is_test, 'target'].values


In [None]:
# First-level model 1: linear regression on all feature columns, predicting block 34.
lr = LinearRegression()
lr.fit(X_train.values, y_train)
pred_lr = lr.predict(X_test.values)  # kept unclipped; reused as a level-2 feature

# Clamp the submitted values to [0, 20]. Assigning the array directly (rather
# than a DataFrame aligned on index) makes pandas raise if the number of
# predictions differs from the number of submission rows, instead of silently
# filling the gap.
df_sample_submission["item_cnt_month"] = np.clip(pred_lr, 0, 20)
df_sample_submission.to_csv('lr_pred.csv', index=False)

In [None]:
# First-level model 2: LightGBM regressor, 100 boosting rounds.
lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'learning_rate': 0.03,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**7,
    'bagging_freq': 1,
    'verbose': 0,
}

model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_test)  # kept unclipped; reused as a level-2 feature

# Clamp the submitted values to [0, 20]. Assigning the array directly (rather
# than a DataFrame aligned on index) makes pandas raise if the number of
# predictions differs from the number of submission rows, instead of silently
# filling the gap.
df_sample_submission["item_cnt_month"] = np.clip(pred_lgb, 0, 20)

df_sample_submission.to_csv('lgb_pred.csv', index=False)

In [None]:
# Level-2 test features: one column per first-level model (LR first, then LightGBM)
X_test_level2 = np.c_[pred_lr, pred_lgb]

In [None]:
# Targets for the 2nd-level dataset: rows of blocks 27-33, in all_data row order.
# Derived from all_data/dates directly (not from y_train/dates_train) so the cell
# gives the same result even after y_train has been rebound by a later cell.
y_train_level2 = all_data.loc[dates.isin([27, 28, 29, 30, 31, 32, 33]), 'target'].values

In [None]:
# 2nd-level feature matrix: one row per level-2 target, one column per
# first-level model; start with zeros and fill it in afterwards
X_train_level2 = np.zeros((y_train_level2.shape[0], 2))

In [None]:
# Fill `X_train_level2` with out-of-fold meta-features: for each validation block,
# fit both first-level models on all earlier blocks and predict that block.
#
# Fold-local names are used on purpose so the globals X_train / X_test / y_train /
# pred_lr / pred_lgb / model / lr from the earlier cells are not overwritten.
level2_blocks = [27, 28, 29, 30, 31, 32, 33]

# Block number of each level-2 row, in all_data row order (the same order in
# which the level-2 targets are taken from all_data).
level2_dates = dates[dates.isin(level2_blocks)].values

for cur_block_num in level2_blocks:
    fold_train_rows = dates < cur_block_num
    fold_valid_rows = dates == cur_block_num

    X_fold_train = all_data.loc[fold_train_rows].drop(to_drop_cols, axis=1)
    X_fold_valid = all_data.loc[fold_valid_rows].drop(to_drop_cols, axis=1)
    y_fold_train = all_data.loc[fold_train_rows, 'target'].values

    fold_lr = LinearRegression()
    fold_lr.fit(X_fold_train.values, y_fold_train)
    fold_pred_lr = fold_lr.predict(X_fold_valid.values)

    fold_lgb = lgb.train(lgb_params, lgb.Dataset(X_fold_train, label=y_fold_train), 100)
    fold_pred_lgb = fold_lgb.predict(X_fold_valid)

    # Write by block mask rather than by running offset, so rows stay aligned
    # with the targets even if all_data is not sorted by date_block_num.
    X_train_level2[level2_dates == cur_block_num] = np.c_[fold_pred_lr, fold_pred_lgb]

In [None]:
# Simple convex mix: grid-search alpha in [0, 1] for
#   alpha * (column 0 of X_train_level2) + (1 - alpha) * (column 1)
# and keep the alpha with the lowest MSE on the level-2 training targets.
alphas_to_try = np.linspace(0, 1, 1001)

# Start from +inf so the first candidate is always accepted; a finite sentinel
# would leave best_alpha undefined whenever every MSE exceeds it.
best_mse = np.inf
for a in alphas_to_try:
    mix = a * X_train_level2[:, 0] + (1 - a) * X_train_level2[:, 1]
    mse = mean_squared_error(y_train_level2, mix)  # (y_true, y_pred)
    if mse < best_mse:  # strict '<' keeps the first alpha on ties
        best_alpha = a
        best_mse = mse

# Score only the winning mix instead of re-scoring on every improvement
best_mix = best_alpha * X_train_level2[:, 0] + (1 - best_alpha) * X_train_level2[:, 1]
r2_train_simple_mix = r2_score(y_train_level2, best_mix)

print('Best alpha: %f; Corresponding r2 score on train: %f' % (best_alpha, r2_train_simple_mix))

In [None]:
# Apply the best convex weight to the level-2 test features.
# No hold-out score is computed here: block-34 targets are placeholder zeros.
test_preds = best_alpha * X_test_level2[:, 0] + (1 - best_alpha) * X_test_level2[:, 1]

# Clamp the submitted values to [0, 20]. Assigning the array directly (rather
# than a DataFrame aligned on index) makes pandas raise if the number of
# predictions differs from the number of submission rows, instead of silently
# filling the gap.
df_sample_submission["item_cnt_month"] = np.clip(test_preds, 0, 20)

df_sample_submission.to_csv('mix_preds.csv', index=False)

In [None]:
# Stacking: fit a linear meta-model on the level-2 features.
# Note: this refits the existing `lr` object, replacing its earlier coefficients.
lr.fit(X_train_level2, y_train_level2)

In [None]:
# Predict the test month with the stacked (level-2) linear model
test_preds = lr.predict(X_test_level2)

# Clamp the submitted values to [0, 20]. Assigning the array directly (rather
# than a DataFrame aligned on index) makes pandas raise if the number of
# predictions differs from the number of submission rows, instead of silently
# filling the gap.
df_sample_submission["item_cnt_month"] = np.clip(test_preds, 0, 20)

df_sample_submission.to_csv('stacking_preds.csv', index=False)