In [1]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

#import matplotlib as mpl
import matplotlib.pyplot as plt
#import matplotlib.dates as mdates
%matplotlib inline 

from tqdm import tqdm_notebook
from itertools import product

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import gc

In [2]:
DATA_FOLDER = './data/'

sales = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz'))
items = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

In [3]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [4]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

In [5]:
def create_grid(sales, index_cols):
    from tqdm import tqdm_notebook
    
    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in tqdm_notebook(sales['date_block_num'].unique()):
        cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
        cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into a dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Groupby data to get shop-item-month aggregates
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
    # Fix column names
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
    # Join it to the grid
    all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

    # Same as above but with shop-month aggregates
    gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

    # Same as above but with item-month aggregates
    gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

    # Downcast dtypes from 64 to 32 bit to save memory
    all_data = downcast_dtypes(all_data)
    del grid, gb 
    gc.collect();
    
    return all_data

In [6]:
max_train_date_block_num = sales.date_block_num.max()
max_train_date_block_num

33

In [7]:
all_data = create_grid(
    pd.concat(
        [sales, test.assign(date_block_num=max_train_date_block_num+1)],
        ignore_index=True, sort=False),
    index_cols)
all_data.head()

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item
0,59,22154,0,1.0,2017.0,18.0
1,59,2552,0,0.0,2017.0,0.0
2,59,2554,0,0.0,2017.0,1.0
3,59,2555,0,0.0,2017.0,2.0
4,59,2564,0,0.0,2017.0,5.0


In [8]:
def create_lags(all_data, shift_range = [1, 2, 3, 4, 5, 12]):

    # List of columns that we will use to create lags
    cols_to_rename = list(all_data.columns.difference(index_cols)) 

    for month_shift in tqdm_notebook(shift_range):
        train_shift = all_data[index_cols + cols_to_rename].copy()

        train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

        foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
        train_shift = train_shift.rename(columns=foo)

        all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

    del train_shift

    # List of all lagged features
    fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
    # We will drop these at fitting stage
    to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

    # Category for each item
    item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
    # TODO category means

    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
    all_data = downcast_dtypes(all_data)
    gc.collect();
    
    return all_data, to_drop_cols

In [9]:
lagged_data, to_drop_cols = create_lags(all_data)
lagged_data.head()
to_drop_cols

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,59,22154,0,1.0,2017.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
1,59,2552,0,0.0,2017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58
2,59,2554,0,0.0,2017.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58
3,59,2555,0,0.0,2017.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56
4,59,2564,0,0.0,2017.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59


['target_item', 'target', 'target_shop', 'date_block_num']

In [None]:
# TODO shop_id one-hot encoding, category_id one-hot encoding

In [None]:
# TODO target.clip(0, 20)

In [10]:
train = lagged_data.loc[lagged_data.date_block_num <= max_train_date_block_num]
test_lagged = lagged_data.loc[lagged_data.date_block_num == max_train_date_block_num + 1]

In [None]:
# TODO
#train = train.loc[train.shop_id.isin([26, 27, 28])]

In [11]:
X_train = train.drop(to_drop_cols, axis=1).values
print(X_train) #X.head()

[[5.9000e+01 2.2154e+04 0.0000e+00 ... 0.0000e+00 0.0000e+00 3.7000e+01]
 [5.9000e+01 2.5520e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.8000e+01]
 [5.9000e+01 2.5540e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.8000e+01]
 ...
 [2.1000e+01 7.6400e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 6.4000e+01]
 [2.1000e+01 7.6320e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 6.4000e+01]
 [2.1000e+01 7.4400e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.7000e+01]]


In [12]:
target_col = 'target'
y_train = train.loc[:, [target_col]].values.ravel()
print(y_train) #y.head()

[1. 0. 0. ... 0. 0. 0.]


In [13]:
def gen_time_split(data, n_splits):
    for i in range(n_splits):
        print(i)
        first_vali_date_block_num = max_train_date_block_num - i
        vali_indices = data.loc[:,'date_block_num'] == first_vali_date_block_num
        train_indices = X.loc[:,'date_block_num'] < first_vali_date_block_num
        yield (train_indices[train_indices].index, vali_indices[vali_indices].index)        

In [14]:
cv = gen_time_split(train, 3)

In [15]:
from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(n_estimators=50, max_depth=7, loss='ls', verbose=1)
#from sklearn.ensemble import RandomForestRegressor
#est = RandomForestRegressor(n_estimators=10, max_depth=7, verbose=1, n_jobs=-1)

In [16]:
#lr = 1 / np.logspace(0.0, 1.0, num=5)[2:]
#lr = np.array([0.3, 0.45, 0.6])
#lr = np.linspace(0.3, 0.6, 5)
lr = np.array([0.3])
print(lr)

[0.3]


In [17]:
from sklearn.model_selection import GridSearchCV
param_grid = {'learning_rate':lr}
gs = GridSearchCV(est, param_grid, cv=cv, refit=True, n_jobs=4, scoring='r2', verbose=1)
#gs.fit(X, y)

In [18]:
#from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('Search', gs)]) #('Scaling', StandardScaler()), 

In [19]:
#pipe.fit(X,y)
est.fit(X_train,y_train)

      Iter       Train Loss   Remaining Time 
         1          10.2512           88.62m
         2           9.1416           88.25m
         3           8.1899           86.10m
         4           7.3854           82.96m
         5           6.7550           80.57m
         6           6.2286           78.68m
         7           5.7747           77.07m
         8           5.4222           75.33m
         9           5.1123           73.97m


KeyboardInterrupt: 

from sklearn.model_selection import cross_val_score
scores = cross_val_score(est, X, y, cv=cv)
print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
print(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'])
#gs.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred = est.predict(X_train) #pipe.predict(X)
#y_train_pred = y_train_pred.clip(0, 20)
mse = mean_squared_error(y_train, y_train_pred)
print(mse)
print(np.sqrt(mse))
print(r2_score(y_train, y_train_pred))

In [20]:
import lightgbm as lgb

lgb_params = {
               'metric': 'rmse',
               'nthread':4, 
               'learning_rate': 0.3, 
               'objective': 'mse', 
               'verbose':1 
              }

model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)

In [21]:
from sklearn.metrics import mean_squared_error, r2_score
y_train_pred_lgb = model.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred_lgb)
print(mse)
print(np.sqrt(mse))
print('Train R-squared for LightGBM is %f' % r2_score(y_train, y_train_pred_lgb))

3.137131109167573
1.7711948252994567
Train R-squared for LightGBM is 0.731354


In [22]:
X_test = test.merge(test_lagged, how='left').drop(to_drop_cols + ['ID'], axis=1).values
print(X_test)

[[5.0000e+00 5.0370e+03 0.0000e+00 ... 6.5000e+01 1.4450e+03 1.9000e+01]
 [5.0000e+00 5.3200e+03 0.0000e+00 ... 0.0000e+00 0.0000e+00 5.5000e+01]
 [5.0000e+00 5.2330e+03 1.0000e+00 ... 0.0000e+00 0.0000e+00 1.9000e+01]
 ...
 [4.5000e+01 1.5757e+04 0.0000e+00 ... 9.0000e+00 1.2510e+03 5.5000e+01]
 [4.5000e+01 1.9648e+04 0.0000e+00 ... 0.0000e+00 0.0000e+00 4.0000e+01]
 [4.5000e+01 9.6900e+02 0.0000e+00 ... 6.0000e+00 1.2510e+03 3.7000e+01]]


In [23]:
#y_test_pred = est.predict(X_test) #pipe.predict(X_test)
y_test_pred = model.predict(X_test)
#y_test_pred = y_test_pred.clip(0, 20)
print(y_test_pred)

[0.38920491 0.15657952 0.85671721 ... 0.03374777 0.02551574 0.02010241]


In [25]:
submission = test.assign(item_cnt_month=y_test_pred)[['item_cnt_month']]
submission.describe()

Unnamed: 0,item_cnt_month
count,214200.0
mean,0.316502
std,2.994464
min,-31.56949
25%,0.041379
50%,0.107745
75%,0.246133
max,565.812894


In [26]:
submission.head()

Unnamed: 0,item_cnt_month
0,0.389205
1,0.15658
2,0.856717
3,0.215647
4,3.219996


In [27]:
submission.to_csv('Refactor.csv', index_label='ID') #header=['ID', 'item_cnt_month'])

In [28]:
!gzip Refactor.csv
!ls

Baseline.ipynb	   lagged2.csv.gz   ShopAndCategoryMeans_50.csv.gz
combos.csv.gz	   lagged3.csv.gz   ShopAndCategoryMeans.csv
Combos.ipynb	   lagged.csv.gz    ShopAndCategoryMeans_xgb.csv.gz
combos_xgb.csv.gz  Lagged.ipynb     Shop and item category means.ipynb
data		   Refactor.csv.gz  submission.csv.gz
EDA.ipynb	   Refactor.ipynb


0.3 is best learning rate so far.

[
 (split1_train_idxs, split1_test_idxs),
 (split2_train_idxs, split2_test_idxs),
 (split3_train_idxs, split3_test_idxs),
 ...
]

"Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range."

and

"For each id in the test set, you must predict a total number of sales."

and

"Submission is for date_block_num 34"

and

"
My CV strategy is 5-fold moving window:

fold 1: Train on month 0 to 32 and validate on 33
fold 2: Train on month 0 to 31 and validate on 32
…
fold 5: Train on month 0 to 28 and validate on 29
"

and

- mean encodings
- lag
- text extraction on item and category names

In [None]:
#import sys
#!conda install --yes --prefix {sys.prefix} xgboost
#{sys.executable} -m pip install xgboost

In [None]:
import xgboost as xgb
dtrain = xgb.DMatrix(buf.loc[:, buf.columns != target_col], label=y)
param = {'max_depth':7, 'eta':0.3, 'silent':0, 'objective':'reg:linear', 'eval_metrix':'rmse' }
num_round = 100
bst = xgb.train(param, dtrain, num_round)

In [None]:
y_pred = bst.predict(dtrain)
mse = mean_squared_error(y, y_pred)
print(mse)
print(np.sqrt(mse))

In [None]:
# make prediction
dtest = xgb.DMatrix(buf2)
y_pred_test = bst.predict(dtest)
y_pred_test