In [26]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from xgboost import plot_importance
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from itertools import product
from catboost import Pool
import lightgbm as lgb
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
import warnings
import catboost
import time
import gc



pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

%matplotlib inline
sns.set(style="darkgrid")
pd.set_option('display.float_format', lambda x: '%.2f' % x)
warnings.filterwarnings("ignore")
start_time = time.time()

for p in [np, pd, scipy, sklearn, lightgbm]:
    print (p.__name__, p.__version__)

numpy 1.16.2
pandas 0.23.4
scipy 1.1.0
sklearn 0.19.2
lightgbm 2.2.2


In [27]:
def downcast_dtypes(df):   
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [28]:
sales = pd.read_csv('input/sales_train_v2.csv')
shops = pd.read_csv('input/shops.csv')
items = pd.read_csv('input/items.csv')
item_cats = pd.read_csv('input/item_categories.csv')

In [29]:
#sales = sales[sales['shop_id'].isin([26, 27, 28])]

In [30]:
test = pd.read_csv('input/test.csv', dtype={'ID': 'int32', 'shop_id': 'int32','item_id': 'int32'})
test['date_block_num'] = 34
test.drop(['ID'], axis=1, inplace=True)
#test =  test.merge(item_categories[['item_cat_id_fix', 'item_category_id']], on = ['item_category_id'], how = 'left')


In [31]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)


# Groupby data to get shop-item-month aggregates
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})

# Fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 

# Join it to the grid
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb 
gc.collect();

In [32]:
all_data = pd.concat([all_data, test], axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [33]:
# List of columns that we will use to create lags
cols_to_rename = list(all_data.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)


del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12] 

# List of all lagged features
fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [None]:
# Get Mean Encodings

In [34]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = all_data['date_block_num']

last_block = dates.max()
print('Test `date_block_num` is %d' % last_block)

Test `date_block_num` is 34


In [78]:

dates_train = dates[dates <  last_block]
#dates_val = dates[dates ==  last_block - 1]
dates_test  = dates[dates == last_block]

X_train = all_data.loc[dates <  last_block].drop(to_drop_cols, axis=1)
#X_val = all_data.loc[dates ==  last_block - 1].drop(to_drop_cols, axis=1)
X_test =  all_data.loc[dates == last_block].drop(to_drop_cols, axis=1)

y_train = all_data.loc[dates <  last_block, 'target'].values
#y_val = all_data.loc[dates ==  last_block - 1, 'target'].values
y_test =  all_data.loc[dates == last_block, 'target'].values

In [79]:
lr = LinearRegression()
lr.fit(X_train.values, y_train)
#pred_val_lr = lr.predict(X_val.values)
pred_lr = lr.predict(X_test.values)

#print('Test R-squared for linreg is %f' % r2_score(y_val, pred_val_lr))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Test R-squared for linreg is 0.255000


In [80]:
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
#pred_val_lgb = model.predict(X_val)
pred_lgb = model.predict(X_test)
print(y_test.shape)

#print('Test R-squared for LightGBM is %f' % r2_score(y_val, pred_val_lgb))

(214200,)


In [None]:
# XGB - , (validation, Y_validation)
xgb_model = XGBRegressor(max_depth=8, 
                         n_estimators=500, 
                         min_child_weight=1000,  
                         colsample_bytree=0.7, 
                         subsample=0.7, 
                         eta=0.3, 
                         seed=0)
xgb_model.fit(train, 
              Y_train, 
              eval_metric="rmse", 
              eval_set=[(X_train, y_train)], 
              verbose=20,
              early_stopping_rounds=20)

In [81]:
len(pred_lr),len(pred_lgb)

(214200, 214200)

In [82]:
X_test_level2 = np.c_[pred_lr, pred_lgb]
print(X_test_level2.shape)

(214200, 2)


In [83]:
dates_train_level2 = dates_train[dates_train.isin([27, 28, 29, 30, 31, 32,33])]

# That is how we get target for the 2nd level dataset
y_train_level2 = y_train[dates_train.isin([27, 28, 29, 30, 31, 32, 33])]

In [86]:
X_train_level2 = np.zeros([y_train_level2.shape[0], 2])
slice_start = 0

# Now fill `X_train_level2` with metafeatures
for cur_block_num in [27, 28, 29, 30, 31, 32, 33]:
    preds = []        

    print(cur_block_num)
    
    X_train = all_data.loc[dates <  cur_block_num].drop(to_drop_cols, axis=1)
    X_test =  all_data.loc[dates == cur_block_num].drop(to_drop_cols, axis=1)
    
    y_train = all_data.loc[dates <  cur_block_num, 'target'].values
    
    # Fit LinearRegression
    lr.fit(X_train.values, y_train)
    pred_lr = lr.predict(X_test.values)
    
    # Fit LGB
    model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
    pred_lgb = model.predict(X_test)
    
    # Fit KNN Model
    knn_model = KNeighborsRegressor(n_neighbors=9, leaf_size=13, n_jobs=-1)
    knn_model.fit(X_train, y_train)
    knn_pred = knn_model.predict(X_test)
    
    # Fit Random Forest    
    rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    
    # Fit XGB
    xgb_model.fit(X_train, 
              y_train, 
              #eval_metric="rmse", 
              #eval_set=[(X_train, y_train)], 
              verbose=20,
              early_stopping_rounds=20)
    xgb_pred = xgb_model.predict(test)
    
    preds.append(pred_lr)
    preds.append(pred_lgb)
    preds.append(xgb_pred)
    preds.append(knn_pred)    
    preds.append(rf_pred)
        
    indexes = dates_train_level2.index[dates_train_level2 == cur_block_num]
    
    #for i in range(pred_lgb.shape[0]):
    #    X_train_level2[indexes[i] - 120192 ] = [pred_lr[i], pred_lgb[i]] 
        
    slice_end = slice_start + X_test.shape[0]
    X_train_level2[slice_start : slice_end , :] = np.c_[preds].transpose()
    slice_start = slice_end
    
    
# Sanity check
#assert np.all(np.isclose(X_train_level2.mean(axis=0), [ 1.50148988,  1.38811989]))

27


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

28


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

29


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

30


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

31


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

32


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

33


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [85]:
slice_end,slice_start
#len(np.c_[preds].transpose())
#X_train_level2[slice_start : slice_end , :] = np.c_[preds].transpose()
#X_train_level2[slice_start : slice_end].shape
#y_train_level2.shape[0]

y_train[dates_train.isin([27, 28, 29, 30, 31, 32, 33])].shape
dates_train[dates_train.isin([27, 28, 29, 30, 31, 32,33])].shape

(1614364, 1376192)

(1614364,)

(1614364,)

In [88]:
# Ensembling

In [89]:
alphas_to_try = np.linspace(0, 1, 1001)


best_alpha = -1
r2_train_simple_mix = -1

for current_alpha in alphas_to_try:
    mix = current_alpha * X_train_level2[:,0] + (1 - current_alpha) * X_train_level2[:,1]
    current_r2 = r2_score(y_train_level2, mix)
    if(current_r2 > r2_train_simple_mix):
        r2_train_simple_mix = current_r2
        best_alpha = current_alpha

print('Best alpha: %f; Corresponding r2 score on train: %f' % (best_alpha, r2_train_simple_mix))

Best alpha: 0.298000; Corresponding r2 score on train: 0.266232


In [90]:

test_preds = best_alpha * X_test_level2[:,0] + (1 - best_alpha) * X_test_level2[:,1]

#r2_test_simple_mix = r2_score(y_test, test_preds)

print('Test R-squared for simple mix is %f' % r2_test_simple_mix)

Test R-squared for simple mix is 0.000000


In [91]:
#Stacking

In [92]:
lr.fit(X_train_level2, y_train_level2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [93]:
train_preds = lr.predict(X_train_level2)
r2_train_stacking = r2_score(y_train_level2, train_preds)

test_preds = lr.predict(X_test_level2)
#r2_test_stacking = r2_score(y_test, test_preds)

print('Train R-squared for stacking is %f' % r2_train_stacking)
#print('Test  R-squared for stacking is %f' % r2_test_stacking)

Train R-squared for stacking is 0.269972
Test  R-squared for stacking is 0.000000


In [94]:
# Write to file to Submit
test = pd.read_csv('input/test.csv', dtype={'ID': 'int32', 'shop_id': 'int32','item_id': 'int32'})
prediction_df = pd.DataFrame(test['ID'], columns=['ID'])
prediction_df['item_cnt_month'] = test_preds.clip(0., 20.)
prediction_df.to_csv('submission_final.csv', index=False)
prediction_df.head(10)

Unnamed: 0,ID,item_cnt_month
0,0,0.514411
1,1,0.143912
2,2,0.945865
3,3,0.29499
4,4,1.446523
5,5,0.46943
6,6,1.067788
7,7,0.191685
8,8,1.327031
9,9,0.320749
