In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import gc


import matplotlib.pyplot as plt
import seaborn as sns
import itertools


from sklearn.model_selection import KFold
from sklearn.neighbors import NearestNeighbors
#from sklearn.ensemble import GradientBoostingRegressor
#import xgboost as xgb
#from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
# Competition page: https://www.kaggle.com/c/competitive-data-science-final-project
# demo: when True, only three shops are kept and the last month of `train` is held out as the test set.
demo = False   #process only 3 shops to speed up
# Key columns identifying one (month, shop, item) row; used for grouping and merging throughout.
groupby_cols = ['date_block_num', 'shop_id', 'item_id']
# Offsets (in date_block_num units) for which lagged copies of the features are created.
shift_range = [1, 2, 3, 4, 5, 6, 12]
# Folder the input CSV files are read from.
DATA_FOLDER = './' #'../readonly/final_project_data/'
# Neighbourhood sizes for the KNN features; the largest value is the number of neighbours queried.
k_list = [3, 5, 8, 32]


In [3]:
def downcast_dtypes(df):
    '''
        Downcast the numeric columns of `df` to 32-bit types, in place.

                `float64` columns                    -> `float32`
                `int64` columns                      -> `int32`
                columns whose name starts with
                `target` (whatever their dtype)      -> `int32`

        The float cast runs first, so a `float64` column named `target*`
        ends up as `int32`. Such columns must not contain NaN, otherwise the
        integer cast raises.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to convert. It is modified in place.

        Returns
        -------
        pandas.DataFrame
            The same object that was passed in, for convenient reassignment.
    '''

    # Select columns to downcast. Column labels are not required to be
    # strings, so only string labels are tested for the `target` prefix.
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [
        c for c in df
        if (isinstance(c, str) and c.startswith('target')) or (df[c].dtype == "int64")
    ]

    # Downcast; empty selections are skipped so that no empty block
    # assignment is performed.
    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)
    if int_cols:
        df[int_cols] = df[int_cols].astype(np.int32)

    return df

In [4]:
import os


# List the data folder so that missing input files are visible before loading.
for file in os.listdir(DATA_FOLDER):
    print(file)


# `train` and `test` are used by the cells that follow (demo split, grid
# construction, aggregations), so they have to be loaded here for the
# notebook to run on a fresh kernel. The `ID` column of the test file is
# dropped so that `test` only carries shop_id / item_id.
train     = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv.gz'))
items     = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
#item_cats = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
#shops     = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))
test      = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv.gz')).drop(columns=['ID'])

.ipynb_checkpoints
00_final_project.ipynb
01_mean_encodings.h5
01_mean_encodings.ipynb
02_historic_features.h5
02_historic_features.ipynb
03_dates.h5
03_knn_features.h5
03_knn_features.ipynb
03_knn_features_target.h5
04_other_features.ipynb
boost_pred.csv
items.csv
item_categories.csv
knn_distances.npy
knn_indexes.npy
knn_labels.npy
sales_train.csv.gz
sample_submission.csv.gz
shops.csv
test.csv.gz
xgboost.ipynb


In [5]:
if demo:
    # Demo mode: keep three shops and hold out the last month present in
    # `train` as a stand-in test set.
    last_block_num = 33
    train = train[ train['shop_id'].isin([26, 27, 28]) ]
    # Explicit copies, so that later modifications of `test` / `train` never
    # operate on a slice of the original frame (avoids SettingWithCopyWarning
    # and keeps the two frames independent).
    test = train[ train['date_block_num'] == last_block_num ].copy()
    train = train[ train['date_block_num'] < last_block_num ].copy()

else:
    # Full mode: the provided test set is tagged with the month that follows
    # the demo hold-out month.
    last_block_num = 34
    test['date_block_num'] = last_block_num

In [6]:
# Build, for every month, the full grid of (month, shop, item) combinations
# made of the shops and the items that appear in that month.
all_shops_items = []

for block_num in train['date_block_num'].unique():
    # Filter the month once and reuse it for both the shop and the item lists.
    month_sales = train[ train['date_block_num']==block_num ]
    unique_shops = month_sales['shop_id'].unique()
    unique_items = month_sales['item_id'].unique()

    # Vectorised Cartesian product. The row order is the one
    # itertools.product([block_num], unique_shops, unique_items) would give:
    # shops vary slowest, items fastest. No Python tuples are materialised.
    all_shops_items.append( np.column_stack([
        np.full(len(unique_shops) * len(unique_items), block_num),
        np.repeat(unique_shops, len(unique_items)),
        np.tile(unique_items, len(unique_shops)),
    ]).astype('int32') )


train_data = pd.DataFrame( np.vstack( all_shops_items ), columns=groupby_cols )




In [11]:
# Monthly aggregates merged onto the grid, one entry per feature:
# (merge keys, source column in `train`, aggregation, resulting column name).
aggregations = [
    (groupby_cols,                  'item_cnt_day', 'sum',  'target'),
    (['date_block_num', 'shop_id'], 'item_cnt_day', 'sum',  'target_shop'),
    (['date_block_num', 'item_id'], 'item_cnt_day', 'sum',  'target_item'),
    (['date_block_num', 'item_id'], 'item_price',   'mean', 'item_price_mean'),
]

for keys, source_col, agg_func, feature_name in aggregations:
    # One row per key combination, with the aggregate stored under `feature_name`.
    groupby = train.groupby(keys)[source_col].agg(agg_func).rename(feature_name).reset_index()
    # Left merge keeps every grid row; combinations without sales get 0.
    train_data = pd.merge(train_data, groupby, how='left', on=keys).fillna(0)


# Clipping and Indexing

In [13]:
# Limit the monthly target to the [0, 20] interval.
train_data['target'] = train_data['target'].clip(lower=0, upper=20)

# Order the rows by month, shop and item, then renumber them from zero.
train_data = (
    train_data
    .sort_values(['date_block_num','shop_id','item_id'])
    .reset_index(drop=True)
)

if demo:
    # The demo test set is cut out of `train`, so the raw sales columns are removed from it.
    test = test.drop(columns=['date', 'item_price', 'item_cnt_day'])


In [14]:
# Drop the intermediates that are no longer needed and collect garbage.
del all_shops_items
del groupby
gc.collect();

In [15]:
# Display the last rows of the assembled training grid as a sanity check.
train_data.tail()

Unnamed: 0,date_block_num,shop_id,item_id,target,target_shop,target_item,item_price_mean
10913845,33,59,22162,0.0,790.0,10.0,349.0
10913846,33,59,22163,0.0,790.0,26.0,162.6984
10913847,33,59,22164,0.0,790.0,15.0,549.0
10913848,33,59,22166,0.0,790.0,11.0,150.0
10913849,33,59,22167,0.0,790.0,37.0,293.098437


# Mean Encodings

In [16]:
%%time
train_data['item_target_enc'] = np.nan

kfold = KFold(n_splits=5, shuffle=False)
for train_index, val_index in kfold.split(train_data.index):
    #X_val = train_data.iloc[val_index]
    #X_train = train_data.iloc[train_index]
    
    item_id_target_mean = train_data.loc[train_index].groupby('item_id').target.mean()
    train_data.loc[val_index, 'item_target_enc'] = train_data.loc[val_index, 'item_id'].map(item_id_target_mean)


#train_data['item_target_enc'].fillna(0.3343, inplace=True)     
train_data['item_target_enc'].fillna(train_data['target'].mean(), inplace=True)

Wall time: 3min 21s


In [17]:
# Stack the test rows below the training rows. DataFrame.append no longer
# exists in pandas 2.0+, pd.concat is the supported way to do this.
# ignore_index=True gives the combined frame unique row labels; columns that
# are missing in `test` are filled with 0.
all_data = pd.concat([train_data, test], ignore_index=True, sort=False).fillna(0)
dates = all_data['date_block_num']

In [21]:
# Mean target per item over every month that precedes the test month.
item_id_target_mean = all_data.loc[dates < last_block_num].groupby('item_id')['target'].mean()

In [24]:
#all_data.loc[dates == last_block_num, 'item_target_enc']

In [25]:
# Encode the test-month rows with the per-item means learned on earlier months.
is_test_month = dates == last_block_num
all_data.loc[is_test_month, 'item_target_enc'] = all_data.loc[is_test_month, 'item_id'].map(item_id_target_mean)

# Items without history get the mean target of the earlier months. Test-month
# rows are left out of that mean because their `target` is only the 0
# placeholder written when the frames were combined. The result is assigned
# back rather than using a chained fillna(inplace=True), which does not update
# the frame under pandas Copy-on-Write.
fallback_mean = all_data.loc[dates < last_block_num, 'target'].mean()
all_data['item_target_enc'] = all_data['item_target_enc'].fillna(fallback_mean)

In [26]:
# Checkpoint the mean-encoded frame. `key` is passed by keyword: newer pandas
# versions accept only `path_or_buf` positionally.
all_data.to_hdf("01_mean_encodings.h5", key='key1')

In [5]:
# Restore the mean-encoded frame from its checkpoint file.
all_data = pd.read_hdf("01_mean_encodings.h5", key='key1')

## Historical values

In [None]:
%%time
shifted_columns = list( all_data.columns.difference(groupby_cols) )

for shift in tqdm_notebook(shift_range):
    
    #print("starting shift", shift)
    shifted_data = all_data[groupby_cols + shifted_columns].copy()
    shifted_data['date_block_num'] = shifted_data['date_block_num'] + shift
    
    foo = lambda x: '{}_lag_{}'.format(x, shift) if x in shifted_columns else x
    shifted_data = shifted_data.rename(columns=foo)
    
    #print("starting merge", shift)
    all_data = pd.merge(all_data, shifted_data, how='left', on=groupby_cols).fillna(0)
    all_data = downcast_dtypes(all_data)

del shifted_data

In [None]:
# Attach the category id of every item.
item_category = items.loc[:, ['item_id', 'item_category_id']].drop_duplicates()
all_data = all_data.merge(item_category, on=['item_id'], how='left')

In [17]:
# Convert 64-bit numeric columns (and `target*` columns) of the frame to 32-bit types.
all_data = downcast_dtypes(all_data)

In [None]:
# Display the last rows of the frame for a quick visual check.
all_data.tail()

In [21]:
# Keep months 12 and later (presumably because earlier months have no
# 12-month history; confirm if shift_range changes) and renumber the rows.
all_data = all_data[all_data['date_block_num'] >= 12].reset_index(drop=True)

# Model inputs are the lag columns plus the item category. The full
# '_lag_<shift>' suffix is matched, so multi-digit shifts such as 10 or 12 are
# recognised and unrelated columns that happen to end in a digit are not.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)] + ['item_category_id']

# Everything that is neither a model input nor a key column is dropped, and so
# is date_block_num. Column order is kept so the printed list is stable.
keep_cols = set(fit_cols) | set(groupby_cols)
to_drop_cols = [col for col in all_data.columns if col not in keep_cols] + ['date_block_num']
print(to_drop_cols)


['target', 'target_shop', 'item_target_enc', 'item_price_mean', 'target_item', 'date_block_num']


In [22]:
#all_data.dtypes

In [None]:
# Checkpoint the frame with lag features. `key` is passed by keyword: newer
# pandas versions accept only `path_or_buf` positionally.
all_data.to_hdf("02_historic_features.h5", key='key1')

In [20]:
# Restore the frame with lag features from its checkpoint file.
all_data = pd.read_hdf("02_historic_features.h5", key='key1')

In [42]:
dates = all_data['date_block_num']

# Feature matrix and target for every row.
X_all = all_data.drop(columns=to_drop_cols)
y_all = all_data['target']

# Months before `last_block_num` form the training part, that month itself the test part.
X_train, y_train = X_all.loc[dates < last_block_num], y_all.loc[dates < last_block_num]
X_test, y_test = X_all.loc[dates == last_block_num], y_all.loc[dates == last_block_num]

In [24]:
# Check the dtype of the training target.
y_train.dtype

dtype('int32')

In [25]:
%%time
nnf = NearestNeighbors(n_neighbors=max(k_list))
nnf.fit(X_train, y_train)


Wall time: 12.3 s


In [26]:
%%time
(dist, ind) = nnf.kneighbors(X_all)

Wall time: 3min 16s


In [27]:
%%time
labels = np.stack( y_all.index.map( lambda x: np.array( y_train.loc[ ind[x] ] , dtype=np.int32) ) )

Wall time: 1min 18s


# Save / Load KNN results

In [28]:
# Persist the neighbour distances, indices and labels.
np.save(file='knn_distances.npy', arr=dist)
np.save(file='knn_indexes.npy', arr=ind)
np.save(file='knn_labels.npy', arr=labels)


In [29]:
# Reload the neighbour distances, indices and labels from disk.
dist, ind, labels = (
    np.load(file_name)
    for file_name in ('knn_distances.npy', 'knn_indexes.npy', 'knn_labels.npy')
)

In [43]:
# Remove knn_* columns left by an earlier run, so the cell can be re-executed
# without producing duplicate columns.
X_all = X_all.drop(columns=[col for col in X_all.columns if str(col).startswith('knn_')])

# Mean and median of the targets of the k nearest neighbours, for every k.
knn_features = {}
for k in k_list:
    nearest_labels = labels[:, :k]
    knn_features["knn_mean_"+str(k)] = np.mean(nearest_labels, axis=1).astype(np.float32)
    knn_features["knn_median_"+str(k)] = np.median(nearest_labels, axis=1).astype(np.float32)

# Single concat. The new columns take X_all's own index, so rows are matched
# by position; a length mismatch between `labels` and X_all raises instead of
# silently producing NaN rows.
knn_frame = pd.DataFrame(knn_features, index=X_all.index, columns=list(knn_features))
X_all = pd.concat([X_all, knn_frame], axis=1)
    
    

In [44]:
# Display the last rows of the feature matrix, including the new knn_* columns.
X_all.tail()

Unnamed: 0,item_id,shop_id,item_price_mean_lag_1,item_target_enc_lag_1,target_lag_1,target_item_lag_1,target_shop_lag_1,item_price_mean_lag_2,item_target_enc_lag_2,target_lag_2,...,target_shop_lag_12,item_category_id,knn_mean_3,knn_median_3,knn_mean_5,knn_median_5,knn_mean_8,knn_median_8,knn_mean_32,knn_median_32
158225,22001,28,0.0,0.0,0,0,0,0.0,0.0,0,...,0,40,0.333333,0.0,0.4,0.0,0.375,0.0,0.71875,0.0
158226,22001,28,0.0,0.0,0,0,0,0.0,0.0,0,...,0,40,0.333333,0.0,0.4,0.0,0.375,0.0,0.71875,0.0
158227,22001,28,0.0,0.0,0,0,0,0.0,0.0,0,...,0,40,0.333333,0.0,0.4,0.0,0.375,0.0,0.71875,0.0
158228,21811,28,0.0,0.0,0,0,0,0.0,0.0,0,...,0,37,0.333333,0.0,0.8,1.0,0.625,0.5,0.59375,0.5
158229,20949,28,5.0,18.8125,20,327,2979,5.0,18.8125,20,...,4695,71,20.0,20.0,20.0,20.0,13.375,20.0,5.375,1.5


In [50]:
# Checkpoint the features, the target and the month of every row. `key` is
# passed by keyword: newer pandas versions accept only `path_or_buf` positionally.
X_all.to_hdf("03_knn_features.h5", key='key1')
y_all.to_hdf("03_knn_features_target.h5", key='key1')
dates.to_hdf("03_dates.h5", key='key1')

In [51]:
# Restore the features, the target and the month of every row from their checkpoint files.
X_all = pd.read_hdf("03_knn_features.h5", key='key1')
y_all = pd.read_hdf("03_knn_features_target.h5", key='key1')
dates = pd.read_hdf("03_dates.h5", key='key1')

In [53]:
# Display ten randomly chosen rows; no random_state is set, so the selection differs between runs.
X_all.sample(10)

Unnamed: 0,item_id,shop_id,item_price_mean_lag_1,item_target_enc_lag_1,target_lag_1,target_item_lag_1,target_shop_lag_1,item_price_mean_lag_2,item_target_enc_lag_2,target_lag_2,...,target_shop_lag_12,item_category_id,knn_mean_3,knn_median_3,knn_mean_5,knn_median_5,knn_mean_8,knn_median_8,knn_mean_32,knn_median_32
51509,5762,26,0.0,0.0,0,0,0,0.0,0.0,0,...,0,75,0.0,0.0,0.0,0.0,0.0,0.0,0.09375,0.0
143569,13720,27,229.0,1.223574,1,1,3357,229.0,1.223574,1,...,0,69,2.0,2.0,1.6,2.0,1.125,1.0,1.3125,1.0
89727,10325,26,0.0,0.0,0,0,0,0.0,0.0,0,...,3391,40,0.0,0.0,0.4,0.0,0.25,0.0,0.375,0.0
81017,482,26,3163.636475,3.207792,0,11,1900,3187.5,3.207792,3,...,2409,73,1.0,0.0,0.6,0.0,0.75,0.0,0.90625,1.0
53815,2505,27,299.0,0.722222,0,1,4282,299.0,0.461538,0,...,4148,55,0.333333,0.0,0.6,1.0,0.625,1.0,0.5,0.5
86244,2754,28,999.0,1.767857,1,1,4695,763.140015,1.767857,0,...,6454,19,3.0,2.0,2.4,2.0,3.125,2.5,3.125,2.0
117953,3028,28,2599.0,3.0,1,3,4234,2599.0,3.0,7,...,0,23,1.666667,2.0,1.4,2.0,1.75,2.0,2.40625,2.0
158030,15066,28,799.0,6.0,0,1,2979,699.125,6.0,4,...,0,28,2.0,1.0,2.0,1.0,1.75,1.0,1.65625,1.0
54590,8465,27,0.0,0.0,0,0,0,349.0,0.958333,1,...,0,43,0.0,0.0,0.4,0.0,0.5,0.5,0.53125,0.5
81566,5204,26,2599.0,1.1,2,3,1900,0.0,0.0,0,...,0,19,1.0,1.0,1.0,1.0,0.875,1.0,1.28125,1.0
