In [4]:
import numpy as np 
import pandas as pd
import os
import gc

# feature libraries
from scipy.sparse import hstack, csr_matrix

In [5]:
# Project directory layout, relative to the notebook's working directory:
# engineered features are written to feat_dir, raw competition files are read from data_dir.
feat_dir = '../input/features'
data_dir = '../input/kaggle_data'

In [6]:
# Only the identifier columns are read from the ad tables.
used_cols = ['item_id','user_id']

# Labelled ads. Each file is parsed once; the *_idx frames below are copies
# instead of a second read of the same CSV with the same columns.
train = pd.read_csv(f'{data_dir}/train.csv', usecols=used_cols)
test = pd.read_csv(f'{data_dir}/test.csv', usecols=used_cols)

# Row-order reference frames: the final feature tables are built by merging onto
# these, so they are kept separate from the frames that get pooled below.
train_idx = train.copy()
test_idx = test.copy()

# Additional ads, read with the same two identifier columns.
train_active = pd.read_csv(f'{data_dir}/train_active.csv', usecols=used_cols)
test_active = pd.read_csv(f'{data_dir}/test_active.csv', usecols=used_cols)

# Activation periods per item; the two boundary columns are parsed as datetimes.
train_periods = pd.read_csv(f'{data_dir}/periods_train.csv', parse_dates=['date_from','date_to'])
test_periods = pd.read_csv(f'{data_dir}/periods_test.csv', parse_dates=['date_from','date_to'])

In [7]:
# Pool every known ad (train, test and their "active" counterparts) into one
# frame with a fresh 0..n-1 index, then keep the first row seen for each item_id.
all_samples = pd.concat(
    [train, train_active, test, test_active],
    ignore_index=True,
).drop_duplicates(['item_id'])

# The active tables are not referenced again after pooling; drop the names and
# ask the garbage collector to reclaim the memory.
del train_active, test_active
gc.collect()

14

In [8]:
# Stack the train and test activation periods into a single frame
# (original row labels are kept, so the index may contain repeats).
period_frames = [train_periods, test_periods]
all_periods = pd.concat(period_frames)

# Release every reference to the per-split frames before collecting.
del period_frames, train_periods, test_periods
gc.collect()

0

In [9]:
# Length of each activation period in whole days. Subtracting the timestamps
# (instead of their day-of-year numbers) stays correct when a period crosses a
# year boundary, and gives the same value for periods inside one calendar year.
all_periods['days_up'] = (all_periods['date_to'] - all_periods['date_from']).dt.days

# Per-item totals: days online summed over all periods, and the number of periods.
# reset_index() already yields an 'item_id' column, so no rename is needed.
gp_df = (
    all_periods.groupby('item_id')['days_up']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'days_up_sum', 'count': 'times_put_up'})
    .reset_index()
)

# One row per item, enriched with the per-item totals and the owning user_id
# (user_id is NaN for items that do not appear in all_samples).
all_periods = (
    all_periods.drop_duplicates(['item_id'])
    .merge(gp_df, on='item_id', how='left')
    .merge(all_samples, on='item_id', how='left')
)

cols = ['days_up_sum', 'times_put_up']


def _user_stat(item_totals, stat):
    """Aggregate the per-item totals per user.

    Parameters
    ----------
    item_totals : DataFrame with 'user_id', 'days_up_sum' and 'times_put_up' columns.
    stat : name of the groupby aggregation ('mean', 'median', 'min' or 'max').

    Returns
    -------
    DataFrame with 'user_id', 'days_up_user_<stat>' and 'times_up_user_<stat>' columns.
    """
    return (
        item_totals.groupby('user_id')[cols]
        .agg(stat)
        .reset_index()
        .rename(columns={
            'days_up_sum': f'days_up_user_{stat}',
            'times_put_up': f'times_up_user_{stat}',
        })
    )


# Per-user summaries of how long / how often that user's items were up.
gp_mean = _user_stat(all_periods, 'mean')
gp_median = _user_stat(all_periods, 'median')
gp_min = _user_stat(all_periods, 'min')
gp_max = _user_stat(all_periods, 'max')

# Number of distinct items per user, counted over the pooled (de-duplicated) ads.
n_user_items = (
    all_samples.groupby('user_id')[['item_id']]
    .count()
    .reset_index()
    .rename(columns={'item_id': 'n_user_items'})
)


In [10]:
# Combine all per-user statistics into one table keyed by user_id. Outer joins
# keep users that are present in only some of the inputs.
gp = gp_mean
for user_stats in (gp_median, gp_min, gp_max, n_user_items):
    gp = gp.merge(user_stats, how='outer', on='user_id')

gp.head()

Unnamed: 0,user_id,days_up_user_mean,times_up_user_mean,days_up_user_median,times_up_user_median,days_up_user_min,times_up_user_min,days_up_user_max,times_up_user_max,n_user_items
0,00000077ff21,12.5,2.0,12.5,2.0,10.0,2.0,15.0,2.0,2
1,000006497719,19.0,2.0,19.0,2.0,19.0,2.0,19.0,2.0,1
2,00000b4d72f6,3.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,1
3,00000d642d7e,13.0,1.0,13.0,1.0,13.0,1.0,13.0,1.0,2
4,0000126b80a4,12.0,1.75,10.5,1.5,6.0,1.0,19.0,4.0,8


In [14]:
# List the columns of gp that contain at least one missing value.
[col for col in gp.columns if gp[col].isna().any()]

['days_up_user_mean',
 'times_up_user_mean',
 'days_up_user_median',
 'times_up_user_median',
 'days_up_user_min',
 'times_up_user_min',
 'days_up_user_max',
 'times_up_user_max']

In [15]:
# Impute the per-user uptime statistics that contain NaN: each column gets a
# 0/1 "<column>_missing" indicator and its gaps are filled with the sentinel -999.
miss_cols = ['days_up_user_mean','times_up_user_mean','days_up_user_median','times_up_user_median','days_up_user_min',
             'times_up_user_min','days_up_user_max','times_up_user_max']
for c in miss_cols:
    # Record missingness before filling so the sentinel can be told apart from real values.
    gp[c+'_missing'] = gp[c].isnull().astype('int64')
    # Assign the result back: an in-place fillna on the selected column is not
    # guaranteed to write through to gp (it is a no-op under pandas Copy-on-Write).
    gp[c] = gp[c].fillna(-999)

In [22]:
# Z-score every numeric per-user feature: subtract the column mean and divide by
# the column's population standard deviation (np.std defaults to ddof=0).
# NOTE(review): where missing values were filled with the -999 sentinel, that sentinel
# takes part in the mean/std here and compresses the real values — confirm this is intended.
time_cols = ['days_up_user_mean','times_up_user_mean','days_up_user_median','times_up_user_median','days_up_user_min',
             'times_up_user_min','days_up_user_max','times_up_user_max','n_user_items']
for name in time_cols:
    column = gp[name]
    centre = np.mean(column)
    spread = np.std(column)
    gp[name] = (column - centre)/spread

In [23]:
# Preview the first five rows of the per-user feature table.
gp.head(n=5)

Unnamed: 0,user_id,days_up_user_mean,times_up_user_mean,days_up_user_median,times_up_user_median,days_up_user_min,times_up_user_min,days_up_user_max,times_up_user_max,n_user_items,days_up_user_mean_missing,times_up_user_mean_missing,days_up_user_median_missing,times_up_user_median_missing,days_up_user_min_missing,times_up_user_min_missing,days_up_user_max_missing,times_up_user_max_missing
0,00000077ff21,0.300482,0.29541,0.299829,0.295434,0.298853,0.296188,0.304125,0.294561,-0.124947,0,0,0,0,0,0,0,0
1,000006497719,0.3243,0.29541,0.323643,0.295434,0.331899,0.296188,0.31876,0.294561,-0.175681,0,0,0,0,0,0,0,0
2,00000b4d72f6,0.265669,0.291712,0.265023,0.291735,0.27315,0.292489,0.260219,0.290863,-0.175681,0,0,0,0,0,0,0,0
3,00000d642d7e,0.302314,0.291712,0.301661,0.291735,0.309868,0.292489,0.296807,0.290863,-0.124947,0,0,0,0,0,0,0,0
4,0000126b80a4,0.298649,0.294485,0.292501,0.293584,0.284166,0.292489,0.31876,0.301956,0.179454,0,0,0,0,0,0,0,0


In [24]:
# Feature names are every column of gp after the first one (the first is the user_id key).
feat_cols = gp.columns[1:].tolist()
feat_cols

['days_up_user_mean',
 'times_up_user_mean',
 'days_up_user_median',
 'times_up_user_median',
 'days_up_user_min',
 'times_up_user_min',
 'days_up_user_max',
 'times_up_user_max',
 'n_user_items',
 'days_up_user_mean_missing',
 'times_up_user_mean_missing',
 'days_up_user_median_missing',
 'times_up_user_median_missing',
 'days_up_user_min_missing',
 'times_up_user_min_missing',
 'days_up_user_max_missing',
 'times_up_user_max_missing']

In [25]:
# Preview the first five (item_id, user_id) pairs the train features are aligned to.
train_idx.head(n=5)

Unnamed: 0,item_id,user_id
0,b912c3c6a6ad,e00f8ff2eaf9
1,2dac0150717d,39aeb48f0017
2,ba83aefab5dc,91e2f88dd6e3
3,02996f1dd2ea,bf5cccea572d
4,7c90be56d2ab,ef50846afc0b


In [28]:
# Attach the per-user features to every train ad (left join keeps all train rows
# in their original order), index the result by item_id and drop the helper columns.
train_uptime = (
    train_idx.reset_index()
    .merge(gp, on='user_id', how="left")
    .set_index('item_id')
    .drop(columns=['user_id', 'index'])
)
train_uptime.head()

Unnamed: 0_level_0,days_up_user_mean,times_up_user_mean,days_up_user_median,times_up_user_median,days_up_user_min,times_up_user_min,days_up_user_max,times_up_user_max,n_user_items,days_up_user_mean_missing,times_up_user_mean_missing,days_up_user_median_missing,times_up_user_median_missing,days_up_user_min_missing,times_up_user_min_missing,days_up_user_max_missing,times_up_user_max_missing
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
b912c3c6a6ad,0.283992,0.29541,0.283342,0.295434,0.291509,0.296188,0.278513,0.294561,-0.124947,0,0,0,0,0,0,0,0
2dac0150717d,-3.4061,-3.406584,-3.40603,-3.406583,-3.406014,-3.406586,-3.405952,-3.406577,-0.175681,1,1,1,1,1,1,1,1
ba83aefab5dc,0.270904,0.29224,0.265023,0.291735,0.265807,0.292489,0.304125,0.294561,0.230187,0,0,0,0,0,0,0,0
02996f1dd2ea,0.315925,0.297787,0.319979,0.299132,0.284166,0.292489,0.31876,0.301956,1.397058,0,0,0,0,0,0,0,0
7c90be56d2ab,-3.4061,-3.406584,-3.40603,-3.406583,-3.406014,-3.406586,-3.405952,-3.406577,-0.175681,1,1,1,1,1,1,1,1


In [29]:
# Attach the per-user features to every test ad (left join keeps all test rows
# in their original order), index the result by item_id and drop the helper columns.
test_uptime = (
    test_idx.reset_index()
    .merge(gp, on='user_id', how="left")
    .set_index('item_id')
    .drop(columns=['user_id', 'index'])
)
test_uptime.head()

Unnamed: 0_level_0,days_up_user_mean,times_up_user_mean,days_up_user_median,times_up_user_median,days_up_user_min,times_up_user_min,days_up_user_max,times_up_user_max,n_user_items,days_up_user_mean_missing,times_up_user_mean_missing,days_up_user_median_missing,times_up_user_median_missing,days_up_user_min_missing,times_up_user_min_missing,days_up_user_max_missing,times_up_user_max_missing
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6544e41a8817,0.282159,0.293561,0.287006,0.291735,0.265807,0.292489,0.289489,0.298258,0.077987,0,0,0,0,0,0,0,0
65b9484d670f,-3.4061,-3.406584,-3.40603,-3.406583,-3.406014,-3.406586,-3.405952,-3.406577,-0.175681,1,1,1,1,1,1,1,1
8bab230b2ecd,0.276663,0.291712,0.276015,0.291735,0.284166,0.292489,0.271195,0.290863,-0.02348,0,0,0,0,0,0,0,0
8e348601fefc,0.309887,0.297876,0.312652,0.299132,0.265807,0.292489,0.31876,0.298258,0.737523,0,0,0,0,0,0,0,0
8bd2fe400b89,0.278495,0.291712,0.272351,0.291735,0.27315,0.292489,0.296807,0.290863,0.027253,0,0,0,0,0,0,0,0


In [30]:
# Persist the feature tables (item_id index included) for downstream models.
# Create the output directory first so the export does not fail when it is missing.
os.makedirs(feat_dir, exist_ok=True)
train_uptime.to_csv(f'{feat_dir}/train_uptime.csv', index=True, header=True)
test_uptime.to_csv(f'{feat_dir}/test_uptime.csv', index=True, header=True)