#### *Basic information on these data sets can be viewed in 'Data Summary.ipynb'*
#### Features inspired by others' kernel contributions*

In [1]:
import numpy as np
import pandas as pd
pd.set_option('precision',4)
import sys
print (sys.version)

2.7.14 |Anaconda, Inc.| (default, Oct 15 2017, 03:34:40) [MSC v.1500 64 bit (AMD64)]


### Create User Order Features for Prior Orders

In [2]:
# Combine order and order_products_prior datasets
orders_df = pd.read_csv('input/orders.csv')
orders_df = orders_df[orders_df.eval_set=='prior']

order_products_prior_df = pd.read_csv('input/order_products__prior.csv')

prior_order_features_df = orders_df.merge(order_products_prior_df, how='left', on='order_id')

prior_order_features_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
31457173,2244419,199894,prior,49,3,22,0.0,47977,19,1
16743907,2231758,105943,prior,2,0,12,8.0,31562,5,0
28815286,2375130,183071,prior,6,5,11,14.0,12614,4,1
2459849,1938845,15813,prior,16,2,7,10.0,8021,7,1
13820716,3301873,87473,prior,9,4,23,2.0,29926,16,0
28747806,401846,182643,prior,1,5,16,,41559,13,0
12257184,1745257,77595,prior,18,0,11,30.0,4312,16,0
17312994,2680884,109587,prior,4,6,22,30.0,21938,5,1
25253514,3025856,160309,prior,6,6,10,13.0,26209,23,1
19517920,2145411,123701,prior,24,6,13,3.0,9327,6,0


In [3]:
prior_order_features_df['user_orders'] = prior_order_features_df.groupby('user_id')['order_id'].transform('count')
prior_order_features_df['basket_size'] = prior_order_features_df.groupby(['user_id','order_id'])['product_id'].transform('count')
prior_order_features_df['avg_basket_size'] = prior_order_features_df.groupby('user_id')['basket_size'].transform(np.mean)
prior_order_features_df['avg_days_between_orders'] = prior_order_features_df.groupby('user_id')['days_since_prior_order'].transform(np.mean)
prior_order_features_df['avg_add_to_cart_order'] = prior_order_features_df.groupby(['user_id','product_id'])['add_to_cart_order'].transform(np.mean)
prior_order_features_df['user_prod_reorders'] = prior_order_features_df.groupby(['user_id','product_id'])['reordered'].transform(np.sum)
prior_order_features_df['user_prod_reorder_rate'] = ( prior_order_features_df.user_prod_reorders / prior_order_features_df.user_orders )
prior_order_features_df['prod_order_count'] = prior_order_features_df.groupby('product_id')['add_to_cart_order'].transform('count')
prior_order_features_df['prod_reorders'] = prior_order_features_df.groupby('product_id')['reordered'].transform(np.sum)
prior_order_features_df['prod_reorder_rate'] = ( prior_order_features_df.prod_reorders / prior_order_features_df.prod_order_count )

In [4]:
prior_order_features_df.sample(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,user_orders,basket_size,avg_basket_size,avg_days_between_orders,avg_add_to_cart_order,user_prod_reorders,user_prod_reorder_rate,prod_order_count,prod_reorders,prod_reorder_rate
839001,386142,5425,prior,7,4,14,6.0,47766,8,1,128,12,12.0469,6.1466,2.75,7,0.0547,176815,134044,0.7581
15287560,2879879,96718,prior,7,1,12,24.0,41419,8,0,72,11,8.8056,11.0323,8.5,1,0.0139,216,108,0.5
7909924,93858,50139,prior,10,3,12,8.0,39121,8,0,136,10,10.1176,9.879,8.0,0,0.0,6676,4427,0.6631
15768344,10721,99767,prior,7,3,10,30.0,26209,19,1,171,34,21.7602,23.303,15.5,1,0.0058,140627,95768,0.681
7228201,824075,45847,prior,4,2,8,30.0,40120,5,1,68,11,9.9412,28.2373,6.3333,2,0.0294,3082,2061,0.6687
19731809,2811045,125128,prior,5,4,16,6.0,4704,2,1,722,22,16.4488,5.9367,2.2222,8,0.0111,128,66,0.5156
31858213,2936679,202509,prior,7,1,17,15.0,10106,3,1,74,9,8.4054,17.1343,2.5,1,0.0135,5504,3300,0.5996
22709206,156292,144237,prior,29,1,13,23.0,11365,14,1,181,14,7.6519,12.1029,5.4,4,0.0221,4337,2765,0.6375
17972755,3227363,113667,prior,1,4,13,,23333,5,0,90,6,7.6,20.631,5.6667,2,0.0222,788,407,0.5165
12015125,111829,76056,prior,19,0,15,13.0,5085,18,1,724,37,30.1547,12.6535,15.2,4,0.0055,1358,614,0.4521


In [18]:
# confirm record count and data types
prior_order_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 20 columns):
order_id                   int64
user_id                    int64
eval_set                   object
order_number               int64
order_dow                  int64
order_hour_of_day          int64
days_since_prior_order     float64
product_id                 int64
add_to_cart_order          int64
reordered                  int64
user_orders                int64
basket_size                int64
avg_basket_size            float64
avg_days_between_orders    float64
avg_add_to_cart_order      float64
user_prod_reorders         int64
user_prod_reorder_rate     float64
prod_order_count           int64
prod_reorders              int64
prod_reorder_rate          float64
dtypes: float64(6), int64(13), object(1)
memory usage: 6.3+ GB


In [17]:
# check for null or nan values
prior_order_features_df.isnull().sum()

order_id                         0
user_id                          0
eval_set                         0
order_number                     0
order_dow                        0
order_hour_of_day                0
days_since_prior_order     2078068
product_id                       0
add_to_cart_order                0
reordered                        0
user_orders                      0
basket_size                      0
avg_basket_size                  0
avg_days_between_orders          0
avg_add_to_cart_order            0
user_prod_reorders               0
user_prod_reorder_rate           0
prod_order_count                 0
prod_reorders                    0
prod_reorder_rate                0
dtype: int64

In [5]:
# Write to full set to file
prior_order_features_df.to_csv('transformed/prior_order_features.csv')

In [39]:
# check ratio of reordered products vs non-reordered products 'Null Accuracy Check'
prior_order_features_df.dropna().reordered.value_counts(normalize=True)

1    0.6301
0    0.3699
Name: reordered, dtype: float64

In [40]:
# sample of 1% of records or approximatly 300K excluding NaN records to compare reordered ratio 
prior_order_features_df.dropna().sample(frac=0.01).reordered.value_counts(normalize=True)

1    0.6313
0    0.3687
Name: reordered, dtype: float64

**It looks like the balance of reorders in the sample is almost identical to original set.**

In [41]:
prior_order_sample_df = prior_order_features_df.dropna().sample(frac=0.01)

In [42]:
# Write to sample to file
prior_order_sample_df.to_csv('transformed/prior_order_features_sample.csv')

### Evaluate Feature Importance

In [43]:
# Utilize RFC to evaluate feature predictive importance
from sklearn.ensemble import RandomForestClassifier

y = prior_order_sample_df['reordered']
X = prior_order_sample_df.drop(['reordered','order_id', 'user_id', 'eval_set','order_number'],axis=1)

cls = RandomForestClassifier(n_estimators=50)
cls.fit(X, y)

features = X.columns
feature_importances = cls.feature_importances_

features_df = pd.DataFrame({'features': features, 'importance': feature_importances})
features_df.sort_values('importance', inplace=True, ascending=False)

features_df.head()

Unnamed: 0,features,importance
11,user_prod_reorders,0.2953
12,user_prod_reorder_rate,0.2793
15,prod_reorder_rate,0.0486
6,user_orders,0.0433
9,avg_days_between_orders,0.0354


### We now have our top 5 features of significance using RFC but I will leave out the last since avg_days_between_orders will be less relevant to later orders.

In [None]:
# let's try Recursive feature elimination with cross-validation
# http://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html

import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

y = prior_order_features_df['reordered']
X = prior_order_features_df.drop(['reordered','order_id', 'user_id', 'eval_set','order_number'],axis=1)

svc = SVC(kernel="linear") # Create the RFE object and compute a cross-validated score.

# The "accuracy" scoring is proportional to the number of correct classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("# Features")
plt.ylabel("Cross Val / Precision Score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

### Add prior order features to training data for validation

In [44]:
# create validation data set, keeping features of importance
train_order_features_df = pd.read_csv("input/order_products__train.csv") 

orders_df = pd.read_csv('input/orders.csv')
orders_df = orders_df[orders_df.eval_set=='train'].iloc[:,:2]

train_order_features_df = train_order_features_df.merge(orders_df, how='left', on='order_id')

In [45]:
temp_prior_orders_df = prior_order_features_df[['user_id','user_orders']].drop_duplicates()

train_order_features_df = train_order_features_df.merge(temp_prior_orders_df, how='left', on='user_id')

In [46]:
temp_prior_orders_df = prior_order_features_df[['product_id', 'prod_reorder_rate']].drop_duplicates()

train_order_features_df = train_order_features_df.merge(temp_prior_orders_df, how='left', on='product_id')

In [47]:
temp_prior_orders_df = prior_order_features_df[['user_id','product_id','user_prod_reorder_rate','user_prod_reorders']].drop_duplicates()

train_order_features_df = train_order_features_df.merge(temp_prior_orders_df, how='left', on=['user_id','product_id'])

In [48]:
# confirm record count and data types
train_order_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384617 entries, 0 to 1384616
Data columns (total 9 columns):
order_id                  1384617 non-null int64
product_id                1384617 non-null int64
add_to_cart_order         1384617 non-null int64
reordered                 1384617 non-null int64
user_id                   1384617 non-null int64
user_orders               1384617 non-null int64
prod_reorder_rate         1384608 non-null float64
user_prod_reorder_rate    828824 non-null float64
user_prod_reorders        828824 non-null float64
dtypes: float64(3), int64(6)
memory usage: 105.6 MB


In [49]:
train_order_features_df.sample(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,user_orders,prod_reorder_rate,user_prod_reorder_rate,user_prod_reorders
1064960,2632923,33125,6,1,121166,256,0.6552,0.0156,4.0
1273664,3146511,44632,1,1,7595,35,0.7711,0.0857,3.0
144830,353498,19019,19,1,138252,246,0.6209,0.0122,3.0
270410,658358,19173,2,1,144339,78,0.6316,0.0,0.0
513454,1260872,37940,4,1,59319,121,0.6072,0.0083,1.0
1101329,2722322,39812,4,1,116371,153,0.4146,0.0,0.0
233444,570641,13861,4,1,54041,713,0.5992,0.0084,6.0
543568,1337570,30639,5,0,123206,25,0.7677,,
737143,1813484,33184,11,1,180493,114,0.3355,0.0175,2.0
347934,848536,17949,23,1,162474,132,0.7622,0.0,0.0


In [50]:
# replace NaN values with '0' and write to file
train_order_features_df = train_order_features_df.fillna(0)
train_order_features_df.to_csv('transformed/train_order_features.csv')

In [51]:
# check ratio of reordered products vs non-reordered products 'Null Accuracy Check'
train_order_features_df.reordered.value_counts(normalize=True)

1    0.5986
0    0.4014
Name: reordered, dtype: float64

** We see a lower reorder rate among the training data set from the prior order set.**

In [52]:
# capture sample 
train_order_features_df = train_order_features_df.sample(frac=0.1)
train_order_features_df.to_csv('transformed/train_order_features_sample.csv')
train_order_features_df.shape

(138462, 9)

In [53]:
# compare sample reordered ratio
train_order_features_df.reordered.value_counts(normalize=True)

1    0.598
0    0.402
Name: reordered, dtype: float64