In [2]:
# Importing libraries
import pandas as pd
from os import listdir

In [3]:
# Stems (file name without the extension) of the CSV files in ../data.
# Only '.csv' entries are kept so stray files (hidden files, archives, ...) do
# not turn into bogus '<stem>.csv' paths, and only the final extension is
# stripped so a name containing extra dots keeps its full stem.
# The order is whatever listdir returns, which the os docs describe as
# arbitrary; consumers should look tables up by name rather than by position.
files = [file.rsplit('.', 1)[0] for file in listdir("../data") if file.endswith('.csv')]

In [4]:
# Source text of one pd.read_csv(...) call per discovered file, joined with
# commas so that the whole string reads as a tuple expression.
read_calls = [f"pd.read_csv('../data/{file}.csv')" for file in files]
eval_expr = ', '.join(read_calls)

In [5]:
# Load every table by NAME instead of eval()-ing generated source and unpacking
# by position: the directory listing order is arbitrary, so positional
# unpacking can silently bind a table to the wrong variable, and eval() on text
# built from file names would execute whatever a crafted file name contains.
table_names = (
    'products',
    'orders',
    'departments',
    'order_products_train',
    'aisles',
    'order_products_prior',
    'sample_submission',
)

# Map normalised name -> stem found on disk.
# NOTE(review): some copies of this dataset appear to spell the files with a
# double underscore (e.g. 'order_products__train'); collapsing '__' to '_'
# lets either spelling resolve -- confirm against the actual ../data contents.
stems_by_name = {file.replace('__', '_'): file for file in files}

missing_tables = [name for name in table_names if name not in stems_by_name]
if missing_tables:
    # Fail loudly before reading anything rather than mis-assigning tables.
    raise FileNotFoundError(f"Missing CSV file(s) in ../data: {missing_tables}")

products, orders, departments, order_products_train, aisles, order_products_prior, sample_submission = (
    pd.read_csv(f'../data/{stems_by_name[name]}.csv') for name in table_names
)

In [6]:
# Shaping the data
# Attach every prior line item to its order record. The inner join keeps only
# order_ids that are present in both frames.

prior_orders = orders.merge(order_products_prior, how='inner', on='order_id')
prior_orders.head()


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,0
1,2539329,1,prior,1,2,8,,14084,2,0
2,2539329,1,prior,1,2,8,,12427,3,0
3,2539329,1,prior,1,2,8,,26088,4,0
4,2539329,1,prior,1,2,8,,26405,5,0


In [7]:
# Creating predictors

# Candidate predictor categories:
# 1. User predictors: how often does a user reorder?
# 2. Product predictors: how often is a product reordered?
# 3. User-product predictors: how often does a user buy a specific product?

In [8]:
# Per-user 'total_orders': the largest order_number among that user's rows
# in prior_orders.
total_orders_feature = (
    prior_orders
    .groupby('user_id')['order_number']
    .max()
    .rename('total_orders')
    .reset_index()
)

In [9]:
# Per-user 'reorder_ratio': mean of the 'reordered' column over all of that
# user's rows in prior_orders.
user_reorder_ratio = (
    prior_orders
    .groupby('user_id')
    .agg(reorder_ratio=('reordered', 'mean'))
    .reset_index()
)

In [10]:
# Combine both per-user features into a single frame keyed by 'user_id'
user_features = pd.merge(total_orders_feature, user_reorder_ratio, how='inner', on='user_id')

In [11]:
# Product features
# 'total_product_purchases': number of rows (non-null order_id) per product
# in prior_orders.

total_product_purchases = (
    prior_orders
    .groupby('product_id')['order_id']
    .count()
    .rename('total_product_purchases')
    .reset_index()
)

In [12]:
# Per-product 'product_reorder_ratio': mean of the 'reordered' column over all
# rows of that product in prior_orders.
product_reorder_ratio = (
    prior_orders
    .groupby('product_id')['reordered']
    .mean()
    .rename('product_reorder_ratio')
    .reset_index()
)

In [13]:
# Keep only the products whose reorder ratio is strictly greater than 0.5.
product_reorder_ratio_filtered = product_reorder_ratio[
    product_reorder_ratio['product_reorder_ratio'].gt(0.5)
]

In [14]:
# Join product features
# The left join keeps every product from total_product_purchases; products
# missing from the filtered ratio frame come back as NaN, which fillna(0)
# replaces with 0.
product_features = pd.merge(
    total_product_purchases,
    product_reorder_ratio_filtered,
    how='left',
    on='product_id',
).fillna(0)

In [15]:
# User-product predictors
# 'total_product_buys': number of rows (non-null order_id) per
# (user_id, product_id) pair in prior_orders.
total_product_buys = (
    prior_orders
    .groupby(['user_id', 'product_id'])['order_id']
    .count()
    .rename('total_product_buys')
    .reset_index()
)

In [16]:
# Attach the per-user features to every (user_id, product_id) row; the left
# join keeps all rows of total_product_buys.
features_user_product = pd.merge(total_product_buys, user_features, how='left', on='user_id')

In [17]:
# Combining features
# Rebuild the combined frame from its upstream inputs instead of merging
# features_user_product into itself: a self-referential merge is not
# idempotent, so re-running the cell would join product_features a second time
# and leave suffixed duplicate columns (total_product_purchases_x / _y, ...).
# On a first run the result is the same as the user join followed by the
# product join.

features_user_product = (
    total_product_buys
    .merge(user_features, on='user_id', how='left')
    .merge(product_features, on='product_id', how='inner')
)

In [18]:
# Preview the first ten rows of the combined feature frame.
features_user_product.head(n=10)

Unnamed: 0,user_id,product_id,total_product_buys,total_orders,reorder_ratio,total_product_purchases,product_reorder_ratio
0,1,196,10,10,0.694915,35791,0.77648
1,15,196,5,22,0.819444,35791,0.77648
2,19,196,3,9,0.348039,35791,0.77648
3,21,196,1,33,0.502439,35791,0.77648
4,31,196,2,20,0.364548,35791,0.77648
5,43,196,2,11,0.401361,35791,0.77648
6,52,196,14,27,0.698225,35791,0.77648
7,67,196,19,24,0.716049,35791,0.77648
8,81,196,2,7,0.259259,35791,0.77648
9,82,196,9,19,0.793893,35791,0.77648


In [19]:
# Orders whose eval_set is 'train' or 'test', reduced to the three columns
# needed to attach them to the feature frame.
orders_future = orders.loc[
    orders['eval_set'].isin(['train', 'test']),
    ['user_id', 'eval_set', 'order_id'],
]
orders_future.head(10)

Unnamed: 0,user_id,eval_set,order_id
10,1,train,1187899
25,2,train,1492625
38,3,test,2774568
44,4,test,329954
49,5,train,2196797
53,6,test,1528013
74,7,train,525192
78,8,train,880375
82,9,train,1094988
88,10,train,1822501


In [20]:
# Tag every feature row with its user's train/test order; the left join keeps
# all rows of features_user_product.
data_prep = pd.merge(features_user_product, orders_future, how='left', on='user_id')


In [21]:
# Curating training dataframe: rows whose eval_set is 'train'
train_data = data_prep.loc[data_prep['eval_set'].eq('train')]

In [22]:
# Preview the first five training rows (head() defaults to n=5).
train_data.head()

Unnamed: 0,user_id,product_id,total_product_buys,total_orders,reorder_ratio,total_product_purchases,product_reorder_ratio,eval_set,order_id
0,1,196,10,10,0.694915,35791,0.77648,train,1187899
3,21,196,1,33,0.502439,35791,0.77648,train,1854765
5,43,196,2,11,0.401361,35791,0.77648,train,1864787
6,52,196,14,27,0.698225,35791,0.77648,train,1647290
7,67,196,19,24,0.716049,35791,0.77648,train,2757217


In [23]:
# Attach the 'reordered' label from order_products_train to the training rows.
# The frame is rebuilt from data_prep rather than merged into itself: a
# self-referential merge is not idempotent, so re-running the cell would yield
# 'reordered_x' / 'reordered_y' columns and break any later access to
# train_data['reordered'].
# The left join keeps every training row; (product_id, order_id) pairs absent
# from order_products_train get NaN in 'reordered'.
train_data = (
    data_prep[data_prep['eval_set'] == 'train']
    .merge(
        order_products_train[['product_id', 'order_id', 'reordered']],
        on=['product_id', 'order_id'],
        how='left',
    )
)

In [25]:
# Rows with no value in 'reordered' get 0; existing values are left untouched.
train_data.loc[train_data['reordered'].isna(), 'reordered'] = 0