In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# NOTE(review): no category is given, so this hides every warning, including pandas deprecation warnings.
warnings.filterwarnings('ignore') # Suppress unnecessary warnings for readability and cleaner presentation

from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Read the three CSV files from the data directory, in the order they are named.
datadir = '/data/Instacart/'

orders, prior, train = (
    pd.read_csv(datadir + csv_name)
    for csv_name in ('orders.csv',
                     'order_products__prior.csv',
                     'order_products__train.csv')
)

# Evaluation: Mean F1 score



$$F_1=2 \cdot\frac{\mathbf{precision} \cdot \mathbf{recall}}{\mathbf{precision} + \mathbf{recall}}$$

In [3]:
# Reshape the train rows into the submission layout: one row per order_id
# holding the list of product ids in that order.
train_grouped = (
    train
    .groupby('order_id')['product_id']
    .apply(list)
    .reset_index()
)
del train  # the row-level frame is released; the cells shown below only use train_grouped
train_grouped.head()

Unnamed: 0,order_id,product_id
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


In [4]:
# Learn the product-id label space from the train baskets (fit returns the
# binarizer itself), then encode the same baskets as a sparse indicator matrix.
mlb = MultiLabelBinarizer(sparse_output=True).fit(train_grouped['product_id'])
y_true = mlb.transform(train_grouped['product_id'])

In [5]:
# Sanity check: score the labels against themselves under each averaging mode.
[f1_score(y_true, y_true, average=mode)
 for mode in ('micro', 'macro', 'weighted', 'samples')]

[1.0, 1.0, 1.0, 1.0]

# TryOut1: Completely repeat the last order

In [6]:
# One list of product ids per prior order.
prior_grouped = (
    prior
    .groupby('order_id')['product_id']
    .apply(list)
    .reset_index()
)

# Lookup table keyed by (user_id, prior_order) giving prior_order_id: each order's
# own order_number is exposed under the name 'prior_order' so that it can be matched
# against "order_number - 1" of the following order.
# rename() returns a new frame here. Renaming the column slice in place, as before,
# can trigger SettingWithCopyWarning, which the global warnings filter hides.
prior_orders = orders[['order_id', 'user_id', 'order_number']].rename(
    columns={'order_number': 'prior_order', 'order_id': 'prior_order_id'}
)

# Attach to every order the id of the same user's previous order.
# Columns left over from an earlier execution are dropped first so the cell can be
# re-run safely; without that, a second run would yield prior_order_id_x /
# prior_order_id_y and break the later cells that read prior_order_id.
# A user's first order has no predecessor, so its prior_order_id is NaN.
orders = (
    orders
    .drop(['prior_order', 'prior_order_id'], axis=1, errors='ignore')
    .assign(prior_order=lambda frame: frame['order_number'] - 1)
    .merge(prior_orders, on=['user_id', 'prior_order'], how='left')
)
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,prior_order,prior_order_id
0,2539329,1,prior,1,2,8,,0,
1,2398795,1,prior,2,3,7,15.0,1,2539329.0
2,473747,1,prior,3,3,12,21.0,2,2398795.0
3,2254736,1,prior,4,4,7,29.0,3,473747.0
4,431534,1,prior,5,4,15,28.0,4,2254736.0


In [7]:
# Build the evaluation frame for "repeat the last order":
#   product_id_x = products actually bought in the train order (ground truth)
#   product_id_y = products of the same user's previous order (prediction)
# .loc replaces .ix, which is deprecated since pandas 0.20 and removed in 1.0;
# with a boolean row mask and column labels the two select the same data.
selected = (
    orders.loc[orders['eval_set'] == 'train', ['order_id', 'prior_order_id']]
    .merge(train_grouped, on='order_id', how='left')
    .merge(prior_grouped, left_on='prior_order_id', right_on='order_id', how='left')
)
selected.head()

Unnamed: 0,order_id_x,prior_order_id,product_id_x,order_id_y,product_id_y
0,1187899,2550362.0,"[196, 25133, 38928, 26405, 39657, 10258, 13032...",2550362,"[196, 46149, 39657, 38928, 25133, 10258, 35951..."
1,1492625,839880.0,"[22963, 7963, 16589, 32792, 41787, 22825, 1364...",839880,"[24852, 16589, 1559, 19156, 18523, 22825, 2741..."
2,2196797,157374.0,"[15349, 19057, 16185, 21413, 20843, 20114, 482...",157374,"[27344, 24535, 43693, 40706, 16168, 21413, 139..."
3,525192,2452257.0,"[12053, 47272, 37999, 13198, 43967, 40852, 176...",2452257,"[47272, 29993, 31683, 27690, 9598, 13198, 3039..."
4,880375,2570360.0,"[15937, 5539, 10960, 23165, 22247, 4853, 27104...",2570360,"[23165, 2078, 4799, 33640, 10644, 41540, 15143..."


In [8]:
# Refit on the row-wise concatenation of predicted and actual product lists so
# that both encodings below share one label space.
mlb.fit(selected['product_id_y'] + selected['product_id_x'])
y_true, y_pred = (
    mlb.transform(selected[column])
    for column in ('product_id_x', 'product_id_y')
)

In [9]:
# F1 of the prediction against the ground truth under each averaging mode.
[f1_score(y_true, y_pred, average=mode)
 for mode in ('micro', 'macro', 'weighted', 'samples')]

[0.27655730325611744,
 0.16340051280800727,
 0.27669414866023379,
 0.26162917497110255]

##### (0.3276746 Public LB Score)

# TryOut2: Order reordered products in last order

In [10]:
# Keep only the prior rows flagged reordered == 1, then collect one list of
# product ids per order. Orders with no such row do not appear in the result.
# .loc replaces .ix, which is deprecated since pandas 0.20 and removed in 1.0.
prior_grouped = (
    prior.loc[prior['reordered'] == 1]
    .groupby('order_id')['product_id']
    .apply(list)
    .reset_index()
)

In [11]:
# Build the evaluation frame for "repeat the reordered products of the last order":
#   product_id_x = products actually bought in the train order (ground truth)
#   product_id_y = products from the previous order as listed in prior_grouped (prediction)
# .loc replaces .ix, which is deprecated since pandas 0.20 and removed in 1.0.
# NOTE(review): the inner join drops every train order whose previous order has no
# row in prior_grouped, so the score is computed on fewer orders than a left join
# would give -- confirm this is intended before comparing scores across try-outs.
selected = (
    orders.loc[orders['eval_set'] == 'train', ['order_id', 'prior_order_id']]
    .merge(train_grouped, on='order_id', how='left')
    .merge(prior_grouped, left_on='prior_order_id', right_on='order_id', how='inner')
)
selected.head()

Unnamed: 0,order_id_x,prior_order_id,product_id_x,order_id_y,product_id_y
0,1187899,2550362.0,"[196, 25133, 38928, 26405, 39657, 10258, 13032...",2550362,"[196, 46149, 25133, 10258, 13032, 12427]"
1,1492625,839880.0,"[22963, 7963, 16589, 32792, 41787, 22825, 1364...",839880,"[24852, 16589, 1559, 19156, 18523, 33754, 2170..."
2,2196797,157374.0,"[15349, 19057, 16185, 21413, 20843, 20114, 482...",157374,"[24535, 43693, 40706, 21413, 13988, 8518, 2660..."
3,525192,2452257.0,"[12053, 47272, 37999, 13198, 43967, 40852, 176...",2452257,"[47272, 29993, 31683, 27690, 9598, 13198, 3039..."
4,880375,2570360.0,"[15937, 5539, 10960, 23165, 22247, 4853, 27104...",2570360,"[23165, 2078, 34358, 17794, 18531]"


In [12]:
# Refit on the row-wise concatenation of predicted and actual product lists so
# that both encodings below share one label space.
mlb.fit(selected['product_id_y'] + selected['product_id_x'])
y_true, y_pred = (
    mlb.transform(selected[column])
    for column in ('product_id_x', 'product_id_y')
)

In [13]:
# F1 of the prediction against the ground truth under each averaging mode.
[f1_score(y_true, y_pred, average=mode)
 for mode in ('micro', 'macro', 'weighted', 'samples')]

[0.28609177966505295,
 0.1637014192638124,
 0.27389268327012739,
 0.26612619160327722]

##### (0.3276826 Public LB Score)

In [14]:
# Release the per-order product lists; none of the cells shown after this point read them.
del prior_grouped

# TryOut3: Order all the products ordered before

In [15]:
# Tag every prior order line with the user_id of the order it belongs to.
prior_user = prior.merge(orders[['order_id', 'user_id']], how='left', on='order_id')
del prior  # release the untagged frame now that prior_user exists

In [16]:
# For each user, the distinct product ids found across all rows of prior_user.
user_grouped = (
    prior_user
    .groupby('user_id')['product_id']
    .apply(lambda ids: list(set(ids)))
    .reset_index()
)
user_grouped.head()

Unnamed: 0,user_id,product_id
0,1,"[17122, 196, 26405, 46149, 14084, 13032, 26088..."
1,2,"[45066, 2573, 18961, 23, 32792, 1559, 22559, 1..."
2,3,"[17668, 44683, 48523, 21903, 14992, 21137, 324..."
3,4,"[21573, 42329, 17769, 35469, 37646, 1200, 1905..."
4,5,"[11777, 40706, 28289, 48775, 20754, 6808, 1398..."


In [17]:
# Build the evaluation frame for "order everything the user ordered before":
#   product_id_x = products actually bought in the train order (ground truth)
#   product_id_y = distinct products from the user's prior orders (prediction)
# .loc replaces .ix, which is deprecated since pandas 0.20 and removed in 1.0.
selected = (
    orders.loc[orders['eval_set'] == 'train', ['order_id', 'user_id']]
    .merge(train_grouped, on='order_id', how='left')
    .merge(user_grouped, on='user_id', how='left')
)
selected.head()

Unnamed: 0,order_id,user_id,product_id_x,product_id_y
0,1187899,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032...","[17122, 196, 26405, 46149, 14084, 13032, 26088..."
1,1492625,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364...","[45066, 2573, 18961, 23, 32792, 1559, 22559, 1..."
2,2196797,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482...","[11777, 40706, 28289, 48775, 20754, 6808, 1398..."
3,525192,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176...","[11520, 35333, 519, 10504, 47623, 45066, 13198..."
4,880375,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104...","[11136, 8193, 17794, 26882, 39812, 24838, 651,..."


In [18]:
# Refit on the row-wise concatenation of predicted and actual product lists so
# that both encodings below share one label space.
mlb.fit(selected['product_id_y'] + selected['product_id_x'])
y_true, y_pred = (
    mlb.transform(selected[column])
    for column in ('product_id_x', 'product_id_y')
)

In [19]:
# F1 of the prediction against the ground truth under each averaging mode.
[f1_score(y_true, y_pred, average=mode)
 for mode in ('micro', 'macro', 'weighted', 'samples')]

[0.16813076981904759,
 0.099262958027494116,
 0.19806153072343349,
 0.19748902517893549]

# TryOut4: Order all the products reordered before

In [20]:
# For each user, the distinct product ids among the prior rows flagged reordered == 1.
# Users with no such row do not appear in the result.
# .loc replaces .ix, which is deprecated since pandas 0.20 and removed in 1.0.
user_grouped = (
    prior_user.loc[prior_user['reordered'] == 1]
    .groupby('user_id')['product_id']
    .apply(lambda ids: list(set(ids)))
    .reset_index()
)
user_grouped.head()

Unnamed: 0,user_id,product_id
0,1,"[196, 26405, 46149, 13032, 26088, 12427, 25133..."
1,2,"[34688, 21376, 40198, 45066, 32139, 2573, 2485..."
2,3,"[23650, 17668, 16965, 18599, 24810, 24010, 938..."
3,4,[35469]
4,5,"[11777, 40706, 13988, 21413, 8518, 26604, 4369..."


In [21]:
# Build the evaluation frame for "order everything the user reordered before":
#   product_id_x = products actually bought in the train order (ground truth)
#   product_id_y = the user's product list from user_grouped (prediction)
# .loc replaces .ix, which is deprecated since pandas 0.20 and removed in 1.0.
# NOTE(review): the inner join drops every train order whose user has no row in
# user_grouped, so the score is computed on fewer orders than a left join would
# give -- confirm this is intended before comparing scores across try-outs.
selected = (
    orders.loc[orders['eval_set'] == 'train', ['order_id', 'user_id']]
    .merge(train_grouped, on='order_id', how='left')
    .merge(user_grouped, on='user_id', how='inner')
)
selected.head()

Unnamed: 0,order_id,user_id,product_id_x,product_id_y
0,1187899,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032...","[196, 26405, 46149, 13032, 26088, 12427, 25133..."
1,1492625,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364...","[34688, 21376, 40198, 45066, 32139, 2573, 2485..."
2,2196797,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482...","[11777, 40706, 13988, 21413, 8518, 26604, 4369..."
3,525192,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176...","[35333, 519, 45066, 13198, 10895, 21137, 40852..."
4,880375,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104...","[17794, 18531, 9839, 14992, 21903, 34358, 2898..."


In [22]:
# Refit on the row-wise concatenation of predicted and actual product lists so
# that both encodings below share one label space.
mlb.fit(selected['product_id_y'] + selected['product_id_x'])
y_true, y_pred = (
    mlb.transform(selected[column])
    for column in ('product_id_x', 'product_id_y')
)

In [23]:
# F1 of the prediction against the ground truth under each averaging mode.
[f1_score(y_true, y_pred, average=mode)
 for mode in ('micro', 'macro', 'weighted', 'samples')]

[0.24750974597803274,
 0.14760066954993412,
 0.25314330808333446,
 0.25462647977058134]

# Baseline is "TryOut2: Order reordered products in last order"