In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# read the file
df=pd.read_csv('df.csv')

As we had seen in EDA, order number of customers ranged from 1 - 100. Due to limited computational capacity, I will only use only the last 5 order numbers and exclude few features to predict reordered since they are recent purchases. 

In [3]:
# create a column that contained the reverse of order number
df['rev_order_num'] = df.groupby('user_id').order_number.transform(np.max) - df.order_number
df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,rev_order_num
0,2539329,1,prior,1,2,8,,196.0,1.0,0.0,Soda,77.0,7.0,soft drinks,beverages,10
1,2539329,1,prior,1,2,8,,14084.0,2.0,0.0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,soy lactosefree,dairy eggs,10
2,2539329,1,prior,1,2,8,,12427.0,3.0,0.0,Original Beef Jerky,23.0,19.0,popcorn jerky,snacks,10
3,2539329,1,prior,1,2,8,,26088.0,4.0,0.0,Aged White Cheddar Popcorn,23.0,19.0,popcorn jerky,snacks,10
4,2539329,1,prior,1,2,8,,26405.0,5.0,0.0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,paper goods,household,10


In [4]:
# create a new df containing only the last 5 order number
df_new = df[df['rev_order_num'] < 5]    #using 5 because the rev_order_num started from 0 instead of 1
#check the shape
df_new.shape

(10193312, 16)

### User Feature

We want to know what is the pattern of the user, based on: 
* total number of orders
* average amount per order 
* average days between orders
* probability of items that the users bought in each departments

#### User Features: total order

In [5]:
user = df_new.groupby('user_id')['order_number'].max().reset_index(name='u_total_order')
user.head()

Unnamed: 0,user_id,u_total_order
0,1,11
1,2,15
2,3,12
3,4,5
4,5,5


#### User Features: average amount per order

In [6]:
# how many did the user order at one time?
u_order_size = df_new.groupby(['user_id', 'order_id']).size().reset_index()
# rename the columns
u_order_size.columns = ['user_id', 'order_id', 'order_amt']
u_order_size.head()

Unnamed: 0,user_id,order_id,order_amt
0,1,550135,5
1,1,1187899,11
2,1,2295261,6
3,1,2550362,9
4,1,3108588,6


In [7]:
# average user order 
u_order_size = u_order_size.groupby('user_id').order_amt.mean().round(0).reset_index()
u_order_size.columns = ['user_id', 'avg_u_order']
u_order_size.head()

Unnamed: 0,user_id,avg_u_order
0,1,7.0
1,2,18.0
2,3,6.0
3,4,4.0
4,5,9.0


#### User Features: Average days between orders

In [8]:
# to make the average days between order
u_days = df_new.groupby('user_id')['days_since_prior_order'].mean().round(0).reset_index()
u_days.columns = ['user_id', 'u_avg_days']
u_days.head()

Unnamed: 0,user_id,u_avg_days
0,1,16.0
1,2,27.0
2,3,11.0
3,4,15.0
4,5,12.0


In [9]:
# what day did the user bought?
u_dow = df_new.groupby(['user_id','order_dow']).size().reset_index()
u_dow.columns = ['user_id', 'order_dow', 'user_ct_day']
u_dow.head()

Unnamed: 0,user_id,order_dow,user_ct_day
0,1,1,17
1,1,4,20
2,2,1,65
3,2,3,16
4,2,4,9


In [10]:
# ratio
u_dow['u_dow_ratio'] = u_dow['user_ct_day']/ u_dow.groupby('user_id').user_ct_day.transform('sum')
u_dow.head()

Unnamed: 0,user_id,order_dow,user_ct_day,u_dow_ratio
0,1,1,17,0.459459
1,1,4,20,0.540541
2,2,1,65,0.722222
3,2,3,16,0.177778
4,2,4,9,0.1


In [11]:
# make a table for each user and dow ratio
u_dow = u_dow.pivot(index ='user_id', columns = 'order_dow', values = 'u_dow_ratio').add_prefix('u_dow_')
u_dow = u_dow.fillna(0)
u_dow.head()

order_dow,u_dow_0,u_dow_1,u_dow_2,u_dow_3,u_dow_4,u_dow_5,u_dow_6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.459459,0.0,0.0,0.540541,0.0,0.0
2,0.0,0.722222,0.0,0.177778,0.1,0.0,0.0
3,0.6,0.2,0.0,0.2,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.5,0.277778,0.222222
5,0.391304,0.26087,0.0,0.347826,0.0,0.0,0.0


#### User Features: probability of items bought per departments

In [12]:
u_dep_prob = df_new.groupby('user_id')['department'].value_counts(normalize=True).reset_index(name='d_prob')
u_dep_prob.head()

Unnamed: 0,user_id,department,d_prob
0,1,dairy eggs,0.324324
1,1,snacks,0.324324
2,1,beverages,0.243243
3,1,breakfast,0.081081
4,1,household,0.027027


In [13]:
u_dep_prob = pd.pivot(data=u_dep_prob, index='user_id', columns='department', values='d_prob').add_prefix('u_d_')
u_dep_prob = u_dep_prob.fillna(0)
u_dep_prob.head()

department,u_d_alcohol,u_d_babies,u_d_bakery,u_d_beverages,u_d_breakfast,u_d_bulk,u_d_canned goods,u_d_dairy eggs,u_d_deli,u_d_dry goods pasta,...,u_d_household,u_d_international,u_d_meat seafood,u_d_missing,u_d_other,u_d_pantry,u_d_personal care,u_d_pets,u_d_produce,u_d_snacks
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.243243,0.081081,0.0,0.0,0.324324,0.0,0.0,...,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324
2,0.0,0.0,0.022222,0.044444,0.0,0.0,0.044444,0.2,0.088889,0.0,...,0.0,0.022222,0.0,0.0,0.0,0.055556,0.011111,0.0,0.177778,0.188889
3,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.266667,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.4,0.066667
4,0.111111,0.0,0.111111,0.166667,0.0,0.0,0.055556,0.0,0.111111,0.0,...,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.055556
5,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,0.195652,0.021739,0.021739,...,0.021739,0.086957,0.0,0.0,0.0,0.065217,0.0,0.0,0.5,0.021739


Merging u_features

In [64]:
u_features = pd.merge(user, u_order_size, on='user_id', how='left')
u_features = pd.merge(u_features, u_days, on='user_id', how='left')
u_features = pd.merge(u_features, u_dow, on='user_id', how='left')
u_features = pd.merge(u_features, u_dep_prob, on='user_id', how='left')
u_features.head()

Unnamed: 0,user_id,u_total_order,avg_u_order,u_avg_days,u_dow_0,u_dow_1,u_dow_2,u_dow_3,u_dow_4,u_dow_5,...,u_d_household,u_d_international,u_d_meat seafood,u_d_missing,u_d_other,u_d_pantry,u_d_personal care,u_d_pets,u_d_produce,u_d_snacks
0,1,11,7.0,16.0,0.0,0.459459,0.0,0.0,0.540541,0.0,...,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324
1,2,15,18.0,27.0,0.0,0.722222,0.0,0.177778,0.1,0.0,...,0.0,0.022222,0.0,0.0,0.0,0.055556,0.011111,0.0,0.177778,0.188889
2,3,12,6.0,11.0,0.6,0.2,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.4,0.066667
3,4,5,4.0,15.0,0.0,0.0,0.0,0.0,0.5,0.277778,...,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.055556
4,5,5,9.0,12.0,0.391304,0.26087,0.0,0.347826,0.0,0.0,...,0.021739,0.086957,0.0,0.0,0.0,0.065217,0.0,0.0,0.5,0.021739


In [77]:
u_features.shape

(206209, 32)

In [65]:
# save u_features
u_features.to_csv('u_features.csv', index=False)

### Item Features

To understand the item characteristics, we are going to calculate:
* total item bought
* item reordered ratio
* item day of week ratio

#### Item features: Item size

In [14]:
# total item bought
i_size = df_new.groupby('product_id').size().reset_index(name='i_total_bought')
i_size.head()

Unnamed: 0,product_id,i_total_bought
0,1.0,542
1,2.0,30
2,3.0,49
3,4.0,158
4,5.0,4


#### Item feature: item reordered ratio

In [15]:
# sum of reordered per product_id
i_reordered = df_new.groupby('product_id').reordered.sum().reset_index(name='reordered_sum')
i_reordered.head()

Unnamed: 0,product_id,reordered_sum
0,1.0,300.0
1,2.0,3.0
2,3.0,34.0
3,4.0,76.0
4,5.0,2.0


In [16]:
# item reordered ratio
i_reordered_ratio = pd.merge(i_size, i_reordered, on='product_id', how='left')
i_reordered_ratio.head()

Unnamed: 0,product_id,i_total_bought,reordered_sum
0,1.0,542,300.0
1,2.0,30,3.0
2,3.0,49,34.0
3,4.0,158,76.0
4,5.0,4,2.0


In [17]:
# make ratio between reordered_sum and i_total_bought
i_reordered_ratio['i_reordered_ratio'] = i_reordered_ratio['reordered_sum'] / i_reordered_ratio['i_total_bought'] 
i_reordered_ratio.head()

Unnamed: 0,product_id,i_total_bought,reordered_sum,i_reordered_ratio
0,1.0,542,300.0,0.553506
1,2.0,30,3.0,0.1
2,3.0,49,34.0,0.693878
3,4.0,158,76.0,0.481013
4,5.0,4,2.0,0.5


#### Item features: item dow ratio

In [18]:
# how many did an item bought in each day in avg?
i_dow = df_new.groupby(['product_id','order_dow']).size().reset_index()
i_dow.columns = ['product_id', 'order_dow', 'item_ct_day']
i_dow.head()

Unnamed: 0,product_id,order_dow,item_ct_day
0,1.0,0,59
1,1.0,1,109
2,1.0,2,88
3,1.0,3,75
4,1.0,4,82


In [19]:
# ratio 
i_dow['item_dow_ratio'] = i_dow.item_ct_day / i_dow.groupby('product_id').item_ct_day.transform(np.sum)
i_dow.head()

Unnamed: 0,product_id,order_dow,item_ct_day,item_dow_ratio
0,1.0,0,59,0.108856
1,1.0,1,109,0.201107
2,1.0,2,88,0.162362
3,1.0,3,75,0.138376
4,1.0,4,82,0.151292


In [20]:
i_dow = i_dow.pivot(index ='product_id', columns = 'order_dow', values = 'item_dow_ratio').add_prefix('i_dow_')
i_dow = i_dow.fillna(0)
i_dow.head()

order_dow,i_dow_0,i_dow_1,i_dow_2,i_dow_3,i_dow_4,i_dow_5,i_dow_6
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,0.108856,0.201107,0.162362,0.138376,0.151292,0.154982,0.083026
2.0,0.2,0.166667,0.066667,0.166667,0.1,0.066667,0.233333
3.0,0.204082,0.081633,0.122449,0.102041,0.081633,0.122449,0.285714
4.0,0.208861,0.151899,0.164557,0.120253,0.094937,0.094937,0.164557
5.0,0.0,0.0,0.0,0.0,0.25,0.5,0.25


In [59]:
products = pd.read_csv('products.csv')
dept = pd.read_csv('departments.csv')
prod_dep = pd.merge(products, dept, on='department_id', how='left')

In [60]:
prod_dep

Unnamed: 0,product_id,product_name,aisle_id,department_id,department
0,1,Chocolate Sandwich Cookies,61,19,snacks
1,2,All-Seasons Salt,104,13,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen
4,5,Green Chile Anytime Sauce,5,13,pantry
...,...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,alcohol
49684,49685,En Croute Roast Hazelnut Cranberry,42,1,frozen
49685,49686,Artisan Baguette,112,3,bakery
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,pets


In [61]:
i_dep = prod_dep[['product_id', 'department']]
i_dep = pd.get_dummies(i_dep, columns=['department'])
i_dep.head()

Unnamed: 0,product_id,department_alcohol,department_babies,department_bakery,department_beverages,department_breakfast,department_bulk,department_canned goods,department_dairy eggs,department_deli,...,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Merging item

In [62]:
i_features = pd.merge(i_reordered_ratio, i_dow, on='product_id', how='left')
i_features.head()

Unnamed: 0,product_id,i_total_bought,reordered_sum,i_reordered_ratio,i_dow_0,i_dow_1,i_dow_2,i_dow_3,i_dow_4,i_dow_5,i_dow_6
0,1.0,542,300.0,0.553506,0.108856,0.201107,0.162362,0.138376,0.151292,0.154982,0.083026
1,2.0,30,3.0,0.1,0.2,0.166667,0.066667,0.166667,0.1,0.066667,0.233333
2,3.0,49,34.0,0.693878,0.204082,0.081633,0.122449,0.102041,0.081633,0.122449,0.285714
3,4.0,158,76.0,0.481013,0.208861,0.151899,0.164557,0.120253,0.094937,0.094937,0.164557
4,5.0,4,2.0,0.5,0.0,0.0,0.0,0.0,0.25,0.5,0.25


In [63]:
i_features = pd.merge(i_features, i_dep, on='product_id', how='left')
i_features.head()

Unnamed: 0,product_id,i_total_bought,reordered_sum,i_reordered_ratio,i_dow_0,i_dow_1,i_dow_2,i_dow_3,i_dow_4,i_dow_5,...,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks
0,1.0,542,300.0,0.553506,0.108856,0.201107,0.162362,0.138376,0.151292,0.154982,...,0,0,0,0,0,0,0,0,0,1
1,2.0,30,3.0,0.1,0.2,0.166667,0.066667,0.166667,0.1,0.066667,...,0,0,0,0,0,1,0,0,0,0
2,3.0,49,34.0,0.693878,0.204082,0.081633,0.122449,0.102041,0.081633,0.122449,...,0,0,0,0,0,0,0,0,0,0
3,4.0,158,76.0,0.481013,0.208861,0.151899,0.164557,0.120253,0.094937,0.094937,...,0,0,0,0,0,0,0,0,0,0
4,5.0,4,2.0,0.5,0.0,0.0,0.0,0.0,0.25,0.5,...,0,0,0,0,0,1,0,0,0,0


In [75]:
i_features.shape

(48867, 32)

In [79]:
# save i_features
i_features.to_csv('i_features.csv', index=False)

### User x Item features
* how many times a user bought a product
* the probability of item being bought 

#### User x Item Features: Product Probability Counts

In [22]:
# total product count per customer
u_i_prob = df_new.groupby('user_id')['product_id'].value_counts().reset_index(name='u_i_count')
u_i_prob.head()

Unnamed: 0,user_id,product_id,u_i_count
0,1,196.0,5
1,1,10258.0,5
2,1,25133.0,5
3,1,12427.0,4
4,1,46149.0,4


In [25]:
# total order (max of order number) per customer
order_sum = df_new.groupby('user_id')['rev_order_num'].max().reset_index(name='order_cnt')
# this order starts from 0-4, but we want to count from 1-5
order_sum['u_i_max_order']=order_sum['order_cnt'] + 1
order_sum.head()

Unnamed: 0,user_id,order_cnt,u_i_max_order
0,1,4,5
1,2,4,5
2,3,4,5
3,4,4,5
4,5,4,5


In [26]:
order_sum = order_sum.drop(columns ='order_cnt')
order_sum.head()

Unnamed: 0,user_id,u_i_max_order
0,1,5
1,2,5
2,3,5
3,4,5
4,5,5


In [27]:
# probability of user ordering an item per order
u_i_prob = pd.merge(u_i_prob, order_sum, on='user_id', how='left')
u_i_prob['i_prob_per_order'] = u_i_prob['u_i_count'] / u_i_prob['u_i_max_order']
u_i_prob.head()

Unnamed: 0,user_id,product_id,u_i_count,u_i_max_order,i_prob_per_order
0,1,196.0,5,5,1.0
1,1,10258.0,5,5,1.0
2,1,25133.0,5,5,1.0
3,1,12427.0,4,5,0.8
4,1,46149.0,4,5,0.8


#### User x Item Features: the ratio of reordered per order

In [29]:
# count the reordered per user id and product id
u_i_reordered = df_new.groupby(['user_id', 'product_id']).reordered.count().reset_index(name='u_i_reordered')
u_i_reordered.head()

Unnamed: 0,user_id,product_id,u_i_reordered
0,1,196.0,5
1,1,10258.0,5
2,1,12427.0,4
3,1,13032.0,3
4,1,25133.0,5


In [30]:
# dividing the reordered by total order number (or the max order number)
u_i_reordered = pd.merge(u_i_reordered, order_sum, on='user_id', how='left')
u_i_reordered['u_i_ratio_reordered'] = u_i_reordered['u_i_reordered'] / u_i_reordered['u_i_max_order']
u_i_reordered.head()

Unnamed: 0,user_id,product_id,u_i_reordered,u_i_max_order,u_i_ratio_reordered
0,1,196.0,5,5,1.0
1,1,10258.0,5,5,1.0
2,1,12427.0,4,5,0.8
3,1,13032.0,3,5,0.6
4,1,25133.0,5,5,1.0


In [70]:
u_i_reordered = u_i_reordered.drop(columns='u_i_max_order')
u_i_reordered.head()

Unnamed: 0,user_id,product_id,u_i_reordered,u_i_ratio_reordered
0,1,196.0,5,1.0
1,1,10258.0,5,1.0
2,1,12427.0,4,0.8
3,1,13032.0,3,0.6
4,1,25133.0,5,1.0


In [71]:
u_i_features = pd.merge(u_i_prob, u_i_reordered, on=['user_id', 'product_id'], how='left')
u_i_features.head()

Unnamed: 0,user_id,product_id,u_i_count,u_i_max_order,i_prob_per_order,u_i_reordered,u_i_ratio_reordered
0,1,196.0,5,5,1.0,5,1.0
1,1,10258.0,5,5,1.0,5,1.0
2,1,25133.0,5,5,1.0,5,1.0
3,1,12427.0,4,5,0.8,4,0.8
4,1,46149.0,4,5,0.8,4,0.8


In [76]:
u_i_features.shape

(6910976, 7)

In [78]:
# save u_i_features
u_i_features.to_csv('u_i_features.csv', index=False)

Merging all

In [72]:
user_df = pd.merge(u_i_features, u_features, on='user_id', how='left')
user_df.head()

Unnamed: 0,user_id,product_id,u_i_count,u_i_max_order,i_prob_per_order,u_i_reordered,u_i_ratio_reordered,u_total_order,avg_u_order,u_avg_days,...,u_d_household,u_d_international,u_d_meat seafood,u_d_missing,u_d_other,u_d_pantry,u_d_personal care,u_d_pets,u_d_produce,u_d_snacks
0,1,196.0,5,5,1.0,5,1.0,11,7.0,16.0,...,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324
1,1,10258.0,5,5,1.0,5,1.0,11,7.0,16.0,...,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324
2,1,25133.0,5,5,1.0,5,1.0,11,7.0,16.0,...,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324
3,1,12427.0,4,5,0.8,4,0.8,11,7.0,16.0,...,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324
4,1,46149.0,4,5,0.8,4,0.8,11,7.0,16.0,...,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324


In [73]:
df_used = pd.merge(user_df, i_features, on='product_id', how='left')
df_used.head()

Unnamed: 0,user_id,product_id,u_i_count,u_i_max_order,i_prob_per_order,u_i_reordered,u_i_ratio_reordered,u_total_order,avg_u_order,u_avg_days,...,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks
0,1,196.0,5,5,1.0,5,1.0,11,7.0,16.0,...,0,0,0,0,0,0,0,0,0,0
1,1,10258.0,5,5,1.0,5,1.0,11,7.0,16.0,...,0,0,0,0,0,0,0,0,0,1
2,1,25133.0,5,5,1.0,5,1.0,11,7.0,16.0,...,0,0,0,0,0,0,0,0,0,0
3,1,12427.0,4,5,0.8,4,0.8,11,7.0,16.0,...,0,0,0,0,0,0,0,0,0,1
4,1,46149.0,4,5,0.8,4,0.8,11,7.0,16.0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
df_used = pd.merge(df_new[['user_id', 'product_id', 'reordered']], df_used, on=('user_id', 'product_id'), how='left')
df_used.head()

Unnamed: 0,user_id,product_id,reordered,u_i_count,u_i_max_order,i_prob_per_order,u_i_reordered,u_i_ratio_reordered,u_total_order,avg_u_order,...,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks
0,1,196.0,1.0,5,5,1.0,5,1.0,11,7.0,...,0,0,0,0,0,0,0,0,0,0
1,1,10258.0,1.0,5,5,1.0,5,1.0,11,7.0,...,0,0,0,0,0,0,0,0,0,1
2,1,12427.0,1.0,4,5,0.8,4,0.8,11,7.0,...,0,0,0,0,0,0,0,0,0,1
3,1,25133.0,1.0,5,5,1.0,5,1.0,11,7.0,...,0,0,0,0,0,0,0,0,0,0
4,1,13032.0,1.0,3,5,0.6,3,0.6,11,7.0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
df_used.shape

(10193312, 70)

In [83]:
df_used.to_csv('df_used.csv', index=False)

In [84]:
df_used.head()

Unnamed: 0,user_id,product_id,reordered,u_i_count,u_i_max_order,i_prob_per_order,u_i_reordered,u_i_ratio_reordered,u_total_order,avg_u_order,...,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks
0,1,196.0,1.0,5,5,1.0,5,1.0,11,7.0,...,0,0,0,0,0,0,0,0,0,0
1,1,10258.0,1.0,5,5,1.0,5,1.0,11,7.0,...,0,0,0,0,0,0,0,0,0,1
2,1,12427.0,1.0,4,5,0.8,4,0.8,11,7.0,...,0,0,0,0,0,0,0,0,0,1
3,1,25133.0,1.0,5,5,1.0,5,1.0,11,7.0,...,0,0,0,0,0,0,0,0,0,0
4,1,13032.0,1.0,3,5,0.6,3,0.6,11,7.0,...,0,0,0,0,0,0,0,0,0,0
