In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The goal here is to predict which previously purchased products will be in a user’s next order.

# Overview

In [None]:
# All six input CSVs are read from the same dataset folder.
INPUT_DIR = '../input/instacart-market-basket-analysis'

aisles = pd.read_csv(f'{INPUT_DIR}/aisles.csv')
depts = pd.read_csv(f'{INPUT_DIR}/departments.csv')
orders_prior = pd.read_csv(f'{INPUT_DIR}/order_products__prior.csv')
orders_train = pd.read_csv(f'{INPUT_DIR}/order_products__train.csv')
orders = pd.read_csv(f'{INPUT_DIR}/orders.csv')
products = pd.read_csv(f'{INPUT_DIR}/products.csv')

Let's start by looking at each of the inputs: their contents and sizes.

In [None]:
# Peek at the last three rows of the aisles table.
aisles.tail(n=3)

In [None]:
# Peek at the last three rows of the departments table.
depts.tail(n=3)

In [None]:
# Peek at the last three rows of the prior order/product table.
orders_prior.tail(n=3)

In [None]:
# Peek at the last three rows of the train order/product table.
orders_train.tail(n=3)

In [None]:
# Peek at the last three rows of the orders table.
orders.tail(n=3)

In [None]:
# Peek at the last three rows of the products table.
products.tail(n=3)

# Orders

Orders is clearly the most important of the inputs and contains the most information to digest. It has 3421083 rows and 7 columns. Let's look at it in more detail.

In [None]:
# value_counts(sort=False) does not return the days in numeric order, so count
# first and then sort by the day-of-week label to get an ordered x-axis.
ax = orders['order_dow'].value_counts().sort_index().plot(kind='bar', figsize=(10,5), fontsize=12)
ax.set_title('Order count distribution by day of week',fontsize=16)
ax.set_xlabel('Day of week', fontsize=14)
ax.set_ylabel('No. of orders', fontsize=14)

It's not mentioned explicitly, but I assumed 0 to mean Sunday, 1 Monday and so on. I expected the number of orders to be highest on weekends, but judging from the plot above, I'm not sure whether orders really are highest on Sunday and Monday (since it is online ordering) or whether 0 refers to Saturday in this case. I have to check further on this.

In [None]:
# value_counts(sort=False) does not return the hours in numeric order, so count
# first and then sort by the hour label to get an ordered x-axis.
ax = orders['order_hour_of_day'].value_counts().sort_index().plot(kind='bar', figsize=(10,5), fontsize=12)
ax.set_title('Order count distribution by hour of day',fontsize=16)
ax.set_xlabel('Hour of day', fontsize=14)
ax.set_ylabel('No. of orders', fontsize=14)

This is an expected distribution. Most of the orders are in the daytime while ordering at night is still possible since Instacart is an online service.

In [None]:
# value_counts(sort=False) does not return the day gaps in numeric order, so
# count first and then sort by the gap value to get an ordered x-axis.
ax = orders['days_since_prior_order'].value_counts().sort_index().plot(kind='bar', figsize=(10,5), fontsize=12)
ax.set_title('Last order distribution',fontsize=16)
ax.set_xlabel('Days since prior order', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

We see that the most common gaps between a customer's consecutive orders are 7 and 30 days. A simple and possibly accurate way to interpret this distribution is to say that most people order groceries once a week or once a month in bulk.

The products contained in an order are present in orders_prior and orders_train. My understanding after reading https://tech.instacart.com/3-million-instacart-orders-open-sourced-d40d29ead6f2 is that since the goal is to predict the products in an order, the last order of a user is separated out from the previous orders. These previous orders are in orders_prior. orders_train is used as training data for the model. There are also some orders left out which can be used as a test set for predictions. This test set can be fetched from the main orders df since it contains the eval_set column.

In [None]:
# Number of orders in each eval_set.
orders.loc[:, 'eval_set'].value_counts()

In [None]:
# One row per distinct (user_id, eval_set) pair, then count the pairs per
# eval_set, i.e. the number of users that have an order in each set.
eval_set_user_counts = (
    orders[['user_id', 'eval_set']]
    .drop_duplicates()
    ['eval_set']
    .value_counts()
)
eval_set_user_counts

In [None]:
# Count of distinct user ids across all orders
orders['user_id'].nunique(dropna=False)

In [None]:
# Users with a train order plus users with a test order.
eval_set_user_counts.loc[['train', 'test']].sum()

In [None]:
# Orders tagged 'prior', and the sorted set of order numbers they cover.
prior_orders = orders[orders['eval_set'].eq('prior')]
np.sort(prior_orders['order_number'].unique())

In [None]:
# Split out the orders tagged 'train' and 'test' by the eval_set column.
train_set = orders[orders['eval_set'].eq('train')]
test_set = orders[orders['eval_set'].eq('test')]

In [None]:
# Sorted distinct order numbers that appear in the train set.
np.sort(train_set['order_number'].unique())

In [None]:
# Sorted distinct order numbers that appear in the test set.
np.sort(test_set['order_number'].unique())

From the above snippets, we see that the last order of each of the 206209 users has been assigned to either the train or the test eval_set.

In [None]:
# value_counts(sort=False) does not return the day gaps in numeric order, so
# count first and then sort by the gap value to get an ordered x-axis.
ax = orders['days_since_prior_order'].value_counts().sort_index().plot(kind='bar', figsize=(10,5), fontsize=12)
ax.set_title('Days since prior order distribution',fontsize=16)
ax.set_xlabel('Days since prior order', fontsize=14)
ax.set_ylabel('No. of orders', fontsize=14)

# Products
Now let's look at products

In [None]:
# (rows, columns) of the products table.
products.shape

In [None]:
# The largest add_to_cart_order within an order is used here as that order's
# product count; chart the (up to) 30 most frequent counts.
ax = (
    orders_prior
    .groupby('order_id')['add_to_cart_order']
    .max()
    .value_counts()
    .iloc[:30]
    .plot.bar(figsize=(16, 8), fontsize=10)
)
ax.set_title('Products per order', fontsize=16)
ax.set_xlabel('No. of products', fontsize=14)
ax.set_ylabel('No. of orders with that many products', fontsize=14)

In [None]:
# Join aisle and department rows onto each product, then drop the id keys
# that were only needed for the joins.
product_details = (
    products
    .merge(aisles, on=['aisle_id'], how='inner')
    .merge(depts, on=['department_id'], how='inner')
    .drop(columns=['aisle_id', 'department_id'])
)
product_details.tail()

In [None]:
# One row per product line of a prior order, enriched with the product's
# details; product_id is dropped once the join is done.
order_details = (
    orders_prior
    .merge(product_details, on=['product_id'], how='inner')
    .drop(columns=['product_id'])
)
order_details.tail()

Plot product categories

In [None]:
# Bar chart of the (up to) 30 most frequent departments in order_details.
ax = (
    order_details['department']
    .value_counts()
    .iloc[:30]
    .plot.bar(figsize=(16, 8), fontsize=10)
)
ax.set_title('Department popularities', fontsize=16)
ax.set_xlabel('Dept name', fontsize=14)
ax.set_ylabel('No. of orders from this dept', fontsize=14)

In [None]:
# Bar chart of the (up to) 30 most frequent aisles in order_details.
ax = (
    order_details['aisle']
    .value_counts()
    .iloc[:30]
    .plot.bar(figsize=(16, 8), fontsize=10)
)
ax.set_title('Aisle popularities', fontsize=16)
ax.set_xlabel('Aisle name', fontsize=14)
ax.set_ylabel('No. of orders from this aisle', fontsize=14)

In [None]:
# Bar chart of the (up to) 30 most frequent product names in order_details.
ax = (
    order_details['product_name']
    .value_counts()
    .iloc[:30]
    .plot.bar(figsize=(16, 8), fontsize=10)
)
ax.set_title('Products popularities', fontsize=16)
ax.set_xlabel('Product name', fontsize=14)
ax.set_ylabel('No. of times ordered', fontsize=14)

Reorders of particular products

In [None]:
# Keep only line items flagged as reorders (single .loc call instead of chained
# indexing), then chart the (up to) 30 product names that occur most often.
reordered_names = order_details.loc[order_details['reordered'] == 1, 'product_name']
ax = reordered_names.value_counts().head(30).plot(kind='bar', figsize=(16,8), fontsize=10)
ax.set_title('Most reordered products',fontsize=16)
ax.set_xlabel('Product', fontsize=14)
ax.set_ylabel('No. of times reordered', fontsize=14)

# Hypothesis Generation

Let's try to cluster our pool of customers with the data we have available. Note that these could very well be entirely off the mark.

In [None]:
# Add the order-level columns (user_id, eval_set, ...) to every line item.
# The merge overwrites order_details with itself, so running this cell a second
# time would merge `orders` in again and rename the overlapping columns with
# _x/_y suffixes, breaking later `order_details['user_id']` lookups. Skip the
# merge when the order columns are already present.
if 'user_id' not in order_details.columns:
    order_details = pd.merge(order_details, orders, on=['order_id'], how='inner')

In [None]:
# Distinct users that appear in the merged order details.
order_details['user_id'].nunique(dropna=False)

Let's try to validate some assumptions about customers using the given data. First let's look at departments. 

In [None]:
# Distinct department values, in order of first appearance.
order_details['department'].drop_duplicates().tolist()

Male and female count.

In [None]:
# Line items from the 'personal care' department, and the aisles they span.
personal_care_dept = order_details[order_details['department'].eq('personal care')]
personal_care_dept['aisle'].drop_duplicates().tolist()

In [None]:
# Possible lower bound for female customers:
# distinct users with at least one 'feminine care' aisle purchase
personal_care_dept.loc[
    personal_care_dept['aisle'].eq('feminine care'), 'user_id'
].nunique(dropna=False)

Customers with babies

In [None]:
# Optional: uncomment to list the distinct product names in the 'babies' department.
# order_details.loc[order_details['department'] == 'babies']['product_name'].unique().tolist()

In [None]:
# Distinct users with at least one purchase from the 'babies' department.
order_details.loc[
    order_details['department'].eq('babies'), 'user_id'
].nunique(dropna=False)

Age of customers

In [None]:
# Older customers (proxy): distinct users who bought from the
# 'muscles joints pain relief' aisle
personal_care_dept.loc[
    personal_care_dept['aisle'].eq('muscles joints pain relief'), 'user_id'
].nunique(dropna=False)

Customers with pets

In [None]:
# Distinct users with at least one purchase from the 'pets' department.
order_details.loc[
    order_details['department'].eq('pets'), 'user_id'
].nunique(dropna=False)

Health conscious customers

In [None]:
# Distinct users who bought from the 'protein meal replacements' aisle.
personal_care_dept.loc[
    personal_care_dept['aisle'].eq('protein meal replacements'), 'user_id'
].nunique(dropna=False)

Junk food eaters

# Reorder data

The information about previous reorders could be very useful for predicting the next order of a customer. So let's look at that more closely now.