In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import the jupyterthemes plotting helper and call its style() with default
# arguments before any figure is drawn.
# NOTE(review): presumably this restyles matplotlib output to match the active
# notebook theme -- confirm against the jupyterthemes documentation.
from jupyterthemes import jtplot
jtplot.style()

In [6]:
# Load the aisle lookup table and preview its first five rows.
df_aisles = pd.read_csv(filepath_or_buffer='aisles.csv')
df_aisles.head(n=5)

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [7]:
# Print the number of distinct aisle ids, then the distinct ids themselves
# (one value per output line, exactly as two separate print calls would).
print(
    df_aisles['aisle_id'].nunique(),
    df_aisles['aisle_id'].unique(),
    sep='\n',
)

134
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134]


##### There are 134 types of aisles with id number 1-134.

In [8]:
# Load the department lookup table and preview its first five rows.
df_depts = pd.read_csv(filepath_or_buffer='departments.csv')
df_depts.head(n=5)

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [9]:
# Print the number of distinct department ids, then the distinct ids themselves.
print(
    df_depts['department_id'].nunique(),
    df_depts['department_id'].unique(),
    sep='\n',
)

21
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]


##### There are 21 departments with id number 1-21.

In [10]:
# Load the orders table (one row per order) and preview its first five rows.
df_orders = pd.read_csv(filepath_or_buffer='orders.csv')
df_orders.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [11]:
# Compare the distinct order_id count with the row count, then show the ids.
# Equal counts mean every row carries its own order_id.
print(
    df_orders['order_id'].nunique(),
    len(df_orders['order_id']),
    df_orders['order_id'].unique(),
    sep='\n',
)

3421083
3421083
[2539329 2398795  473747 ...  626363 2977660  272231]


##### There are 3,421,083 unique orders, the same as the total number of observations

In [12]:
# Print the number of distinct users in the orders table, then the distinct ids.
print(
    df_orders['user_id'].nunique(),
    df_orders['user_id'].unique(),
    sep='\n',
)

206209
[     1      2      3 ... 206207 206208 206209]


##### There are 206,209 unique users, far fewer than the number of orders, indicating that users have multiple orders

In [13]:
# Highest order_number reached by each user, i.e. how many orders that user has
# placed. Computed once and reused for all four statistics: the original rebuilt
# a full-frame groupby(...).aggregate('last') for every print, and 'last' only
# matches the order count when rows happen to be sorted by order_number within
# each user. max() does not depend on row order.
orders_per_user = df_orders.groupby('user_id')['order_number'].max()

print('--- descriptive statistics by user_id latest order ---')
print('min:\t', orders_per_user.min())
print('mean:\t', orders_per_user.mean())
print('median:\t', orders_per_user.median())
print('max:\t', orders_per_user.max())

--- descriptive statistics by user_id latest order ---
min:	 4
mean:	 16.590367054784224
median:	 10.0
max:	 100


#####  Users ordered between 4 and 100 times inclusive

In [17]:
# Count the non-null order_number entries for each order_dow value; the
# resulting Series is the cell's displayed output.
print('--- total number of orders per day of week ---')
df_orders.groupby('order_dow')['order_number'].count()

--- total number of orders per day of week ---


order_dow
0    600905
1    587478
2    467260
3    436972
4    426339
5    453368
6    448761
Name: order_number, dtype: int64

In [18]:
# Count the non-null order_number entries for each order_hour_of_day value;
# the resulting Series is the cell's displayed output.
print('--- total number of orders per hour of the day ---')
df_orders.groupby('order_hour_of_day')['order_number'].count()

--- total number of orders per hour of the day ---


order_hour_of_day
0      22758
1      12398
2       7539
3       5474
4       5527
5       9569
6      30529
7      91868
8     178201
9     257812
10    288418
11    284728
12    272841
13    277999
14    283042
15    283639
16    272553
17    228795
18    182912
19    140569
20    104292
21     78109
22     61468
23     40043
Name: order_number, dtype: int64

In [18]:
# Load the product catalogue and preview its first five rows.
df_prod = pd.read_csv(filepath_or_buffer='products.csv')
df_prod.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [19]:
# Print the number of distinct product ids, then the distinct ids themselves.
print(
    df_prod['product_id'].nunique(),
    df_prod['product_id'].unique(),
    sep='\n',
)

49688
[    1     2     3 ... 49686 49687 49688]


##### There are 49,688 unique products with id number 1-49688

In [3]:
# Load the prior-order line items, report the table's (rows, columns) shape,
# and preview the first five rows.
df_ord_prod = pd.read_csv(filepath_or_buffer='order_products__prior.csv')
print(df_ord_prod.shape)
df_ord_prod.head(n=5)

(32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [23]:
# Compare key coverage between the order/product link table and the tables it
# refers to: for each key column, print a header, the distinct count, and the
# distinct values.

# order_id: orders table vs. prior-order line items
print('--- df_orders: order_id ---',
      df_orders['order_id'].nunique(),
      df_orders['order_id'].unique(),
      sep='\n')
print('--- df_ord_prod: order_id ---',
      df_ord_prod['order_id'].nunique(),
      df_ord_prod['order_id'].unique(),
      sep='\n')

# product_id: product catalogue vs. prior-order line items
print('--- df_prod: product_id ---',
      df_prod['product_id'].nunique(),
      df_prod['product_id'].unique(),
      sep='\n')
print('--- df_ord_prod: product_id ---',
      df_ord_prod['product_id'].nunique(),
      df_ord_prod['product_id'].unique(),
      sep='\n')

--- df_orders: order_id ---
3421083
[2539329 2398795  473747 ...  626363 2977660  272231]
--- df_ord_prod: order_id ---
3214874
[      2       3       4 ... 3421081 3421082 3421083]
--- df_prod: product_id ---
49688
[    1     2     3 ... 49686 49687 49688]
--- df_ord_prod: product_id ---
49677
[33120 28985  9327 ... 33097 38977 23624]


In [19]:
# Highest add_to_cart_order position within each order, used as the cart size.
# Computed once and reused for all four statistics: the original rebuilt a
# full-frame groupby(...).aggregate('last') over the line-item table for every
# print, and 'last' only yields the cart size when each order's rows are stored
# in add-to-cart order. max() does not depend on row order.
items_per_order = df_ord_prod.groupby('order_id')['add_to_cart_order'].max()

print('--- descriptive statistics by number of items in cart ---')
print('min:\t', items_per_order.min())
print('mean:\t', items_per_order.mean())
print('median:\t', items_per_order.median())
print('max:\t', items_per_order.max())

--- descriptive statistics by number of items in cart ---
min:	 1
mean:	 10.088883421247614
median:	 8.0
max:	 145


In [20]:
# Per-order sum of the `reordered` column, i.e. how many line items in each
# order carry reordered == 1 (assuming the column is a 0/1 flag, as in the
# preview rows). Aggregated once and reused, instead of re-running the
# groupby-sum over the whole line-item table for every statistic.
reordered_per_order = df_ord_prod.groupby('order_id')['reordered'].sum()

# The header previously repeated the cart-size title from the preceding cell,
# which mislabelled these reordered-item statistics.
print('--- descriptive statistics by number of reordered items in cart ---')
print('min:\t', reordered_per_order.min())
print('mean:\t', reordered_per_order.mean())
print('median:\t', reordered_per_order.median())
print('max:\t', reordered_per_order.max())

--- descriptive statistics by number of items in cart ---
min:	 0
mean:	 5.949388996271704
median:	 4.0
max:	 130


# Project Proposal
---
#### Unsupervised Learning
- Perform clustering to segment users by their ordering habits (type and amount of products, frequency of purchases, etc.)
- **Business Value**
 - Understanding similarities in user behavior can allow InstaCart to target some users to increase their usage and other users to retain their usage
 - Could also potentially create order groups per user (clustering per user perhaps?) for simple 1-click ordering

<br></br>
#### Supervised Learning
- Perform classification/regression to predict number of days until next purchase
 - Classification - bin the number of days into "day ranges" and do a multi-class classification
- **Business Value**
 - This can allow InstaCart to implement a feature to remind users that it's time to shop again
 
<br></br>
#### Project Tasks
##### Data Cleaning (PySpark)
- combine multiple data sets into single data set using common id variables
- sort variables into continuous/categorical
- deal with NULLs / missing data
- deal with outliers

<br></br>
##### Data Exploration (PySpark)
- generate new variables
- data visualization (Jupyter)

<br></br>
##### Feature Engineering (Jupyter/Local)
- select features for unsupervised and supervised learning
- select supervised learning output variable
- feature reduction (pca/tsne/umaps/correlation matrix)

<br></br>
##### Unsupervised Learning (Jupyter/Local)
- elbow plot / silhouette coefficient to determine k
- try multiple clustering algorithms
 - kmeans with k clusters
 - agglomerative clustering
 - dbscan
 - gmm
 - other?
- visualize data with pca/tsne/umaps
- choose best 2 solutions and search hyperparameters
- analyze the resulting clusters

<br></br>
##### Supervised Learning (Jupyter/Local)
- choose classification or regression
- try multiple algorithms
 - random forest
 - knn
 - svm
 - xgb
- choose best 2 solutions and search hyperparameters