# 1 Import libraries and data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Set path
# The project data folder can be overridden through the INSTACART_DATA_PATH
# environment variable so the notebook is not tied to one machine's directory
# layout; when the variable is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_DATA_PATH',
    r"C:\Users\miche\Instacart Basket Analysis 2022_MP\02 Data",
)

In [3]:
# Load the derived orders/products dataframe
# (orders_products_derived.pkl from task 4.7)
derived_pkl = os.path.join(path, 'Prepared Data', 'orders_products_derived.pkl')
derived = pd.read_pickle(derived_pkl)

In [4]:
# Explore data

In [5]:
# Preview the first five rows of the imported dataframe
derived.head()

Unnamed: 0,order_id,customer_id,order_number,order_day,order_hour,previous_order,product_id,add_to_cart_order,reordered,product_name,department_id,prices,_merge,price_range,busy_day,busy_days,busy_hours
0,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Regularly busy,Regularly busy,Average orders
1,3169664,363,9,5,14,8.0,26088,19,0,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Regularly busy,Least busy,Most orders
2,554620,1357,4,4,15,30.0,26088,1,1,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Least busy,Least busy,Most orders
3,2927564,1357,6,4,9,30.0,26088,3,1,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Least busy,Least busy,Most orders
4,1028773,1960,13,0,19,16.0,26088,5,1,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Busiest day,Busiest day,Average orders


In [6]:
# List the column labels of the imported dataframe
derived.columns

Index(['order_id', 'customer_id', 'order_number', 'order_day', 'order_hour',
       'previous_order', 'product_id', 'add_to_cart_order', 'reordered',
       'product_name', 'department_id', 'prices', '_merge', 'price_range',
       'busy_day', 'busy_days', 'busy_hours'],
      dtype='object')

In [7]:
# Check the dataframe dimensions as (rows, columns)
derived.shape

(6481355, 17)

# 2 Aggregating data with agg( )

In [8]:
# Single aggregation: mean order_number per department_id
(
    derived
    .groupby('department_id')
    .agg({'order_number': ['mean']})
)

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.458194
2,16.87603
3,17.103005
4,17.811421
5,15.267666
6,16.457281
7,17.242568
8,15.460554
9,15.955104
10,19.906733


In [9]:
# Alternative way to perform the aggregation: select the column, then call mean()
(
    derived
    .groupby('department_id')['order_number']
    .mean()
)

department_id
1     15.458194
2     16.876030
3     17.103005
4     17.811421
5     15.267666
6     16.457281
7     17.242568
8     15.460554
9     15.955104
10    19.906733
11    16.186501
12    15.861564
13    16.570208
14    16.772875
15    16.139515
16    17.683961
17    15.722692
18    19.446872
19    17.183289
20    16.526508
21    22.762749
Name: order_number, dtype: float64

In [10]:
# Multiple aggregations at once: mean, min and max of order_number per department_id
(
    derived
    .groupby('department_id')
    .agg({'order_number': ['mean', 'min', 'max']})
)

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,15.458194,1,99
2,16.87603,1,99
3,17.103005,1,99
4,17.811421,1,99
5,15.267666,1,99
6,16.457281,1,99
7,17.242568,1,99
8,15.460554,1,99
9,15.955104,1,99
10,19.906733,1,99


# 3 Aggregating Data with transform( )

In [11]:
# Create max_order column populated with each customer_id's highest order_number.
# The reducer is given as the string alias 'max' instead of np.max: pandas
# deprecated passing NumPy reducers to groupby agg/transform (FutureWarning
# since pandas 2.1), and the alias keeps using pandas' own group-wise max.
derived['max_order'] = derived.groupby(['customer_id'])['order_number'].transform('max')

In [12]:
# Confirm transformation: the preview should now include the max_order column
derived.head()

Unnamed: 0,order_id,customer_id,order_number,order_day,order_hour,previous_order,product_id,add_to_cart_order,reordered,product_name,department_id,prices,_merge,price_range,busy_day,busy_days,busy_hours,max_order
0,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Regularly busy,Regularly busy,Average orders,10
1,3169664,363,9,5,14,8.0,26088,19,0,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Regularly busy,Least busy,Most orders,12
2,554620,1357,4,4,15,30.0,26088,1,1,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Least busy,Least busy,Most orders,7
3,2927564,1357,6,4,9,30.0,26088,3,1,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Least busy,Least busy,Most orders,7
4,1028773,1960,13,0,19,16.0,26088,5,1,Aged White Cheddar Popcorn,19,4.7,both,Low-range product,Busiest day,Busiest day,Average orders,17


# 4 Deriving Columns with loc( )

In [13]:
# Loyalty flag from max_order: rows of customers with more than 40 orders
derived.loc[derived['max_order'].gt(40), 'loyalty_flag'] = 'Loyal customer'

In [14]:
# Loyalty flag from max_order: more than 10 and at most 40 orders
derived.loc[derived['max_order'].gt(10) & derived['max_order'].le(40), 'loyalty_flag'] = 'Regular customer'

In [16]:
# Loyalty flag from max_order: 10 orders or fewer
derived.loc[derived['max_order'].le(10), 'loyalty_flag'] = 'New customer'

In [17]:
# Count rows per loyalty flag; dropna=False keeps NaN as its own category so
# any rows left unflagged would show up in the counts
derived['loyalty_flag'].value_counts(dropna=False)

Regular customer    3158817
Loyal customer      2045740
New customer        1276798
Name: loyalty_flag, dtype: int64

In [18]:
# Spot-check the assigned flag next to customer_id and order_number for the first 60 rows
derived[['customer_id', 'loyalty_flag', 'order_number']].head(60)

Unnamed: 0,customer_id,loyalty_flag,order_number
0,1,New customer,1
1,363,Regular customer,9
2,1357,New customer,4
3,1357,New customer,6
4,1960,Regular customer,13
5,2556,Loyal customer,17
6,3010,Regular customer,11
7,3393,Regular customer,6
8,3672,Regular customer,5
9,4083,New customer,1


# Task, Steps 5 - 9
#### Steps 1-4 performed above

In [21]:
# 5 Compare spending habits across loyalty flags: mean, min and max product price
# NOTE(review): the recorded output shows a max price of 99999.0 in every group,
# which looks like an outlier/sentinel value inflating the means - confirm and
# clean before interpreting these averages.
(
    derived
    .groupby('loyalty_flag')
    .agg({'prices': ['mean', 'min', 'max']})
)

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,10.525109,1.0,99999.0
New customer,13.252656,1.0,99999.0
Regular customer,12.792695,1.0,99999.0


In [22]:
# 6 Create a spending flag for High and Low spenders
# (criteria: customer's average price per product < 10 = Low spender, >= 10 = High spender)
# Create average_price column holding each customer's mean product price.
# The reducer is given as the string alias 'mean' instead of np.mean: pandas
# deprecated passing NumPy reducers to groupby agg/transform (FutureWarning
# since pandas 2.1), and the alias keeps using pandas' own group-wise mean.
derived['average_price'] = derived.groupby(['customer_id'])['prices'].transform('mean')

In [25]:
# Label every row with a spender type derived from the customer's average_price:
# below 10 is a low spender, 10 or above is a high spender.
derived.loc[derived['average_price'].lt(10), 'spender_flag'] = 'Low spender'
derived.loc[derived['average_price'].ge(10), 'spender_flag'] = 'High spender'

In [26]:
# Confirm spender_flag counts; dropna=False keeps NaN as its own category so
# any rows left unflagged would show up in the counts
derived['spender_flag'].value_counts(dropna=False)

Low spender     6323259
High spender     158096
Name: spender_flag, dtype: int64

In [27]:
# 7 Create customer_frequency flag
# Create median_previous_order column from each customer's previous_order values.
# The reducer is given as the string alias 'median' instead of np.median: pandas
# deprecated passing NumPy reducers to groupby agg/transform (FutureWarning
# since pandas 2.1). The alias keeps pandas' NaN-skipping group median, which the
# recorded results rely on (customers with a NaN previous_order still get a
# median); applying np.median directly would likely yield NaN for such groups.
derived['median_previous_order'] = derived.groupby(['customer_id'])['previous_order'].transform('median')

In [29]:
# Derive the customer_frequency label from median_previous_order:
#   above 20         -> non-frequent
#   above 10, <= 20  -> regular
#   10 or below      -> frequent
# Rows whose median is NaN satisfy none of the conditions and stay unflagged.
derived.loc[derived['median_previous_order'].gt(20), 'customer_frequency'] = 'Non-frequent customer'
derived.loc[derived['median_previous_order'].gt(10) & derived['median_previous_order'].le(20), 'customer_frequency'] = 'Regular customer'
derived.loc[derived['median_previous_order'].le(10), 'customer_frequency'] = 'Frequent customer'

In [31]:
# Count rows per customer_frequency label; dropna=False also reports rows that
# received no label (NaN)
derived['customer_frequency'].value_counts(dropna=False)

Frequent customer        4296048
Regular customer         1434032
Non-frequent customer     748236
NaN                         3039
Name: customer_frequency, dtype: int64

In [32]:
# 8 Review data: check the dataframe dimensions after adding the derived columns
derived.shape

(6481355, 23)

In [33]:
# List the column labels to confirm the new derived and flag columns are present
derived.columns

Index(['order_id', 'customer_id', 'order_number', 'order_day', 'order_hour',
       'previous_order', 'product_id', 'add_to_cart_order', 'reordered',
       'product_name', 'department_id', 'prices', '_merge', 'price_range',
       'busy_day', 'busy_days', 'busy_hours', 'max_order', 'loyalty_flag',
       'average_price', 'spender_flag', 'median_previous_order',
       'customer_frequency'],
      dtype='object')

In [34]:
# Preview the first five rows including the newly added columns
derived.head()

Unnamed: 0,order_id,customer_id,order_number,order_day,order_hour,previous_order,product_id,add_to_cart_order,reordered,product_name,...,price_range,busy_day,busy_days,busy_hours,max_order,loyalty_flag,average_price,spender_flag,median_previous_order,customer_frequency
0,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.01,Low spender,21.0,Non-frequent customer
1,3169664,363,9,5,14,8.0,26088,19,0,Aged White Cheddar Popcorn,...,Low-range product,Regularly busy,Least busy,Most orders,12,Regular customer,8.413158,Low spender,8.0,Frequent customer
2,554620,1357,4,4,15,30.0,26088,1,1,Aged White Cheddar Popcorn,...,Low-range product,Least busy,Least busy,Most orders,7,New customer,3.93,Low spender,30.0,Non-frequent customer
3,2927564,1357,6,4,9,30.0,26088,3,1,Aged White Cheddar Popcorn,...,Low-range product,Least busy,Least busy,Most orders,7,New customer,3.93,Low spender,30.0,Non-frequent customer
4,1028773,1960,13,0,19,16.0,26088,5,1,Aged White Cheddar Popcorn,...,Low-range product,Busiest day,Busiest day,Average orders,17,Regular customer,5.684211,Low spender,19.0,Regular customer


In [35]:
# Save the dataframe with the new flag columns as a pickle in the Prepared Data folder
flags_pkl = os.path.join(path, 'Prepared Data', 'orders_products_flags.pkl')
derived.to_pickle(flags_pkl)