TABLE OF CONTENT

01 -- IMPORTING LIBRARIES

02 -- IMPORTING DATA

03 -- REGIONAL SEGMENTATION OF THE US CUSTOMER

04 -- EXCLUDING LOW-ACTIVTY CUSTOMERS

05 -- CUSTOMER AGE BINS & INCOME FLAGS

06 -- DF SAMPLE CREATION (SMALL 30% & BIG 70% )

07 -- EXPORTING DATA


# 01 IMPORTING LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02 IMPORTING DATA

In [3]:
path = r'/Users/woodoooo/Desktop/Instacart Basket Analysis'

# orders + products + customers data
df_ords_prods_customers = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', '4.9_ords_prods_customers.pkl'))

In [4]:
#departments data 
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'), index_col = False)

In [5]:
df_ords_prods_customers.shape

(32404859, 32)

In [6]:
# NOTE:
# For data security and ethical handling of personally identifiable information (PII),
# the 'first_name' and 'last_name' columns have been removed from the customer dataset prior to analysis. 

In [7]:
df_ords_prods_customers.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,user_median_order_frequency,order_frequency_flag,gender,state,age,date_joined,n_dependants,marital_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2539329,1,1,2,8,,True,14084,2,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,2539329,1,1,2,8,,True,12427,3,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2539329,1,1,2,8,,True,26088,4,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,2539329,1,1,2,8,,True,26405,5,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both


# 03 REGIONAL SEGMENTATION OF THE US CUSTOMER

In [8]:
# Creating a regional segmentation of the data to compare customer behavior in greographic areas across the US. 

In [9]:
# Create dictionary mapping each state to a region

# Northeast region states
region_northeast = [
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut',
    'New York', 'Pennsylvania', 'New Jersey'
]

# Midwest region states
region_midwest = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio',
    'North Dakota', 'South Dakota', 'Nebraska', 'Kansas',
    'Minnesota', 'Iowa', 'Missouri'
]

# South region states
region_south = [
    'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
    'North Carolina', 'South Carolina', 'Georgia', 'Florida',
    'Kentucky', 'Tennessee', 'Mississippi', 'Alabama',
    'Oklahoma', 'Texas', 'Arkansas', 'Louisiana'
]

# West region states
region_west = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado',
    'Arizona', 'New Mexico', 'Alaska', 'Washington', 'Oregon',
    'California', 'Hawaii'
]

In [10]:
# Create mapping dictionary using clearly named bins
region_map = {}

for state in region_northeast:
    region_map[state] = 'Northeast'

for state in region_midwest:
    region_map[state] = 'Midwest'

for state in region_south:
    region_map[state] = 'South'

for state in region_west:
    region_map[state] = 'West'

In [11]:
# Adding region column

df_ords_prods_customers['region'] = df_ords_prods_customers['state'].map(region_map).fillna('Other')

In [12]:
# checking 

df_ords_prods_customers['region'].value_counts(dropna=False)

region
South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: count, dtype: int64

In [13]:
df_ords_prods_customers.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,order_frequency_flag,gender,state,age,date_joined,n_dependants,marital_status,income,_merge,region
0,2539329,1,1,2,8,,True,196,1,0,...,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both,South
1,2539329,1,1,2,8,,True,14084,2,0,...,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both,South
2,2539329,1,1,2,8,,True,12427,3,0,...,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both,South
3,2539329,1,1,2,8,,True,26088,4,0,...,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both,South
4,2539329,1,1,2,8,,True,26405,5,0,...,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both,South


In [14]:
# dropping the '_merge' column

In [15]:
df_ords_prods_customers.drop('_merge', axis=1, inplace=True)

In [16]:
df_ords_prods_customers.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,user_median_order_frequency,order_frequency_flag,gender,state,age,date_joined,n_dependants,marital_status,income,region
0,2539329,1,1,2,8,,True,196,1,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,South
1,2539329,1,1,2,8,,True,14084,2,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,South
2,2539329,1,1,2,8,,True,12427,3,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,South
3,2539329,1,1,2,8,,True,26088,4,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,South
4,2539329,1,1,2,8,,True,26405,5,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,South


In [17]:
list(df_ords_prods_customers.columns)

['order_id',
 'user_id',
 'order_number',
 'order_day_of_week',
 'order_hour_of_day',
 'days_since_prior_order',
 'new_customer',
 'product_id',
 'add_to_cart_order',
 'reordered',
 'product_name',
 'aisle_id',
 'department_id',
 'prices',
 'price_range_loc',
 'Busiest day',
 'Busiest_days',
 'busiest_period_of_day',
 'max_order',
 'loyalty_flag',
 'avg_price',
 'avg_spending_flag',
 'user_median_order_frequency',
 'order_frequency_flag',
 'gender',
 'state',
 'age',
 'date_joined',
 'n_dependants',
 'marital_status',
 'income',
 'region']

In [18]:
# Exploring whether there’s a difference in spending habits between the different U.S. regions

pd.crosstab(df_ords_prods_customers['region'], df_ords_prods_customers['avg_spending_flag'], dropna=False)

avg_spending_flag,High spender,Low spender
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,29265,7568060
Northeast,18642,5704094
South,40579,10751306
West,31223,8261690


The South and West regions have the highest number of high spenders, with 40,579 and 31,223 respectively, compared to 29,265 in the Midwest and 18,642 in the Northeast.
Despite the fact that low spenders dominate in all regions, the South and West still lead in absolute number of high spenders.

Suggestion to Marketing team: Focus marketing and premium product promotions on South and West regions — these areas have a larger base of high-value customers.

# 04 EXCLUDING LOW-ACTIVTY CUSTOMERS

In [19]:
# Creating an exclusion flag for low-activity customers with less than 5 orders

df_ords_prods_customers['low-activity_flag']=df_ords_prods_customers['max_order'] <5

In [20]:
# Checking
df_ords_prods_customers['low-activity_flag'].value_counts()

low-activity_flag
False    30964564
True      1440295
Name: count, dtype: int64

In [21]:
df_ords_prods_customers[df_ords_prods_customers['low-activity_flag']].head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,order_frequency_flag,gender,state,age,date_joined,n_dependants,marital_status,income,region,low-activity_flag
360,2717275,5,1,3,12,,True,15349,1,0,...,Regular customer,Female,California,75,2018-10-08,0,divorced/widowed,115242,West,True
361,2717275,5,1,3,12,,True,21413,2,0,...,Regular customer,Female,California,75,2018-10-08,0,divorced/widowed,115242,West,True
362,2717275,5,1,3,12,,True,48775,3,0,...,Regular customer,Female,California,75,2018-10-08,0,divorced/widowed,115242,West,True
363,2717275,5,1,3,12,,True,28289,4,0,...,Regular customer,Female,California,75,2018-10-08,0,divorced/widowed,115242,West,True
364,2717275,5,1,3,12,,True,8518,5,0,...,Regular customer,Female,California,75,2018-10-08,0,divorced/widowed,115242,West,True


In [22]:
# Exporting the excluded sample of low-activity customers
df_low_activity = df_ords_prods_customers[df_ords_prods_customers['low-activity_flag'] == True]

In [23]:
# Exporting the sample of low-activity customers
df_low_activity.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'low_activity_customers.pkl'))

In [24]:
# Excluding low-activity customers from the main dataframe
df_ords_prods_active_customers = df_ords_prods_customers[df_ords_prods_customers['low-activity_flag'] == False]


In [25]:
df_ords_prods_active_customers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30964564 entries, 0 to 32404858
Data columns (total 33 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   order_id                     int32         
 1   user_id                      int32         
 2   order_number                 int8          
 3   order_day_of_week            int8          
 4   order_hour_of_day            int8          
 5   days_since_prior_order       float32       
 6   new_customer                 bool          
 7   product_id                   int32         
 8   add_to_cart_order            int32         
 9   reordered                    int8          
 10  product_name                 object        
 11  aisle_id                     int32         
 12  department_id                int32         
 13  prices                       float32       
 14  price_range_loc              object        
 15  Busiest day                  object        
 16  Bus

In [26]:
df_ords_prods_active_customers.shape

(30964564, 33)

In [27]:
df_ords_prods_active_customers.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id,prices,max_order,avg_price,user_median_order_frequency,age,date_joined,n_dependants,income
count,30964560.0,30964560.0,30964560.0,30964560.0,30964560.0,29303280.0,30964560.0,30964560.0,30964560.0,30964560.0,30964560.0,30964560.0,30964560.0,30964560.0,30964560.0,30964560.0,30964564,30964560.0,30964560.0
mean,1710317.0,102928.4,17.83414,2.741397,13.41098,10.8103,25598.5,8.363038,0.6061019,71.18859,9.922365,7.790483,34.42569,7.790481,9.957143,49.46803,2018-08-16 15:24:26.241062400,1.501819,99675.87
min,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,5.0,1.0,0.0,18.0,2017-01-01 00:00:00,0.0,25903.0
25%,855412.0,51409.0,5.0,1.0,10.0,5.0,13544.0,3.0,0.0,31.0,4.0,4.2,14.0,7.385091,6.0,33.0,2017-10-22 00:00:00,1.0,67292.0
50%,1710404.0,102586.0,12.0,3.0,13.0,7.0,25288.0,6.0,1.0,83.0,9.0,7.4,28.0,7.812069,7.0,49.0,2018-08-16 00:00:00,2.0,96765.0
75%,2565206.0,154387.0,25.0,5.0,16.0,14.0,37947.0,11.0,1.0,107.0,16.0,11.3,48.0,8.224,12.5,65.0,2019-06-09 00:00:00,3.0,128102.0
max,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0,134.0,21.0,25.0,99.0,20.11429,30.0,81.0,2020-04-01 00:00:00,3.0,593901.0
std,987378.5,59470.61,17.63442,2.089254,4.248845,8.088373,14080.96,7.135399,0.4886127,38.22194,6.282441,4.105847,24.89504,0.9786278,6.494975,18.48528,,1.118896,43141.87


In [28]:
# Exporting the active customers data
df_ords_prods_active_customers.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'active_customers.pkl'))

# 05 CUSTOMER AGE BINS & INCOME FLAGS

In [29]:
df_ords_prods_active_customers['age'].min()

18

In [30]:
df_ords_prods_active_customers['age'].max()

81

In [31]:
# Creating age bins & labels

18–25 → Younger adults (often students or early career)

26–40 → Adults (often working professionals, early family stage)

41–60 → Middle-aged (established careers, older families)

61–81 → Seniors/retirees

In [32]:
df_ords_prods_active_customers.loc[:, 'age_group']=pd.cut(
    df_ords_prods_active_customers['age'],
    bins=[17,25,40,60,81],
    labels=['18-25', '26-40', '41-60', '61-81']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ords_prods_active_customers.loc[:, 'age_group']=pd.cut(


In [33]:
df_ords_prods_active_customers['age_group'].value_counts(dropna=False)

age_group
61-81    10112607
41-60     9703524
26-40     7283915
18-25     3864518
Name: count, dtype: int64

## INCOME FLAGS

In [34]:
print("Minimum income:", df_ords_prods_active_customers['income'].min())
print("Maximum income:", df_ords_prods_active_customers['income'].max())

Minimum income: 25903
Maximum income: 593901


In [35]:
median_income = df_ords_prods_active_customers['income'].median()
print("Median income:", median_income)

Median income: 96765.0


In [36]:
df_ords_prods_active_customers['income'].describe()

count    3.096456e+07
mean     9.967587e+04
std      4.314187e+04
min      2.590300e+04
25%      6.729200e+04
50%      9.676500e+04
75%      1.281020e+05
max      5.939010e+05
Name: income, dtype: float64

Max: 593,901 (very high → likely outliers)

75% percentile: 127,912

Mean: 99,438

Median (50%): 96,618

In [37]:
# Creating income flags

Low	income ≤ 67,000	-- Bottom 25%

Middle income	67,001 – 127,912	-- Between 25% and 75% (most customers)

High income	127,913 – 250,000	-- Above 75% but before extreme outliers

Very high income	> 250,000	-- Outliers & affluent group

In [38]:
df_ords_prods_active_customers.loc[:, 'income_flag'] = pd.cut(
    df_ords_prods_active_customers['income'],
    bins=[0, 67000, 127912, 250000, df_ords_prods_active_customers['income'].max()],
    labels=[
        'Low income ≤ 67,000',
        'Middle income 67,001–127,912',
        'High income 127,913–250,000',
        'Very high income > 250,000'
    ]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ords_prods_active_customers.loc[:, 'income_flag'] = pd.cut(


In [39]:
df_ords_prods_active_customers['income_flag'].value_counts(dropna=False)

income_flag
Middle income 67,001–127,912    15531746
Low income ≤ 67,000              7661478
High income 127,913–250,000      7591562
Very high income > 250,000        179778
Name: count, dtype: int64

In [40]:
df_ords_prods_active_customers['income_flag'].info()

<class 'pandas.core.series.Series'>
Index: 30964564 entries, 0 to 32404858
Series name: income_flag
Non-Null Count     Dtype   
--------------     -----   
30964564 non-null  category
dtypes: category(1)
memory usage: 265.8 MB


In [41]:
df_ords_prods_active_customers

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,state,age,date_joined,n_dependants,marital_status,income,region,low-activity_flag,age_group,income_flag
0,2539329,1,1,2,8,,True,196,1,0,...,Alabama,31,2019-02-17,3,married,40423,South,False,26-40,"Low income ≤ 67,000"
1,2539329,1,1,2,8,,True,14084,2,0,...,Alabama,31,2019-02-17,3,married,40423,South,False,26-40,"Low income ≤ 67,000"
2,2539329,1,1,2,8,,True,12427,3,0,...,Alabama,31,2019-02-17,3,married,40423,South,False,26-40,"Low income ≤ 67,000"
3,2539329,1,1,2,8,,True,26088,4,0,...,Alabama,31,2019-02-17,3,married,40423,South,False,26-40,"Low income ≤ 67,000"
4,2539329,1,1,2,8,,True,26405,5,0,...,Alabama,31,2019-02-17,3,married,40423,South,False,26-40,"Low income ≤ 67,000"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,2977660,206209,13,1,12,7.0,False,14197,5,1,...,Iowa,74,2019-09-14,3,married,137969,Midwest,False,61-81,"High income 127,913–250,000"
32404855,2977660,206209,13,1,12,7.0,False,38730,6,0,...,Iowa,74,2019-09-14,3,married,137969,Midwest,False,61-81,"High income 127,913–250,000"
32404856,2977660,206209,13,1,12,7.0,False,31477,7,0,...,Iowa,74,2019-09-14,3,married,137969,Midwest,False,61-81,"High income 127,913–250,000"
32404857,2977660,206209,13,1,12,7.0,False,6567,8,0,...,Iowa,74,2019-09-14,3,married,137969,Midwest,False,61-81,"High income 127,913–250,000"


# 06 DF SAMPLE CREATION (SMALL 30% & BIG 70% )

In [42]:
np.random.seed(4)

In [43]:
dev = np.random.rand(len(df_ords_prods_active_customers)) <= 0.7

In [44]:
big = df_ords_prods_active_customers[dev]

In [45]:
small = df_ords_prods_active_customers[~dev]

In [46]:
len(big), len(small), len(big) + len(small)

(21676096, 9288468, 30964564)

# 07 EXPORTING DATA

In [48]:
small.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', '4.10_df_ords_prods_active_customers_small.pkl'))

In [49]:
big.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', '4.10_df_ords_prods_active_customers_BIG.pkl'))