# 1. Import libraries and data

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# define project folder path
path = r'C:\Users\nsmith\OneDrive - Georgia Poultry Laboratory Network\CareerFoundry\02 - Data Immersion\Achievement 4\12-2024 Instacart Basket Analysis'

In [3]:
# import project data
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'completed_data.pkl'))

In [4]:
# import project data
regions = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'regions.csv'))

In [5]:
# import project data
departments = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'))

   # 2. Inspect data

### Merged data containing order, product, and customer information

In [6]:
# view row count
df.shape

(32404859, 35)

In [7]:
# view first 10 rows
df.head(10)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
5,3367565,1,6,2,7,19.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
6,550135,1,7,1,9,20.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
7,3108588,1,8,1,14,14.0,False,196,2,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
8,2295261,1,9,1,16,0.0,False,196,4,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
9,2550362,1,10,4,8,30.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [8]:
# describe dataset
df['prices'].describe()

count    3.239973e+07
mean     7.790994e+00
std      4.241809e+00
min      1.000000e+00
25%      4.200000e+00
50%      7.400000e+00
75%      1.130000e+01
max      2.500000e+01
Name: prices, dtype: float64

In [9]:
# view the mean
df['prices'].mean()

7.790994092173359

In [10]:
# view the median
df['prices'].median()

7.4

In [11]:
# view the max
df['prices'].max()

25.0

In [12]:
# view data type of state column
df['state'].dtype

dtype('O')

### Regions data containing state and region

In [13]:
# view first 10 rows
regions.head(10)

Unnamed: 0,state,region
0,Maine,Region 1 (Northeast)
1,New Hampshire,Region 1 (Northeast)
2,Vermont,Region 1 (Northeast)
3,Massachusetts,Region 1 (Northeast)
4,Rhode Island,Region 1 (Northeast)
5,Connecticut,Region 1 (Northeast)
6,New York,Region 1 (Northeast)
7,Pennsylvania,Region 1 (Northeast)
8,New Jersey,Region 1 (Northeast)
9,Wisconsin,Region 2 (Midwest)


In [14]:
regions.dtypes

state     object
region    object
dtype: object

In [15]:
regions.shape

(51, 2)

In [16]:
regions['region'].value_counts()

Region 3 (South)        17
Region 4 (West)         13
Region 2 (Midwest)      12
Region 1 (Northeast)     9
Name: region, dtype: int64

# 3. Identify and remove PII

#### Table contains first and last names of individuals. This identifying information is not needed for the analysis and can be removed.

In [6]:
# Drop column with personal information
df.drop('first_name', axis=1, inplace=True)


In [7]:
# Drop column with personal information
df.drop('last_name', axis=1, inplace=True)


In [8]:
# Drop merge flag
df.drop('_merge', axis=1, inplace=True)

In [9]:
# Drop merge flag
df.drop('merge_flag', axis=1, inplace=True)

In [10]:
# check columns
df.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'spending', 'spending_flag', 'frequency', 'frequency_flag', 'gender',
       'state', 'age', 'date_joined', 'n_dependants', 'fam_status', 'income'],
      dtype='object')

# 4. Geographic comparison

### Create regions column based on state

#### Explore states column in primary dataset to ensure there are no missing or unexpected values before performing join.

In [11]:
# check for missing values in state column
df['state'].isna().value_counts()

False    32404859
Name: state, dtype: int64

In [12]:
# check to see if there are any unexpected values in state column
states = df['state'].value_counts()

In [13]:
states

Pennsylvania            667082
California              659783
Rhode Island            656913
Georgia                 656389
New Mexico              654494
Arizona                 653964
North Carolina          651900
Oklahoma                651739
Alaska                  648495
Minnesota               647825
Massachusetts           646358
Wyoming                 644255
Virginia                641421
Missouri                640732
Texas                   640394
Colorado                639280
Maine                   638583
North Dakota            638491
Alabama                 638003
Kansas                  637538
Louisiana               637482
Delaware                637024
South Carolina          636754
Oregon                  636425
Arkansas                636144
Nevada                  636139
New York                635983
Montana                 635265
South Dakota            633772
Illinois                633024
Hawaii                  632901
Washington              632852
Mississi

In [14]:
states.shape

(51,)

#### Merge tables to add new region column and verify new table

In [15]:
# merge regions table on state
df_merge = df.merge(regions, on = 'state', indicator = True)

In [16]:
# view first 10 rows of new df
df_merge.head(10)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income,region,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
2,473747,1,3,3,12,21.0,False,196,1,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
4,431534,1,5,4,15,28.0,False,196,1,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
5,3367565,1,6,2,7,19.0,False,196,1,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
6,550135,1,7,1,9,20.0,False,196,1,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
7,3108588,1,8,1,14,14.0,False,196,2,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
8,2295261,1,9,1,16,0.0,False,196,4,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both
9,2550362,1,10,4,8,30.0,False,196,1,1,...,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both


In [17]:
# verify number of rows after join
df_merge.shape

(32404859, 33)

### Perform analysis with new column

In [18]:
crosstab = pd.crosstab(df_merge['spending_flag'], df_merge['region'], dropna = False)

In [30]:
crosstab

region,Region 1 (Northeast),Region 2 (Midwest),Region 3 (South),Region 4 (West)
spending_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High spender,108225,155975,209691,160354
Low spender,5614511,7441350,10582194,8132559


# 5. Create dataset with low-activity customers excluded

### Create an exclusion flag

In [19]:
# add a flag
df_merge.loc[df_merge['max_order'] >= 5, 'activity_flag'] = 'Active'

In [32]:
# inspect new variable
df_merge[['max_order', 'activity_flag']].head(10)

Unnamed: 0,max_order,activity_flag
0,10,Active
1,10,Active
2,10,Active
3,10,Active
4,10,Active
5,10,Active
6,10,Active
7,10,Active
8,10,Active
9,10,Active


In [33]:
df_merge['activity_flag'].value_counts()

Active    30964564
Name: activity_flag, dtype: int64

### Filter data to new dataframe

In [20]:
# Filter rows where Flag is "Active"
df_merge_active = df_merge[df_merge['activity_flag'] == 'Active']

In [35]:
# view new table
df_merge_active.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,gender,state,age,date_joined,n_dependants,fam_status,income,region,_merge,activity_flag
0,2539329,1,1,2,8,,True,196,1,0,...,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both,Active
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both,Active
2,473747,1,3,3,12,21.0,False,196,1,1,...,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both,Active
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both,Active
4,431534,1,5,4,15,28.0,False,196,1,1,...,Female,Alabama,31,2/17/2019,3,married,40423,Region 3 (South),both,Active


In [36]:
# Verify that all max orders are 
df_merge_active['max_order'].min()

5

In [37]:
# Verify number of rows is the same as active flag value count
df_merge_active.shape

(30964564, 34)

#### New data table is filtered to include only customers with 5 or more orders. This is verified by checking to ensure there are no max order values below 5 and that the count of rows in the new table is equal to the count of rows in the original table where the activity flag is "active."

### Export sample

In [21]:
# Export data to pkl
df_merge_active.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'completed_data_active_customers.pkl'))