### Contents
1. Import libraries and data
2. Data security check
3. Create a regional segmentation
   - 3a. Create a “Region” column based on the “State” column
   - 3b. Spending habits between the different U.S. regions

### 1. Import libraries and data

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Define path
# The project folder can be overridden with the INSTACART_PROJECT_DIR environment
# variable so the notebook is not tied to one machine; when the variable is not
# set, the original local folder is used unchanged.

path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\Windows 10\Documents\04-2023 Instacart Basket Analysis',
)

In [3]:
# Import data
# Build the location of the prepared pickle file under the project folder first,
# then load it into df_project_all.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.

prepared_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers.pkl')
df_project_all = pd.read_pickle(prepared_file)

### 2. Data security check

In [4]:
# List every column with its dtype so that sensitive (PII) fields can be spotted
df_project_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 31 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 object  
 2   order_number            int64   
 3   orders_day_of_week      int64   
 4   order_hour_of_day       int64   
 5   days_since_prior_order  float64 
 6   product_id              int64   
 7   add_to_cart_order       int64   
 8   reordered               int64   
 9   product_name            object  
 10  aisle_id                int64   
 11  department_id           int64   
 12  prices                  float64 
 13  _merge                  category
 14  price_range_loc         object  
 15  busiest_day             object  
 16  busiest_days            object  
 17  busiest_period_of_day   object  
 18  max_order               int64   
 19  loyalty_flag            object  
 20  average_price           float64 
 21  spendi

In [5]:
# Any PII in the data must be addressed before continuing the analysis.

# The first_name and last_name columns were already removed in the previous task (4.9).

### 3. Create a regional segmentation

#### 3a. Create a “Region” column based on the “State” column

In [6]:
# Check the "state" column: count the rows per state, keeping missing values in the tally (dropna = False)

df_project_all['state'].value_counts(dropna = False)

Pennsylvania            667082
California              659783
Rhode Island            656913
Georgia                 656389
New Mexico              654494
Arizona                 653964
North Carolina          651900
Oklahoma                651739
Alaska                  648495
Minnesota               647825
Massachusetts           646358
Wyoming                 644255
Virginia                641421
Missouri                640732
Texas                   640394
Colorado                639280
Maine                   638583
North Dakota            638491
Alabama                 638003
Kansas                  637538
Louisiana               637482
Delaware                637024
South Carolina          636754
Oregon                  636425
Arkansas                636144
Nevada                  636139
New York                635983
Montana                 635265
South Dakota            633772
Illinois                633024
Hawaii                  632901
Washington              632852
Mississi

In [7]:
# Create new variable (region), initialised as an empty list

region = []

In [8]:
# Assign each state to a region with a vectorised lookup instead of a Python-level loop

# Explicit state lists for three regions. As in the original if/elif chain, every
# other value (including a missing state) is labelled 'South'.
# NOTE(review): an unexpected or misspelled state name is therefore also counted as 'South'.
states_by_region = {
    'Northeast': ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey'],
    'Midwest': ['Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri'],
    'West': ['Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii'],
}

# Invert the lists into a flat {state: region} dictionary for Series.map.
state_to_region = {
    state: region_name
    for region_name, states in states_by_region.items()
    for state in states
}

# Rebuild `region` from scratch on every run, so re-executing this cell never
# appends a second copy of the labels (the loop version kept growing the list).
# States that are not in the dictionary map to NaN and are then filled with 'South'.
region = df_project_all['state'].map(state_to_region).fillna('South')

In [9]:
# Insert region as column in df_project_all
# (adds or overwrites the 'region' column on the existing frame; no copy is made)

df_project_all['region'] = region

In [10]:
# Count the rows per region, keeping missing values in the tally, to check the new column
df_project_all['region'].value_counts(dropna = False)

South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: region, dtype: int64

In [11]:
# Re-inspect the columns and dtypes after adding the 'region' column
df_project_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 32 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 object  
 2   order_number            int64   
 3   orders_day_of_week      int64   
 4   order_hour_of_day       int64   
 5   days_since_prior_order  float64 
 6   product_id              int64   
 7   add_to_cart_order       int64   
 8   reordered               int64   
 9   product_name            object  
 10  aisle_id                int64   
 11  department_id           int64   
 12  prices                  float64 
 13  _merge                  category
 14  price_range_loc         object  
 15  busiest_day             object  
 16  busiest_days            object  
 17  busiest_period_of_day   object  
 18  max_order               int64   
 19  loyalty_flag            object  
 20  average_price           float64 
 21  spendi

#### 3b. Spending habits between the different U.S. regions

In [12]:
# Cross-tabulate 'region' (rows) against 'spending_flag' (columns) to get row counts per combination

crosstab_spending_region = pd.crosstab(
    index = df_project_all['region'],
    columns = df_project_all['spending_flag'],
    dropna = False,
)

In [13]:
# Display the crosstab (row counts per region and spending flag)

crosstab_spending_region

spending_flag,High spender,Low spender
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,29444,7567881
Northeast,18662,5704074
South,40905,10750980
West,31347,8261566


In [14]:
# Add additional information: the row total per region and each spending flag's percentage share

# Sum only the two count columns. Summing the whole frame would fold the
# 'Total' and percentage columns into the total whenever this cell is re-run.
spending_counts = crosstab_spending_region[['High spender', 'Low spender']]
region_totals = spending_counts.sum(axis = 1)

crosstab_spending_region['Total'] = region_totals
crosstab_spending_region['% High spender'] = spending_counts['High spender'] / region_totals * 100
crosstab_spending_region['% Low spender'] = spending_counts['Low spender'] / region_totals * 100

In [15]:
# Display the new crosstab (counts plus the 'Total' and percentage columns)

crosstab_spending_region

spending_flag,High spender,Low spender,Total,% High spender,% Low spender
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Midwest,29444,7567881,7597325,0.387557,99.612443
Northeast,18662,5704074,5722736,0.326103,99.673897
South,40905,10750980,10791885,0.379035,99.620965
West,31347,8261566,8292913,0.377997,99.622003


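##### In every U.S. region, the vast majority of records (more than 99.6%) belong to customers categorized as low spenders, i.e. customers whose average price per purchased product is lower than 10. The share of high-spender records is similar across regions (about 0.33–0.39%), so spending habits differ very little between regions.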