# PII DATA, REGIONS, AND EXCLUSION FLAGS

<a id='0'></a> <br>
 # Table of Contents  
 
[Import Data and Libraries](#0.1)
1. [PII Data](#1)     
1. [Regions](#2) 
    1. [Create regional segmentations](#3)
    1. [Regional Spending Habits](#4)
1. [Exclusion Flag](#5)

<a id='0.1'></a> <br>
# Import Data and Libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# creates path
# The project root can be overridden with the INSTACART_PROJECT_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local folder is used.

path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'G:\My Drive\CareerFoundry\Python Projects\2023-10 Instacart Basket Analysis',
)

In [3]:
# imports latest dataframe
# Reads the pickled merged data set 'ords_prods_cust_merged.pkl' from
# '02 Data/Prepared Data' under `path`.
# NOTE(review): unpickling can execute arbitrary code, so only load .pkl files
# that come from a trusted source.

opc_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust_merged.pkl'))

In [4]:
# check results

opc_merge.head()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,busiest_period_of_day,loyalty_flag,spending_flag,order_freq_flag,gender,state,age,dependants,fam_status,income
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Average orders,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Average orders,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Most orders,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Average orders,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Most orders,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423


In [5]:
opc_merge.shape

(32404859, 25)

In [6]:
opc_merge.columns

Index(['order_id', 'user_id', 'number_of_orders', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_days',
       'busiest_period_of_day', 'loyalty_flag', 'spending_flag',
       'order_freq_flag', 'gender', 'state', 'age', 'dependants', 'fam_status',
       'income'],
      dtype='object')

<a id='1'></a> <br>
# 1. PII Data
# Consider any security implications that might exist for this new data. You’ll need to address any PII data in the data before continuing your analysis.

### I previously removed any PII data (specifically first_name and last_name), as it was both a security/privacy issue and unnecessary for the analysis.

<a id='2'></a> <br>
# 2. Regions
# Determine whether there’s a difference in spending habits between the different U.S. regions.

<a id='3'></a> <br>
> ## A. Create regional segmentations

In [7]:
# create a “Region” column based on the “State” column from
# https://simple.wikipedia.org/wiki/List_of_regions_of_the_United_States

In [8]:
# creates arrays to use in if-statements

In [9]:
# states (plus the District of Columbia) that are labelled 'South'
arr_str_south = [
    'West Virginia',
    'District of Columbia',
    'Maryland',
    'Virginia',
    'Kentucky',
    'Tennessee',
    'North Carolina',
    'Mississippi',
    'Arkansas',
    'Louisiana',
    'Alabama',
    'Georgia',
    'South Carolina',
    'Florida',
    'Delaware',
    'Oklahoma',
    'Texas',
]

In [10]:
# states that are labelled 'West'
arr_str_west = [
    'Washington',
    'Oregon',
    'California',
    'Nevada',
    'Idaho',
    'Montana',
    'Wyoming',
    'Utah',
    'Colorado',
    'Alaska',
    'Hawaii',
    'Arizona',
    'New Mexico',
]

In [11]:
# states that are labelled 'Midwest'
arr_str_midwest = [
    'North Dakota',
    'South Dakota',
    'Nebraska',
    'Kansas',
    'Minnesota',
    'Iowa',
    'Missouri',
    'Wisconsin',
    'Illinois',
    'Michigan',
    'Indiana',
    'Ohio',
]

In [12]:
# states that are labelled 'Northeast'
arr_str_northeast = [
    'Maine',
    'Vermont',
    'New York',
    'New Hampshire',
    'Massachusetts',
    'Rhode Island',
    'Connecticut',
    'New Jersey',
    'Pennsylvania',
]

In [13]:
# check results

arr_str_south

['West Virginia',
 'District of Columbia',
 'Maryland',
 'Virginia',
 'Kentucky',
 'Tennessee',
 'North Carolina',
 'Mississippi',
 'Arkansas',
 'Louisiana',
 'Alabama',
 'Georgia',
 'South Carolina',
 'Florida',
 'Delaware',
 'Oklahoma',
 'Texas']

In [14]:
arr_str_northeast

['Maine',
 'Vermont',
 'New York',
 'New Hampshire',
 'Massachusetts',
 'Rhode Island',
 'Connecticut',
 'New Jersey',
 'Pennsylvania']

In [15]:
arr_str_west

['Washington',
 'Oregon',
 'California',
 'Nevada',
 'Idaho',
 'Montana',
 'Wyoming',
 'Utah',
 'Colorado',
 'Alaska',
 'Hawaii',
 'Arizona',
 'New Mexico']

In [16]:
arr_str_midwest

['North Dakota',
 'South Dakota',
 'Nebraska',
 'Kansas',
 'Minnesota',
 'Iowa',
 'Missouri',
 'Wisconsin',
 'Illinois',
 'Michigan',
 'Indiana',
 'Ohio']

In [17]:
# creates/imputes new 'region' column by looking each 'state' up in a
# state -> region dictionary built from the four arrays defined earlier
# (a single pass over the data instead of four copy-pasted loc() assignments)

states_by_region = {
    'South': arr_str_south,
    'West': arr_str_west,
    'Midwest': arr_str_midwest,
    'Northeast': arr_str_northeast,
}

# invert to {state: region}; regions are inserted in the same order as the
# original assignments, so a state listed twice would keep the later region
state_to_region = {
    state: region
    for region, states in states_by_region.items()
    for state in states
}

# states missing from the dictionary end up as NaN, the same result the
# masked loc() assignments produced for unmatched rows
opc_merge['region'] = opc_merge['state'].map(state_to_region)

In [21]:
# check results

opc_merge['region'].value_counts(dropna = False)

region
South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: count, dtype: int64

In [22]:
opc_merge.head()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,loyalty_flag,spending_flag,order_freq_flag,gender,state,age,dependants,fam_status,income,region
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,New customer,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South


In [23]:
opc_merge.tail()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,loyalty_flag,spending_flag,order_freq_flag,gender,state,age,dependants,fam_status,income,region
32404854,156685,106143,26,4,23,5.0,19675,1,1,Organic Raspberry Black Tea,...,Regular customer,High spender,Frequent customer,Male,Hawaii,25,0,single,53755,West
32404855,484769,66343,1,6,11,0.0,47210,1,0,Fresh Farmed Tilapia Fillet,...,New customer,Low spender,Non-frequent customer,Female,Tennessee,22,3,married,46151,South
32404856,1561557,66343,2,1,11,30.0,47210,1,1,Fresh Farmed Tilapia Fillet,...,New customer,Low spender,Non-frequent customer,Female,Tennessee,22,3,married,46151,South
32404857,276317,66343,3,6,15,19.0,47210,1,1,Fresh Farmed Tilapia Fillet,...,New customer,Low spender,Non-frequent customer,Female,Tennessee,22,3,married,46151,South
32404858,2922475,66343,4,1,12,30.0,47210,1,1,Fresh Farmed Tilapia Fillet,...,New customer,Low spender,Non-frequent customer,Female,Tennessee,22,3,married,46151,South


In [24]:
# checks if regions have been assigned correctly:
# collects the distinct 'state' values found under each 'region' label

group = opc_merge.groupby('region')['state'].unique()

In [25]:
list(group)

[array(['Indiana', 'Iowa', 'Ohio', 'South Dakota', 'Michigan', 'Minnesota',
        'Illinois', 'Kansas', 'Nebraska', 'Missouri', 'Wisconsin',
        'North Dakota'], dtype=object),
 array(['Connecticut', 'New Jersey', 'Maine', 'Vermont', 'Rhode Island',
        'New Hampshire', 'Massachusetts', 'Pennsylvania', 'New York'],
       dtype=object),
 array(['Alabama', 'Louisiana', 'Oklahoma', 'Tennessee', 'Virginia',
        'Kentucky', 'North Carolina', 'Delaware', 'Maryland', 'Arkansas',
        'Georgia', 'District of Columbia', 'Florida', 'South Carolina',
        'Mississippi', 'Texas', 'West Virginia'], dtype=object),
 array(['Montana', 'Oregon', 'Arizona', 'Hawaii', 'Idaho', 'Alaska',
        'Nevada', 'Utah', 'Colorado', 'Washington', 'California',
        'New Mexico', 'Wyoming'], dtype=object)]

<a id='4'></a> <br>
> ## B. Regional Spending Habits

In [26]:
# creates a crosstab of region (rows) and spending flag (columns)

crosstab = pd.crosstab(
    index=opc_merge['region'],
    columns=opc_merge['spending_flag'],
    dropna=False,
)

In [44]:
# check results

print(crosstab)

spending_flag  High spender  Low spender
region                                  
Midwest              155975      7441350
Northeast            108225      5614511
South                209691     10582194
West                 160354      8132559


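### - The South has the most rows in both spending categories (209,691 High spender and 10,582,194 Low spender) and therefore the most overall, so it accounts for the most spending activity, followed by the West, Midwest, and Northeast, in that order. Note that these counts are rows of the merged orders data rather than distinct customers, and that High spenders make up only about 2% of the rows in every region (Midwest ≈ 2.05%, South ≈ 1.94%, West ≈ 1.93%, Northeast ≈ 1.89%), so the regional differences mainly reflect volume rather than a different mix of spending habits.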

<a id='5'></a> <br>
# 3. Exclusion Flag
## Create an exclusion flag for low-activity customers (customers with fewer than 5 orders) and exclude them from the data. Make sure you export this sample.

In [28]:
# recreate 'max_order' column (previously deleted)

In [29]:
# highest 'number_of_orders' value per 'user_id', broadcast back to every row
# of that user. The string alias 'max' uses pandas' built-in grouped maximum;
# passing np.max is deprecated in recent pandas versions (FutureWarning).
opc_merge['max_order'] = opc_merge.groupby('user_id')['number_of_orders'].transform('max')

In [30]:
# check results

opc_merge.head(25)

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,spending_flag,order_freq_flag,gender,state,age,dependants,fam_status,income,region,max_order
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10
9,2550362,1,10,4,8,30.0,196,1,1,Soda,...,Low spender,Regular customer,Female,Alabama,31,3,married,40423,South,10


In [31]:
opc_merge.shape

(32404859, 27)

In [32]:
# creates the exclusion_flag from max_order using loc()
# the cut-off is named once so both assignments always use the same threshold

min_orders_to_keep = 5  # rows with max_order below this are flagged 'exclude'

opc_merge.loc[opc_merge['max_order'] < min_orders_to_keep, 'low_activity_flag'] = 'exclude'
opc_merge.loc[opc_merge['max_order'] >= min_orders_to_keep, 'low_activity_flag'] = 'keep'

In [34]:
# check results

opc_merge['low_activity_flag'].value_counts(dropna = False)

low_activity_flag
keep       30964564
exclude     1440295
Name: count, dtype: int64

In [35]:
# creates subset of low_activity users (rows flagged 'exclude')

is_low_activity = opc_merge['low_activity_flag'] == 'exclude'
low_activity_excluded = opc_merge.loc[is_low_activity]

In [36]:
# check results

low_activity_excluded.shape

(1440295, 28)

In [37]:
# exports low_activity_excluded subset
# Written as a pickle file to '02 Data/Prepared Data/low_activity_excluded.pkl'
# under `path`.

low_activity_excluded.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'low_activity_excluded.pkl'))

In [38]:
# creates new df without low activity users (only rows flagged 'keep')

is_active = opc_merge['low_activity_flag'] == 'keep'
opc_merge_active = opc_merge.loc[is_active]

In [39]:
# check results

opc_merge_active.head()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,order_freq_flag,gender,state,age,dependants,fam_status,income,region,max_order,low_activity_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Regular customer,Female,Alabama,31,3,married,40423,South,10,keep
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Regular customer,Female,Alabama,31,3,married,40423,South,10,keep
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Regular customer,Female,Alabama,31,3,married,40423,South,10,keep
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Regular customer,Female,Alabama,31,3,married,40423,South,10,keep
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Regular customer,Female,Alabama,31,3,married,40423,South,10,keep


In [40]:
opc_merge_active.shape

(30964564, 28)

In [41]:
opc_merge_active.columns

Index(['order_id', 'user_id', 'number_of_orders', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_days',
       'busiest_period_of_day', 'loyalty_flag', 'spending_flag',
       'order_freq_flag', 'gender', 'state', 'age', 'dependants', 'fam_status',
       'income', 'region', 'max_order', 'low_activity_flag'],
      dtype='object')

In [45]:
# exports opc_merge_active subset
# Written as a pickle file to '02 Data/Prepared Data/opc_merge_active.pkl'
# under `path`.

opc_merge_active.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'opc_merge_active.pkl'))