# 4.10: Coding Etiquette & Excel Reporting - Pt 1, Steps 1 - 4

## This script contains the following points:

### Step 1 -  Import data and libraries
### Step 3 - Compare behavior in different geographic areas
### Step 4 - Create a dataframe excluding low-activity customers
### Export data

## Step 1 -  Import data and libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Assign project folder path to a variable
# The folder can be overridden by setting the INSTACART_PROJECT_PATH environment
# variable, so the notebook can run on a machine other than the author's.
# When the variable is not set, the original folder is used unchanged.

path = os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\jomok\Documents\Career Foundry\Achievement 4\07-2023 Instacart Basket Analysis')

In [3]:
# Import data: build the location of the merged pickle inside the project's
# "Prepared Data" folder, then load it into a dataframe

merged_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merged.pkl')
ords_prods_custs_merge = pd.read_pickle(merged_pkl_path)

In [4]:
# Check output: preview the first rows of the imported dataframe

ords_prods_custs_merge.head()

Unnamed: 0,user_id,gender,state,age,date_joined,number_of_dependents,marital_status,income,order_id,eval_set,...,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days_since_prior_order,order_frequency_flag,_merge,region,activity_flag
0,26711,Female,Missouri,48,1/1/2017,3,married,165665,2543867,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
1,26711,Female,Missouri,48,1/1/2017,3,married,165665,1285508,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
2,26711,Female,Missouri,48,1/1/2017,3,married,165665,2578584,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
3,26711,Female,Missouri,48,1/1/2017,3,married,165665,423547,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
4,26711,Female,Missouri,48,1/1/2017,3,married,165665,2524893,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer


In [5]:
# Check shape (number of rows, number of columns) of the imported dataframe

ords_prods_custs_merge.shape

(32404859, 33)

## Step 3 -  Compare behavior in different geographic areas

In [6]:
# Group the state names into one list per U.S. region; these lists are used
# below to fill the 'region' column

Northeast = [
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
    'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
]
Midwest = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
    'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
]
South = [
    'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
    'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky',
    'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas',
    'Louisiana',
]
West = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
    'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
]

In [7]:
# Label every row whose state is in the Northeast list
northeast_rows = ords_prods_custs_merge['state'].isin(Northeast)
ords_prods_custs_merge.loc[northeast_rows, 'region'] = 'Northeast'

In [8]:
# Label every row whose state is in the Midwest list
midwest_rows = ords_prods_custs_merge['state'].isin(Midwest)
ords_prods_custs_merge.loc[midwest_rows, 'region'] = 'Midwest'

In [9]:
# Label every row whose state is in the South list
south_rows = ords_prods_custs_merge['state'].isin(South)
ords_prods_custs_merge.loc[south_rows, 'region'] = 'South'

In [10]:
# Label every row whose state is in the West list
west_rows = ords_prods_custs_merge['state'].isin(West)
ords_prods_custs_merge.loc[west_rows, 'region'] = 'West'

In [11]:
# Check frequency of regions column; dropna = False means rows with a missing
# region value are counted as well, so they would be visible in the result

ords_prods_custs_merge['region'].value_counts(dropna = False)

South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: region, dtype: int64

In [12]:
# Check output: preview the first rows after the region assignment

ords_prods_custs_merge.head()

Unnamed: 0,user_id,gender,state,age,date_joined,number_of_dependents,marital_status,income,order_id,eval_set,...,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days_since_prior_order,order_frequency_flag,_merge,region,activity_flag
0,26711,Female,Missouri,48,1/1/2017,3,married,165665,2543867,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
1,26711,Female,Missouri,48,1/1/2017,3,married,165665,1285508,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
2,26711,Female,Missouri,48,1/1/2017,3,married,165665,2578584,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
3,26711,Female,Missouri,48,1/1/2017,3,married,165665,423547,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
4,26711,Female,Missouri,48,1/1/2017,3,married,165665,2524893,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer


In [13]:
# Check shape after the region assignment
# NOTE(review): the imported dataframe's preview already lists a 'region' column,
# so the assignment above presumably overwrites an existing column rather than
# adding a new one - confirm against the source file

ords_prods_custs_merge.shape

(32404859, 33)

In [14]:
# Cross-tabulate region (rows) against spending flag (columns)

crosstab = pd.crosstab(
    index = ords_prods_custs_merge['region'],
    columns = ords_prods_custs_merge['spending_flag'],
    dropna = False,
)

In [15]:
# Check output: display the region / spending flag crosstab

crosstab

spending_flag,High spender,Low spender
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,155975,7441350
Northeast,108225,5614511
South,209691,10582194
West,160354,8132559


In [16]:
# Copy the crosstab to the system clipboard so it can be pasted into Excel
# NOTE: this is a manual hand-off step; it produces no output in the notebook

crosstab.to_clipboard()

High spenders make up only about 2% of the data, compared with about 98% for low spenders.  There is a difference between the U.S. regions in how much of the data each accounts for, but each region's share is nearly the same in both spending groups, so the split between high and low spenders is similar everywhere.  The South has the highest frequency, approximately 33% of both the high and low spenders.  The Midwest and West regions are comparable, at approximately 25% each of both the high and low spenders.  The Northeast has the smallest share, approximately 17% of both the high-spender and low-spender groups.

## Step 4 - Create a dataframe excluding low-activity customers

In [17]:
# Create a flag that assigns an activity label to customers based on their
# number of orders: fewer than 5 (max_order < 5) is labelled low activity

low_activity_rows = ords_prods_custs_merge['max_order'] < 5
ords_prods_custs_merge.loc[low_activity_rows, 'activity_flag'] = 'low activity customer'

In [18]:
# Rows with max_order of 5 or more are labelled high activity
high_activity_rows = ords_prods_custs_merge['max_order'] >= 5
ords_prods_custs_merge.loc[high_activity_rows, 'activity_flag'] = 'high activity customer'

In [19]:
# Print frequency of the "activity flag" column; dropna = False also counts
# rows where no flag value is present

ords_prods_custs_merge['activity_flag'].value_counts(dropna = False)

high activity customer    30964564
low activity customer      1440295
Name: activity_flag, dtype: int64

In [20]:
# Check output: show only user_id, activity_flag and order_id for the first 60 rows

ords_prods_custs_merge[['user_id', 'activity_flag', 'order_id']].head(60)

Unnamed: 0,user_id,activity_flag,order_id
0,26711,high activity customer,2543867
1,26711,high activity customer,1285508
2,26711,high activity customer,2578584
3,26711,high activity customer,423547
4,26711,high activity customer,2524893
5,26711,high activity customer,2984525
6,26711,high activity customer,2505178
7,26711,high activity customer,1285508
8,26711,high activity customer,2578584
9,26711,high activity customer,518967


In [21]:
# Keep only the rows flagged as low activity customers

is_low_activity = ords_prods_custs_merge['activity_flag'] == 'low activity customer'
df_low_activity_customers = ords_prods_custs_merge[is_low_activity]

In [22]:
# Check output: preview the first rows of the low activity subset

df_low_activity_customers.head()

Unnamed: 0,user_id,gender,state,age,date_joined,number_of_dependents,marital_status,income,order_id,eval_set,...,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days_since_prior_order,order_frequency_flag,_merge,region,activity_flag
729,168851,Male,South Carolina,30,1/1/2017,0,single,63712,1903574,prior,...,Average orders,3,New customer,7.485714,Low spender,15.0,Regular customer,both,South,low activity customer
730,168851,Male,South Carolina,30,1/1/2017,0,single,63712,1193140,prior,...,Most orders,3,New customer,7.485714,Low spender,15.0,Regular customer,both,South,low activity customer
731,168851,Male,South Carolina,30,1/1/2017,0,single,63712,1870356,prior,...,Average orders,3,New customer,7.485714,Low spender,15.0,Regular customer,both,South,low activity customer
732,168851,Male,South Carolina,30,1/1/2017,0,single,63712,1903574,prior,...,Average orders,3,New customer,7.485714,Low spender,15.0,Regular customer,both,South,low activity customer
733,168851,Male,South Carolina,30,1/1/2017,0,single,63712,1193140,prior,...,Most orders,3,New customer,7.485714,Low spender,15.0,Regular customer,both,South,low activity customer


In [23]:
# Check shape of the low activity subset; the row count should match the
# 'low activity customer' frequency counted earlier

df_low_activity_customers.shape

(1440295, 33)

The row count (1,440,295) agrees with the frequency count for low activity customers.

In [24]:
# Write the low activity subset to a pickle file in the "Prepared Data" folder

low_activity_pkl_path = os.path.join(path, '02 Data','Prepared Data', 'low_activity_customers.pkl')
df_low_activity_customers.to_pickle(low_activity_pkl_path)

In [25]:
# Keep only the rows flagged as high activity customers

is_high_activity = ords_prods_custs_merge['activity_flag'] == 'high activity customer'
df_high_activity_customers = ords_prods_custs_merge[is_high_activity]

In [26]:
# Check output: preview the first rows of the high activity subset

df_high_activity_customers.head()

Unnamed: 0,user_id,gender,state,age,date_joined,number_of_dependents,marital_status,income,order_id,eval_set,...,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days_since_prior_order,order_frequency_flag,_merge,region,activity_flag
0,26711,Female,Missouri,48,1/1/2017,3,married,165665,2543867,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
1,26711,Female,Missouri,48,1/1/2017,3,married,165665,1285508,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
2,26711,Female,Missouri,48,1/1/2017,3,married,165665,2578584,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
3,26711,Female,Missouri,48,1/1/2017,3,married,165665,423547,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
4,26711,Female,Missouri,48,1/1/2017,3,married,165665,2524893,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer


In [27]:
# Check shape of the high activity subset; the row count should match the
# 'high activity customer' frequency counted earlier

df_high_activity_customers.shape

(30964564, 33)

The row count (30,964,564) agrees with the frequency count for high activity customers.

In [28]:
# Write the high activity subset to a pickle file in the "Prepared Data" folder

high_activity_pkl_path = os.path.join(path, '02 Data','Prepared Data', 'high_activity_customers.pkl')
df_high_activity_customers.to_pickle(high_activity_pkl_path)

## Export data

In [29]:
# Write the full dataframe (with the region and activity flag columns) to pkl
# NOTE(review): the target file name is the same one the import cell reads
# ('orders_products_customers_merged.pkl'), so running this cell overwrites the
# notebook's own input file - confirm this is intended

merged_out_pkl_path = os.path.join(path, '02 Data','Prepared Data', 'orders_products_customers_merged.pkl')
ords_prods_custs_merge.to_pickle(merged_out_pkl_path)