# 4.10. Final report (task 1-4)
#

# List of contents:
## 1. Import libraries
## 2. Import 'orders_products_customers_merged' dataset
## 3. Remove 'first_name' and 'last_name' columns to address PII data
## 4. Compare customers behavior across geographic regions
## 5. Create an exclusion flag for low-activity customers and exclude them from the data
## 6. Export dataframes
#

## 1. Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

## 2. Import 'orders_products_customers_merged' dataset

In [2]:
# Create path variable
path = r'C:\Users\marta\OneDrive\Documents\2023-09-18 Instacart Basket Analysis'

In [3]:
# Import dataset
ords_prods_customers = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merged.pkl'))

In [4]:
# Apply function to display all columns wihtin the dataframe
pd.set_option('display.max_columns', None)

In [5]:
# Check the output
ords_prods_customers.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_order_day,order_freq_flag,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [6]:
# Check the dimensions
ords_prods_customers.shape

(32399732, 32)

## 3. Remove 'first_name' and 'last_name' columns to address PII data

In [7]:
# Drop 'first_name' and 'last_name' columns
ords_prods_customers = ords_prods_customers.drop(columns = ['first_name', 'last_name'])

In [8]:
# Check the result
ords_prods_customers.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_order_day,order_freq_flag,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423


## 4. Compare customers behavior across geographic regions

### -- Create 'regions' column

In [9]:
# Check frequency of values within 'state' column
ords_prods_customers['state'].value_counts(dropna = False)

state
Pennsylvania            667007
California              659695
Rhode Island            656777
Georgia                 656249
New Mexico              654400
Arizona                 653864
North Carolina          651790
Oklahoma                651661
Alaska                  648451
Minnesota               647738
Massachusetts           646275
Wyoming                 644191
Virginia                641280
Missouri                640576
Texas                   640285
Colorado                639173
Maine                   638479
North Dakota            638391
Alabama                 637863
Kansas                  637418
Louisiana               637414
Delaware                636906
South Carolina          636677
Oregon                  636332
Arkansas                636070
Nevada                  636034
New York                635912
Montana                 635181
South Dakota            633649
Illinois                632928
Hawaii                  632786
Washington              632722
Mi

#### All states will be divided by regions according to Wikipedia list: https://simple.wikipedia.org/wiki/List_of_regions_of_the_United_States 

In [10]:
# Create 'regions' list to assign a region for each state
regions = []

for value in ords_prods_customers['state']:
  if value in ['Maine','New Hampshire','Vermont','Massachusetts','Rhode Island','Connecticut','New York','Pennsylvania','New Jersey']:
    regions.append('Northeast')
  elif value in ['Wisconsin','Michigan','Illinois','Indiana','Ohio','North Dakota','South Dakota','Nebraska','Kansas','Minnesota','Iowa','Missouri']:
    regions.append('Midwest')
  elif value in ['Idaho','Montana','Wyoming','Nevada','Utah','Colorado','Arizona','New Mexico']:
    regions.append('West')
  else:
    regions.append('South')

In [11]:
# Create 'regions' column within dataframe
ords_prods_customers['regions'] = regions

In [12]:
# Check the output
ords_prods_customers.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_order_day,order_freq_flag,gender,state,age,date_joined,n_dependants,fam_status,income,regions
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South


In [13]:
# Check frequency of values within 'regions' column
ords_prods_customers['regions'].value_counts()

regions
South        14000082
Midwest       7596065
Northeast     5721892
West          5081693
Name: count, dtype: int64

### -- Compare customer spendings across regions

In [14]:
# Create crosstab between 'regions' and 'spending_flag' columns
crosstab_reg_spend = pd.crosstab(ords_prods_customers['regions'], ords_prods_customers['spending_flag'], dropna = False)

In [15]:
# Check the output
crosstab_reg_spend

spending_flag,High spender,Low spender
regions,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,29265,7566800
Northeast,18639,5703253
South,52506,13947576
West,19309,5062384


#### All regions share the same tendency of having more low spenders than high spenders. South region has the highest number of high- and low-spending customers, each group has 43% of total.
#### Northeast and West regions have approximately 16% of high spenders each. West has the lowest percentage of low spenders, which is 15,6%.

## 5. Create an exclusion flag for low-activity customers and exclude them from the data

### -- Create exclusion 'user_activity_flag'

In [17]:
 # Set first condition (low-activity customers if orders number <5)0
ords_prods_customers.loc[ords_prods_customers['max_order'] < 5, 'user_activity_flag'] = 'Low activity'

In [18]:
 # Set second condition (regular customers if orders number >=5)
ords_prods_customers.loc[ords_prods_customers['max_order'] >= 5, 'user_activity_flag'] = 'Normal activity'

In [19]:
# Check the output
ords_prods_customers['user_activity_flag'].value_counts(dropna = False)

user_activity_flag
Normal activity    30959687
Low activity        1440045
Name: count, dtype: int64

In [20]:
# Check the output
ords_prods_customers

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_order_day,order_freq_flag,gender,state,age,date_joined,n_dependants,fam_status,income,regions,user_activity_flag
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,South,Normal activity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32399727,156685,106143,26,4,23,5.0,19675,1,1,Organic Raspberry Black Tea,94,7,10.7,Mid-range product,Least busy,Least busy,Fewest orders,26,Regular customer,10.700000,High spender,7.0,Frequent customer,Male,Hawaii,25,5/26/2017,0,single,53755,South,Normal activity
32399728,484769,66343,1,6,11,,47210,1,0,Fresh Farmed Tilapia Fillet,15,12,8.1,Mid-range product,Regularly busy,Regularly busy,Most orders,4,New customer,8.100000,Low spender,30.0,Non-frequent customer,Female,Tennessee,22,9/12/2017,3,married,46151,South,Low activity
32399729,1561557,66343,2,1,11,30.0,47210,1,1,Fresh Farmed Tilapia Fillet,15,12,8.1,Mid-range product,Regularly busy,Busiest day,Most orders,4,New customer,8.100000,Low spender,30.0,Non-frequent customer,Female,Tennessee,22,9/12/2017,3,married,46151,South,Low activity
32399730,276317,66343,3,6,15,19.0,47210,1,1,Fresh Farmed Tilapia Fillet,15,12,8.1,Mid-range product,Regularly busy,Regularly busy,Most orders,4,New customer,8.100000,Low spender,30.0,Non-frequent customer,Female,Tennessee,22,9/12/2017,3,married,46151,South,Low activity


### -- Remove low-activity customers from dataframe

In [21]:
# Create subset df with only low-activity customers
df_low_activity = ords_prods_customers[ords_prods_customers['user_activity_flag'] == 'Low activity']

In [22]:
# Check the output
df_low_activity['user_activity_flag'].value_counts()

user_activity_flag
Low activity    1440045
Name: count, dtype: int64

In [23]:
# Create subset df with only normal-activity customers
df_normal_activity = ords_prods_customers[ords_prods_customers['user_activity_flag'] == 'Normal activity']

In [24]:
# Check the output
df_normal_activity['user_activity_flag'].value_counts()

user_activity_flag
Normal activity    30959687
Name: count, dtype: int64

In [25]:
# Check the total length of new dataframes
len(df_normal_activity) + len(df_low_activity)

32399732

## 6. Export dataframes

In [26]:
# Export ords_prods_customers dataframe in pkl format
ords_prods_customers.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_final.pkl'))

In [27]:
# Export df_normal_activity dataframe in pkl format
df_normal_activity.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'subset_customers_norm_activity.pkl'))

In [28]:
# Export df_normal_activity dataframe in pkl format
df_low_activity.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'subset_customers_low_activity.pkl'))