# 4.10. Customer Profiling and Data Visualisations

Contents
1.  Importing Libraries
2.  Import Dataframe
3.  Checking Imported Dataframes
4.  Security Implications
5.  Creating new regions column (grouping)
6.  Creating an exclusion flag
7.  Create customer profiles
8.  Using the loc() function to create the profiles individually
9.  Visualising distribution of profiles
10. Descriptive statistics for customer profiles
11. Crosstab of customer profiles and department_id
12. Bar chart showing sum of orders per customer profile
13. Bar chart to show sum of orders per department
14. Export Final Dataframe

# 01. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Import Dataframe

In [None]:
# create path
# Root folder of the project; every read and write in this notebook is built from it with os.path.join.
# The location is an absolute path on one machine, so it has to be adjusted when running elsewhere.
# NOTE(review): the folder name contains "asket" - possibly a typo for "Basket"; confirm against the real folder before changing.
path =r'C:\Users\mngun\Documents\11_2023_InstaCart_asket_Analysis'

In [None]:
# load the pickled ords_prods_cust dataframe from the project's prepared-data folder
prepared_file = os.path.join(path, '02_Data', 'Prepared Data', 'ords_prods_cust.pkl')
ords_prods_cust = pd.read_pickle(prepared_file)

# 03. Checking imported dataframe

In [None]:
# Checking dataframe shape
# .shape is a (rows, columns) tuple; it is displayed as the cell output
ords_prods_cust.shape

In [None]:
# Checking dataframe columns by previewing the first rows of the dataframe
ords_prods_cust.head()

In [None]:
# list every column name as a plain Python list
ords_prods_cust.columns.to_list()

# 04. Security Implications

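- The dataframe does not contain any information that could be directly linked to an individual, since it holds no names or addresses. It does, however, contain per-customer attributes that are used later in this notebook (state, age, income and number of dependants), so it should still be handled as sensitive data.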

# 05. Creating a new 'regions' column from state groupings

In [None]:
# using .loc to give every row whose state is in the Northeast the region 'Northeast'
# isin() compares whole strings, so each state name has to be spelled exactly as in the data
# ('Pennsylvania' - a misspelled name silently leaves that state's rows without a region).
northeast_states = [
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
    'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
]
ords_prods_cust.loc[ords_prods_cust['state'].isin(northeast_states), 'region'] = 'Northeast'

In [None]:
# give every row whose state is in the Midwest the region 'Midwest'
midwest_states = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
    'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
]
in_midwest = ords_prods_cust['state'].isin(midwest_states)
ords_prods_cust.loc[in_midwest, 'region'] = 'Midwest'

In [None]:
# give every row whose state is in the South the region 'South'
south_states = [
    'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
    'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky',
    'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas',
    'Louisiana',
]
in_south = ords_prods_cust['state'].isin(south_states)
ords_prods_cust.loc[in_south, 'region'] = 'South'


In [None]:
# give every row whose state is in the West the region 'West'
west_states = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
    'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
]
in_west = ords_prods_cust['state'].isin(west_states)
ords_prods_cust.loc[in_west, 'region'] = 'West'

In [None]:
# checking shape after new column creation
# the row count should be unchanged and the column count one higher than before 'region' was added
ords_prods_cust.shape

In [None]:
# confirming results by previewing the last rows, which now include the 'region' column
ords_prods_cust.tail()

In [None]:
# cross-tabulate spending_flag (rows) against region (columns);
# dropna=False asks pandas not to drop categories whose entries are all NaN
crosstab = pd.crosstab(
    index=ords_prods_cust['spending_flag'],
    columns=ords_prods_cust['region'],
    dropna=False,
)

In [None]:
# display the spending_flag by region crosstab
crosstab

# 06. Creating exclusion flag

In [None]:
# highest order_number reached by each user_id, repeated on every row of that user
orders_by_user = ords_prods_cust.groupby('user_id')['order_number']
ords_prods_cust['max_order_nr'] = orders_by_user.transform('max')

In [None]:
# True/False indicator: True when the user's highest order number is below 5 (row is to be excluded)
ords_prods_cust['exclusion_flag'] = ords_prods_cust['max_order_nr'].lt(5)

In [None]:
# confirm success of creating columns and exclusion flag by previewing the first rows
ords_prods_cust.head()

In [None]:
# max_order_nr was only a helper for the flag, so remove it again from ords_prods_cust
ords_prods_cust = ords_prods_cust.drop(columns=['max_order_nr'])

In [None]:
# keep only the rows whose exclusion_flag is False in a new dataframe
keep_row = ~ords_prods_cust['exclusion_flag']
frequent_customers = ords_prods_cust[keep_row]

In [None]:
# (rows, columns) of the filtered dataframe
frequent_customers.shape

# 07. Create customer profiles

In [None]:
# list the available columns before deciding which ones to drop
frequent_customers.columns

In [None]:
# free memory by removing three columns that are not used in the rest of the notebook
unneeded_columns = ['average_price', '_merge', 'exclusion_flag']
frequent_customers = frequent_customers.drop(unneeded_columns, axis=1)

# 08. Using the loc() function to create the profiles individually

a) young profiles

In [None]:
# young high-income parent: age below 39, at least one dependant, income above 200000
# NOTE(review): the age rules in this notebook are strict (<39, >39 and <60, >60), so customers
# aged exactly 39 or 60 match none of the profile rules - confirm this is intended.
is_young = frequent_customers['age'].lt(39)
has_dependants = frequent_customers['n_dependants'].gt(0)
high_income = frequent_customers['income'].gt(200000)
frequent_customers.loc[is_young & has_dependants & high_income, 'cust_profile'] = 'young high-income parent'

In [None]:
# young low-income parent: age below 39, at least one dependant, income below 80000
is_young = frequent_customers['age'].lt(39)
has_dependants = frequent_customers['n_dependants'].gt(0)
low_income = frequent_customers['income'].lt(80000)
frequent_customers.loc[is_young & has_dependants & low_income, 'cust_profile'] = 'young low-income parent'

In [None]:
# young middle-income adult: age below 39, no dependants, income below 200000
is_young = frequent_customers['age'].lt(39)
no_dependants = frequent_customers['n_dependants'].eq(0)
below_high_income = frequent_customers['income'].lt(200000)
frequent_customers.loc[is_young & no_dependants & below_high_income, 'cust_profile'] = 'young middle-income adult'

In [None]:
# young high-income earner: age below 39, no dependants, income above 200000
# NOTE(review): income is tested with strict < 200000 and > 200000 throughout, so an income of
# exactly 200000 matches neither side - confirm this is intended.
is_young = frequent_customers['age'].lt(39)
no_dependants = frequent_customers['n_dependants'].eq(0)
high_income = frequent_customers['income'].gt(200000)
frequent_customers.loc[is_young & no_dependants & high_income, 'cust_profile'] = 'young high-income earner'

b) senior profiles

In [None]:
# high-income senior parent: age above 60, at least one dependant, income above 200000
# NOTE(review): seniors start at age > 60 while the middle-aged rules stop at age < 60, so
# customers aged exactly 60 match no profile rule - confirm this is intended.
is_senior = frequent_customers['age'].gt(60)
has_dependants = frequent_customers['n_dependants'].gt(0)
high_income = frequent_customers['income'].gt(200000)
frequent_customers.loc[is_senior & has_dependants & high_income, 'cust_profile'] = 'high-income senior parent'

In [None]:
# high-income senior: age above 60, no dependants, income above 200000
is_senior = frequent_customers['age'].gt(60)
no_dependants = frequent_customers['n_dependants'].eq(0)
high_income = frequent_customers['income'].gt(200000)
frequent_customers.loc[is_senior & no_dependants & high_income, 'cust_profile'] = 'high-income senior'

In [None]:
# middle-income senior: age above 60, no dependants, income below 100000
is_senior = frequent_customers['age'].gt(60)
no_dependants = frequent_customers['n_dependants'].eq(0)
income_under_100k = frequent_customers['income'].lt(100000)
frequent_customers.loc[is_senior & no_dependants & income_under_100k, 'cust_profile'] = 'middle-income senior'

c) middle-aged profiles

In [None]:
# high-income middle-age parent: age strictly between 39 and 60, at least one dependant, income above 200000
# NOTE(review): both age bounds are exclusive, so ages 39 and 60 are not covered by this bracket.
is_middle_aged = frequent_customers['age'].gt(39) & frequent_customers['age'].lt(60)
has_dependants = frequent_customers['n_dependants'].gt(0)
high_income = frequent_customers['income'].gt(200000)
frequent_customers.loc[is_middle_aged & has_dependants & high_income, 'cust_profile'] = 'high-income middle-age parent'

In [None]:
# high-income middle-age adult: age strictly between 39 and 60, no dependants, income above 200000
is_middle_aged = frequent_customers['age'].gt(39) & frequent_customers['age'].lt(60)
no_dependants = frequent_customers['n_dependants'].eq(0)
high_income = frequent_customers['income'].gt(200000)
frequent_customers.loc[is_middle_aged & no_dependants & high_income, 'cust_profile'] = 'high-income middle-age adult'

In [None]:
# middle-income middle-aged parent: age strictly between 39 and 60, at least one dependant, income below 200000
is_middle_aged = frequent_customers['age'].gt(39) & frequent_customers['age'].lt(60)
has_dependants = frequent_customers['n_dependants'].gt(0)
below_high_income = frequent_customers['income'].lt(200000)
frequent_customers.loc[is_middle_aged & has_dependants & below_high_income, 'cust_profile'] = 'middle-income middle-aged parent'

In [None]:
# middle-income middle-aged adult: age strictly between 39 and 60, no dependants, income below 200000
is_middle_aged = frequent_customers['age'].gt(39) & frequent_customers['age'].lt(60)
no_dependants = frequent_customers['n_dependants'].eq(0)
below_high_income = frequent_customers['income'].lt(200000)
frequent_customers.loc[is_middle_aged & no_dependants & below_high_income, 'cust_profile'] = 'middle-income middle-aged adult'

In [None]:
# low-income middle-aged adult: age strictly between 39 and 60, no dependants, income below 100000
# These rows also satisfy the income < 200000 rule for 'middle-income middle-aged adult', so this
# assignment has to run after that one to relabel the lower-income subset.
is_middle_aged = frequent_customers['age'].gt(39) & frequent_customers['age'].lt(60)
no_dependants = frequent_customers['n_dependants'].eq(0)
income_under_100k = frequent_customers['income'].lt(100000)
frequent_customers.loc[is_middle_aged & no_dependants & income_under_100k, 'cust_profile'] = 'low-income middle-aged adult'

In [None]:
# count values in column
# dropna=False keeps the NaN bucket, i.e. the rows that matched none of the profile rules
frequent_customers['cust_profile'].value_counts(dropna= False)

In [None]:
# replace NaN in cust_profile with 'other'
# Assign the filled column back instead of calling fillna(inplace=True) on the column selection:
# the in-place form is chained assignment, which pandas has deprecated and which leaves the
# dataframe unchanged once Copy-on-Write is active.
frequent_customers['cust_profile'] = frequent_customers['cust_profile'].fillna('other')

In [None]:
# confirm NaN values have been replaced
# with dropna=False a remaining NaN bucket would still be listed, so its absence confirms the fill
frequent_customers['cust_profile'].value_counts(dropna= False)

In [None]:
# number of distinct user_id values within each customer profile
users_by_profile = frequent_customers.groupby('cust_profile')['user_id']
profile_counts = users_by_profile.nunique(dropna=False)

In [None]:
# display the distinct-customer count per profile
profile_counts

# 09. Visualising distribution of profiles

In [None]:
# bar chart of the number of distinct customers in each profile
bar_profiles = profile_counts.plot.bar(color='skyblue', edgecolor='black')
# title and axis labels, set directly on the Axes returned by the plot call
bar_profiles.set_title('Frequency of Customer Profiles')
bar_profiles.set_xlabel('Customer Profile')
bar_profiles.set_ylabel('Number')

In [None]:
# write the customer-profile bar chart to a png in the Visualisations folder
profiles_png = os.path.join(path, '04_Analysis', 'Visualisations', 'bar_customer_profiles.png')
bar_profiles.figure.savefig(profiles_png, bbox_inches='tight')

# 10. Descriptive statistics for customer profiles

In [None]:
# max, mean and min of order_number and of prices for each customer profile
summary_stats = ['max', 'mean', 'min']
agg_results = (
    frequent_customers
    .groupby('cust_profile')
    .agg({'order_number': summary_stats, 'prices': summary_stats})
    .reset_index()
)

In [None]:
# results of descriptive statistics (one row per customer profile)
agg_results

In [None]:
# count rows per customer profile (rows) and region (columns); NaN values are left out by default
crosstab_profile = pd.crosstab(
    index=frequent_customers['cust_profile'],
    columns=frequent_customers['region'],
)

In [None]:
# show crosstab of customer profile by region
crosstab_profile

In [None]:
# bar chart of the profile-by-region crosstab, one bar per region within each profile
# NOTE(review): stacked=False draws the regions side by side, although the variable name and the
# exported file name both say "stacked" - confirm which layout is intended.
stacked_chart = crosstab_profile.plot.bar(stacked=False, colormap='viridis', edgecolor='black')
# title and axis labels, set directly on the Axes returned by the plot call
stacked_chart.set_title('Orders per Customer Profile by Region')
stacked_chart.set_xlabel('Customer Profile')
stacked_chart.set_ylabel('Orders')

In [None]:
# write the profile-by-region chart to a png in the Visualisations folder
regions_png = os.path.join(path, '04_Analysis', 'Visualisations', 'stacked_chart_customers_regions.png')
stacked_chart.figure.savefig(regions_png, bbox_inches='tight')

# 11. Crosstab of customer profiles and department_id

In [None]:
# count rows per customer profile (rows) and department_id (columns)
crosstab_profile_2 = pd.crosstab(
    index=frequent_customers['cust_profile'],
    columns=frequent_customers['department_id'],
)

In [None]:
# most common department id for each customer profile
def _most_common(values):
    """Return the first modal value of *values*, or None when there is no mode."""
    # compute the mode once per group instead of once for the emptiness test and once for the lookup
    modes = values.mode()
    return None if modes.empty else modes.iat[0]


frequent_customers.groupby('cust_profile')['department_id'].agg(_most_common)

# 12. Bar chart showing sum of orders per customer profile

In [None]:
# add up the order_number values within each customer profile
order_numbers_by_profile = frequent_customers.groupby('cust_profile')['order_number']
bar_order_customer_profile = order_numbers_by_profile.sum()

In [None]:
# order the per-profile totals from largest to smallest
# NOTE: the plotting cell that follows rebinds bar_ords_cust_sorted to the chart's Axes object
bar_ords_cust_sorted= bar_order_customer_profile.sort_values(ascending=False)

In [None]:
# plotting the bar chart with the profiles ordered from largest to smallest total
# The sort is applied here, on the series being plotted, so the bars are guaranteed to be ordered;
# plotting bar_order_customer_profile directly would draw them in unsorted group order.
# bar_ords_cust_sorted holds the resulting Axes, which the export cell saves via .figure.
bar_ords_cust_sorted = (
    bar_order_customer_profile
    .sort_values(ascending=False)
    .plot.bar(color='darkblue', edgecolor='black')
)
# creating a title and legends
plt.title('Sum of Orders per Customer Profile')
plt.xlabel('Customer Profile')
plt.ylabel('Orders')

In [None]:
# write the orders-per-profile chart to a png in the Visualisations folder
orders_profile_png = os.path.join(path, '04_Analysis', 'Visualisations', 'bar_order_customer_profile.png')
bar_ords_cust_sorted.figure.savefig(orders_profile_png, bbox_inches='tight')

# 13. Bar chart to show sum of orders per department

In [None]:
# add up the order_number values within each department_id
order_numbers_by_department = frequent_customers.groupby('department_id')['order_number']
orders_sum_depart = order_numbers_by_department.sum()

In [None]:
# sorting the results in order of magnitude
# descending, so the department with the largest total comes first in the chart
orders_sum_depart= orders_sum_depart.sort_values(ascending=False)

In [None]:
# bar chart of the per-department totals
bar_orders_department = orders_sum_depart.plot.bar(color='red', edgecolor='black')
# title and axis labels, set directly on the Axes returned by the plot call
bar_orders_department.set_title('Sum of Orders per Department')
bar_orders_department.set_xlabel('Department ID')
bar_orders_department.set_ylabel('Orders')

In [None]:
# export visualisation
# bbox_inches='tight' matches the other chart exports in this notebook and keeps the tick and
# axis labels from being clipped at the edge of the saved image
bar_orders_department.figure.savefig(
    os.path.join(path, '04_Analysis', 'Visualisations', 'bar_orders_department.png'),
    bbox_inches='tight',
)

# 14. Export Final Dataframe

In [None]:
# export the dataframe of customers whose highest order number is 5 or more
# (rows with a highest order number below 5 were filtered out via the exclusion flag)
export_file = os.path.join(path, '02_Data', 'Prepared Data', 'order_products_all.pkl')
frequent_customers.to_pickle(export_file)