# 4.10 Coding Etiquette & Excel Reporting - Pt 1, Step 5

## This script contains the following points:

### 1. Import data and libraries
### 2. Create sample set of dataframe
### 3. Assign departments to department id
### 4. Create age groups
### 5. Create income groups
### 6. Create parent groups
### 7. Create customer profiles
### 8. Assign hourly groups
### 9. Assign days
### 10. Export sample set of dataframe

## 1. Import data and libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Assign project folder path to a variable
# The INSTACART_PROJECT_PATH environment variable, when set, overrides the
# default location so the notebook can run on machines other than the author's

path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\jomok\Documents\Career Foundry\Achievement 4\07-2023 Instacart Basket Analysis',
)

In [3]:
# Import data
# Build the location of the pickled dataframe first, then read it into df_final

merged_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merged.pkl')
df_final = pd.read_pickle(merged_pickle_file)

In [4]:
# Check output

df_final.head()

Unnamed: 0,user_id,gender,state,age,date_joined,number_of_dependents,marital_status,income,order_id,eval_set,...,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days_since_prior_order,order_frequency_flag,_merge,region,activity_flag
0,26711,Female,Missouri,48,1/1/2017,3,married,165665,2543867,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
1,26711,Female,Missouri,48,1/1/2017,3,married,165665,1285508,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
2,26711,Female,Missouri,48,1/1/2017,3,married,165665,2578584,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
3,26711,Female,Missouri,48,1/1/2017,3,married,165665,423547,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer
4,26711,Female,Missouri,48,1/1/2017,3,married,165665,2524893,prior,...,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both,Midwest,high activity customer


In [5]:
# Check shape

df_final.shape

(32404859, 33)

## 2. Create sample set of dataframe

In [6]:
# Create a boolean array that is True where np.random.rand() <= 0.7
# Seed the generator first so the same rows are selected on every run,
# which keeps the 70/30 split (and everything derived from it) reproducible

np.random.seed(4)
dev = np.random.rand(len(df_final)) <= 0.7

In [7]:
# Store the rows flagged True in dev (roughly 70% of df_final) in the dataframe big

big = df_final[dev]

In [8]:
# Store the remaining rows, flagged False in dev (roughly 30% of df_final), in the dataframe small

small = df_final[~dev]

In [9]:
# Add both samples together to check results

len(df_final)

32404859

In [10]:
len(big) + len(small)

32404859

Confirmed: the number of rows in both samples added together equals the number of rows in the original dataframe.

In [11]:
# List all columns in df_final to identify those relevant to the analysis

df_final.columns

Index(['user_id', 'gender', 'state', 'age', 'date_joined',
       'number_of_dependents', 'marital_status', 'income', 'order_id',
       'eval_set', 'order_number', 'order_day_of_week', 'order_hour_of_day',
       'days_since_prior_order', 'product_id', 'add_to_cart_order',
       'reordered', 'product_name', 'aisle_id', 'department_id', 'prices',
       'price_range_loc', 'busiest_day', 'busiest_period_of_day', 'max_order',
       'loyalty_flag', 'avg_price', 'spending_flag',
       'median_days_since_prior_order', 'order_frequency_flag', '_merge',
       'region', 'activity_flag'],
      dtype='object')

In [12]:
# Reduce the sample held in small to only the columns relevant to the analysis
# .copy() makes the subset an independent dataframe, so the columns added in the
# later sections are written to it directly instead of to a slice of small
# (which is what triggers pandas' SettingWithCopyWarning)

analysis_columns = [
    'gender',
    'state',
    'age',
    'number_of_dependents',
    'marital_status',
    'income',
    'order_number',
    'loyalty_flag',
    'spending_flag',
    'order_frequency_flag',
    'activity_flag',
    'order_day_of_week',
    'order_hour_of_day',
    'department_id',
    'prices',
    'price_range_loc',
    'busiest_day',
    'busiest_period_of_day',
    'region',
]

df_final_subset = small[analysis_columns].copy()

In [13]:
# Check output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,activity_flag,order_day_of_week,order_hour_of_day,department_id,prices,price_range_loc,busiest_day,busiest_period_of_day,region
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,high activity customer,5,15,7,9.0,Mid-range product,Regularly busy,Most orders,Midwest
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,high activity customer,5,15,16,12.6,Mid-range product,Regularly busy,Most orders,Midwest
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,high activity customer,2,9,4,4.3,Low-range product,Regularly busy,Most orders,Midwest
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,high activity customer,1,8,4,4.3,Low-range product,Busiest days,Average orders,Midwest
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,high activity customer,5,15,4,4.3,Low-range product,Regularly busy,Most orders,Midwest


In [14]:
# Check shape

df_final_subset.shape

(9717028, 19)

## 3. Assign departments to department id

In [15]:
# Import departments data
# The first column of the CSV is used as the dataframe index

departments_file = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_depts = pd.read_csv(departments_file, index_col=0)

In [16]:
# Check output

df_depts.head()

Unnamed: 0_level_0,department
department_id,Unnamed: 1_level_1
1,frozen
2,other
3,bakery
4,produce
5,alcohol


In [17]:
# Create data dictionary for values in the "department_id" column
# orient 'index' keys the dictionary by df_depts' index (department_id); each
# value is a nested dict of the form {'department': <department name>}

data_dict = df_depts.to_dict('index')

In [18]:
data_dict

{1: {'department': 'frozen'},
 2: {'department': 'other'},
 3: {'department': 'bakery'},
 4: {'department': 'produce'},
 5: {'department': 'alcohol'},
 6: {'department': 'international'},
 7: {'department': 'beverages'},
 8: {'department': 'pets'},
 9: {'department': 'dry goods pasta'},
 10: {'department': 'bulk'},
 11: {'department': 'personal care'},
 12: {'department': 'meat seafood'},
 13: {'department': 'pantry'},
 14: {'department': 'breakfast'},
 15: {'department': 'canned goods'},
 16: {'department': 'dairy eggs'},
 17: {'department': 'household'},
 18: {'department': 'babies'},
 19: {'department': 'snacks'},
 20: {'department': 'deli'},
 21: {'department': 'missing'}}

In [19]:
# Assign department names to department_id values using the data dictionary
# data_dict has the form {department_id: {'department': name}}, so flatten it to
# {department_id: name} and map it onto the department_id column in one step.
# Taking the names from data_dict keeps them in sync with the departments file.
# Any department_id that is not in the dictionary is left as NaN.

department_names = {dept_id: entry['department'] for dept_id, entry in data_dict.items()}

df_final_subset['department'] = df_final_subset['department_id'].map(department_names)

In [40]:
# Check frequency 

df_final_subset['department'].value_counts(dropna=False)

produce            2842243
dairy eggs         1619517
snacks              865197
beverages           806386
frozen              669315
pantry              562705
bakery              352566
canned goods        320262
deli                314647
dry goods pasta     259397
household           221359
meat seafood        212725
breakfast           211067
personal care       133735
babies              127467
international        80893
alcohol              46321
pets                 29211
missing              20794
other                10775
bulk                 10446
Name: department, dtype: int64

In [41]:
# Check the shape 

df_final_subset['department'].shape

(9717028,)

In [42]:
# Check output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,activity_flag,order_day_of_week,order_hour_of_day,department_id,prices,price_range_loc,busiest_day,busiest_period_of_day,region,department
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,high activity customer,5,15,7,9.0,Mid-range product,Regularly busy,Most orders,Midwest,beverages
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,high activity customer,5,15,16,12.6,Mid-range product,Regularly busy,Most orders,Midwest,dairy eggs
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,high activity customer,2,9,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,high activity customer,1,8,4,4.3,Low-range product,Busiest days,Average orders,Midwest,produce
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,high activity customer,5,15,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce


## 4. Create age groups

In [43]:
# Assign age groups
# Each rule pairs a boolean mask over the age column with the label that is
# written to age_group for the matching rows

customer_age = df_final_subset['age']

age_group_rules = [
    ((customer_age <= 16) & (customer_age >= 0), 'Child'),
    ((customer_age <= 30) & (customer_age >= 17), 'Young Adults'),
    ((customer_age <= 45) & (customer_age >= 31), 'Middle Aged Adults'),
    (customer_age > 45, 'Old Adults'),
]

for age_mask, age_label in age_group_rules:
    df_final_subset.loc[age_mask, 'age_group'] = age_label

In [47]:
# Check frequency 

df_final_subset['age_group'].value_counts(dropna=False)

Old Adults            5460728
Middle Aged Adults    2278526
Young Adults          1977774
Name: age_group, dtype: int64

In [48]:
# Check the shape 

df_final_subset['age_group'].shape

(9717028,)

In [49]:
# Check output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,...,order_day_of_week,order_hour_of_day,department_id,prices,price_range_loc,busiest_day,busiest_period_of_day,region,department,age_group
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,5,15,7,9.0,Mid-range product,Regularly busy,Most orders,Midwest,beverages,Old Adults
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,5,15,16,12.6,Mid-range product,Regularly busy,Most orders,Midwest,dairy eggs,Old Adults
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,...,2,9,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,...,1,8,4,4.3,Low-range product,Busiest days,Average orders,Midwest,produce,Old Adults
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,5,15,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults


## 5. Create income groups

In [50]:
# Assign income groups
# Below 43350 is low income, 43350 through 130000 is middle income,
# and above 130000 is upper income

customer_income = df_final_subset['income']

income_group_rules = {
    'Low Income': customer_income < 43350,
    'Middle Income': (customer_income <= 130000) & (customer_income >= 43350),
    'Upper Income': customer_income > 130000,
}

for income_label, income_mask in income_group_rules.items():
    df_final_subset.loc[income_mask, 'income_group'] = income_label

The income groups are a rough estimate based on U.S. Census Bureau 2021 median household income groups, and do not account for family size or location.

In [53]:
# Check frequency 

df_final_subset['income_group'].value_counts(dropna=False)

Middle Income    6705294
Upper Income     2314596
Low Income        697138
Name: income_group, dtype: int64

In [54]:
# Check the shape 

df_final_subset['income_group'].shape

(9717028,)

In [55]:
# Check output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,...,order_hour_of_day,department_id,prices,price_range_loc,busiest_day,busiest_period_of_day,region,department,age_group,income_group
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,15,7,9.0,Mid-range product,Regularly busy,Most orders,Midwest,beverages,Old Adults,Upper Income
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,15,16,12.6,Mid-range product,Regularly busy,Most orders,Midwest,dairy eggs,Old Adults,Upper Income
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,...,9,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,...,8,4,4.3,Low-range product,Busiest days,Average orders,Midwest,produce,Old Adults,Upper Income
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,15,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income


## 6. Create parent groups

In [56]:
# Assign dependent groups
# Customers with at least one dependent are labelled 'Parent';
# customers with zero dependents are labelled 'No Children'

dependents_count = df_final_subset['number_of_dependents']

dependents_group_rules = [
    (dependents_count > 0, 'Parent'),
    (dependents_count == 0, 'No Children'),
]

for dependents_mask, dependents_label in dependents_group_rules:
    df_final_subset.loc[dependents_mask, 'dependents_group'] = dependents_label

In [58]:
# Check frequency 

df_final_subset['dependents_group'].value_counts(dropna=False)

Parent         7289650
No Children    2427378
Name: dependents_group, dtype: int64

In [59]:
# Check the shape 

df_final_subset['dependents_group'].shape

(9717028,)

In [60]:
# Check output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,...,department_id,prices,price_range_loc,busiest_day,busiest_period_of_day,region,department,age_group,income_group,dependents_group
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,7,9.0,Mid-range product,Regularly busy,Most orders,Midwest,beverages,Old Adults,Upper Income,Parent
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,16,12.6,Mid-range product,Regularly busy,Most orders,Midwest,dairy eggs,Old Adults,Upper Income,Parent
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,...,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,...,4,4.3,Low-range product,Busiest days,Average orders,Midwest,produce,Old Adults,Upper Income,Parent
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,4,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent


## 7. Create customer profiles

In [61]:
# Assign customer profile group
# Profiles combine marital status, gender, age and number of dependents.
# The building-block masks are computed once and then combined per profile.
# NOTE(review): 'Married young parent' uses age < 30, whereas the 'Young Adults'
# age_group runs through age 30 - confirm which boundary is intended.

is_married = df_final_subset['marital_status'] == 'married'
not_married = df_final_subset['marital_status'] != 'married'
is_female = df_final_subset['gender'] == 'Female'
is_male = df_final_subset['gender'] == 'Male'
has_dependents = df_final_subset['number_of_dependents'] > 0
no_dependents = df_final_subset['number_of_dependents'] == 0
under_30 = df_final_subset['age'] < 30
aged_30_plus = df_final_subset['age'] >= 30

customer_profile_rules = [
    (not_married & has_dependents, 'Single parent'),
    (not_married & is_female & no_dependents, 'Single female without children'),
    (not_married & is_male & no_dependents, 'Single male without children'),
    (under_30 & is_married & has_dependents, 'Married young parent'),
    (aged_30_plus & is_married & has_dependents, 'Married older parent'),
    (is_married & no_dependents, 'Married without children'),
]

for profile_mask, profile_label in customer_profile_rules:
    df_final_subset.loc[profile_mask, 'customer_profile'] = profile_label

In [67]:
# Check frequency 

df_final_subset['customer_profile'].value_counts(dropna=False)

Married older parent              5917638
Single male without children      1217809
Single female without children    1209569
Married young parent               907443
Single parent                      464569
Name: customer_profile, dtype: int64

In [68]:
# Check shape 

df_final_subset['customer_profile'].shape

(9717028,)

In [69]:
# Check df output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,...,prices,price_range_loc,busiest_day,busiest_period_of_day,region,department,age_group,income_group,dependents_group,customer_profile
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,9.0,Mid-range product,Regularly busy,Most orders,Midwest,beverages,Old Adults,Upper Income,Parent,Married older parent
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,12.6,Mid-range product,Regularly busy,Most orders,Midwest,dairy eggs,Old Adults,Upper Income,Parent,Married older parent
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,...,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,...,4.3,Low-range product,Busiest days,Average orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,4.3,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent


In [70]:
# Assign customer income profile group
# Each label is the customer's income level followed by their customer_profile
# in lower case (e.g. 'Upper income married older parent'). It is therefore
# derived from the existing income_group and customer_profile columns instead of
# repeating every marital status / gender / age / dependents rule once per
# income level, so the two profile columns cannot drift apart.
# Rows where either source column is missing are left as NaN.

income_level_prefix = df_final_subset['income_group'].map({
    'Low Income': 'Low income',
    'Middle Income': 'Middle income',
    'Upper Income': 'Upper income',
})

profile_suffix = df_final_subset['customer_profile'].str.lower()

df_final_subset['customer_income_profile'] = income_level_prefix + ' ' + profile_suffix

In [88]:
# Check frequency 

df_final_subset['customer_income_profile'].value_counts(dropna=False)

Middle income married older parent              3816456
Upper income married older parent               1712741
Middle income single male without children       837850
Middle income single female without children     829902
Middle income married young parent               804910
Middle income single parent                      416176
Low income married older parent                  388441
Upper income single female without children      292895
Upper income single male without children        292878
Low income married young parent                   91816
Low income single male without children           87081
Low income single female without children         86772
Low income single parent                          43028
Upper income married young parent                 10717
Upper income single parent                         5365
Name: customer_income_profile, dtype: int64

In [89]:
# Check shape 

df_final_subset['customer_income_profile'].shape

(9717028,)

In [90]:
# Check df output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,...,price_range_loc,busiest_day,busiest_period_of_day,region,department,age_group,income_group,dependents_group,customer_profile,customer_income_profile
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Mid-range product,Regularly busy,Most orders,Midwest,beverages,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Mid-range product,Regularly busy,Most orders,Midwest,dairy eggs,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,...,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,...,Low-range product,Busiest days,Average orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Low-range product,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent


## 8. Assign hourly groups

In [91]:
# Assign hourly groups using the order_hour_of_day column
# NOTE(review): hour 17 is labelled 'Between 8am and 5pm'; if order_hour_of_day is
# the clock hour, that value covers 17:00-17:59 - confirm the boundary is intended.

order_hour = df_final_subset['order_hour_of_day']

time_period_rules = [
    ((order_hour >= 0) & (order_hour < 8), 'Before 8am'),
    ((order_hour >= 8) & (order_hour <= 17), 'Between 8am and 5pm'),
    (order_hour > 17, 'After 5pm'),
]

for hour_mask, period_label in time_period_rules:
    df_final_subset.loc[hour_mask, 'time_period'] = period_label

In [94]:
# Check frequency 

df_final_subset['time_period'].value_counts(dropna=False)

Between 8am and 5pm    7472963
After 5pm              1710301
Before 8am              533764
Name: time_period, dtype: int64

In [95]:
# Check shape 

df_final_subset['time_period'].shape

(9717028,)

In [96]:
# Check output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,...,busiest_day,busiest_period_of_day,region,department,age_group,income_group,dependents_group,customer_profile,customer_income_profile,time_period
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Regularly busy,Most orders,Midwest,beverages,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Regularly busy,Most orders,Midwest,dairy eggs,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,...,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,...,Busiest days,Average orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Regularly busy,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm


## 9. Assign days

In [97]:
# Assign day values using order_day_of_week column
# 0 is labelled Saturday, counting forward to 6 as Friday; any other value is left as NaN

day_names = {
    0: 'Saturday',
    1: 'Sunday',
    2: 'Monday',
    3: 'Tuesday',
    4: 'Wednesday',
    5: 'Thursday',
    6: 'Friday',
}

df_final_subset['day'] = df_final_subset['order_day_of_week'].map(day_names)

In [105]:
# Check frequency 

df_final_subset['day'].value_counts(dropna=False)

Saturday     1861925
Sunday       1697135
Friday       1347050
Monday       1263934
Thursday     1261544
Tuesday      1150767
Wednesday    1134673
Name: day, dtype: int64

In [106]:
# Check shape 

df_final_subset['day'].shape

(9717028,)

In [107]:
# Check output

df_final_subset.head()

Unnamed: 0,gender,state,age,number_of_dependents,marital_status,income,order_number,loyalty_flag,spending_flag,order_frequency_flag,...,busiest_period_of_day,region,department,age_group,income_group,dependents_group,customer_profile,customer_income_profile,time_period,day
1,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Most orders,Midwest,beverages,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm,Thursday
7,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Most orders,Midwest,dairy eggs,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm,Thursday
9,Female,Missouri,48,3,married,165665,1,New customer,Low spender,Regular customer,...,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm,Monday
13,Female,Missouri,48,3,married,165665,6,New customer,Low spender,Regular customer,...,Average orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm,Sunday
14,Female,Missouri,48,3,married,165665,7,New customer,Low spender,Regular customer,...,Most orders,Midwest,produce,Old Adults,Upper Income,Parent,Married older parent,Upper income married older parent,Between 8am and 5pm,Thursday


## 10. Export sample set of dataframe

In [108]:
# Export dataframe to pkl
# Build the destination inside the Prepared Data folder, then write the sample subset to it

export_file = os.path.join(path, '02 Data', 'Prepared Data', 'final_subset_sample.pkl')
df_final_subset.to_pickle(export_file)