# 4.10 Coding Etiquette & Excel Reporting
Part 1

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Load the merged orders / products / customers dataframe from its pickle file

path = r'C:\Users\natha\OneDrive\Desktop\Data Analytics\Jupyter\09-2023 Instacart Basket Analysis'
merged_customers_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_customers_merge.pkl')
df = pd.read_pickle(merged_customers_pickle)

In [3]:
# Confirming the dataframe loaded correctly by previewing its first rows
# NOTE(review): this preview runs before the name columns are dropped, so any saved
# output of this cell contains customer names - clear it before sharing the notebook.

df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,order_frequency_flag,first_name,surname,gender,state,age,date_joined,number_of_dependents,family_status,income
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
2,907,Premium Sliced Bacon,106,12,20.0,3160996,138,1,5,13,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
3,907,Premium Sliced Bacon,106,12,20.0,2254091,138,10,5,14,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
4,1000,Apricots,18,10,12.9,505689,138,9,6,12,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620


# 2. Consider any security implications that might exist for this new data. We need to address any PII in the data before continuing the analysis.

In [4]:
# Drop the customer name columns to address PII / data-privacy concerns

pii_columns = ['first_name', 'surname']
df = df.drop(columns=pii_columns)

# 3. The Instacart officers are interested in comparing customer behavior in different geographic areas. Create a regional segmentation of the data. We need to create a “Region” column based on the “State” column from your customers data set.

In [5]:
# Identify the distinct values present in the 'state' column

unique_states = df['state'].unique()
print(unique_states)

['Minnesota' 'Vermont' 'Wisconsin' 'Hawaii' 'District of Columbia'
 'Tennessee' 'Oregon' 'Alaska' 'Alabama' 'Indiana' 'Florida' 'Kansas'
 'North Carolina' 'Utah' 'Maine' 'North Dakota' 'South Dakota' 'Delaware'
 'Illinois' 'Wyoming' 'Maryland' 'Louisiana' 'New Jersey' 'Georgia'
 'Arkansas' 'California' 'Idaho' 'New Hampshire' 'West Virginia' 'Nevada'
 'New Mexico' 'New York' 'Rhode Island' 'Nebraska' 'Massachusetts' 'Texas'
 'Pennsylvania' 'Iowa' 'Kentucky' 'Michigan' 'Ohio' 'Oklahoma' 'Arizona'
 'Washington' 'Virginia' 'Connecticut' 'Mississippi' 'Colorado' 'Montana'
 'Missouri' 'South Carolina']


In [6]:
# Group the states by region, then invert that grouping into a state -> region lookup

states_by_region = {
    'Northeast': [
        'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
        'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
    ],
    'Midwest': [
        'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
        'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
    ],
    'South': [
        'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
        'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky',
        'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas',
        'Louisiana',
    ],
    'West': [
        'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
        'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
    ],
}

# Dictionary mapping each state to its region
state_to_region = {
    state: region
    for region, states in states_by_region.items()
    for state in states
}

# Fail fast if any state present in the data has no entry in the mapping:
# Series.map would otherwise silently leave those rows with a missing region.
unmapped_states = set(df['state'].dropna().unique()) - set(state_to_region)
if unmapped_states:
    raise ValueError(f"States missing from state_to_region: {sorted(unmapped_states)}")

# Create new column 'region' in our dataframe based on this mapping
df['region'] = df['state'].map(state_to_region)

In [7]:
# Checking region value counts; dropna=False also counts rows with a missing region,
# so any state that failed to map would appear here as NaN

df['region'].value_counts(dropna = False)

region
South        10790091
West          8291679
Midwest       7596065
Northeast     5721892
Name: count, dtype: int64

## Determine whether there’s a difference in spending habits between the different U.S. regions. (Hint: We can do this by crossing the variable we just created with the spending flag.)

In [8]:
# Cross-tabulate region (rows) against spending_flag (columns)

crosstab = pd.crosstab(
    index=df['region'],
    columns=df['spending_flag'],
    dropna=False,
)

In [9]:
# Copy the crosstab to the system clipboard so it can be pasted into Excel

crosstab.to_clipboard()

In [10]:
# Display the region x spending_flag cross-tabulation

crosstab

spending_flag,High spender,Low spender
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,139151,7456914
Northeast,98059,5623833
South,188152,10601939
West,147000,8144679


### *More than 98% of the records in every region belong to low spenders. The South has the highest number of records for both high spenders and low spenders, and the Northeast has the lowest for both. Note that these counts are rows of order data (the same customer appears in many rows), not unique customers.*

# 4. The Instacart CFO isn’t interested in customers who don’t generate much revenue for the app. Create an exclusion flag for low-activity customers (customers with less than 5 orders) and exclude them from the data. Make sure you export this sample.

In [11]:
# Display column names for reference ('max_order' is used for the activity flag below)

df.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'is_first_order',
       'add_to_cart_order', 'reordered', 'price_range_loc', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'user_mean_price',
       'spending_flag', 'order_frequency_flag', 'gender', 'state', 'age',
       'date_joined', 'number_of_dependents', 'family_status', 'income',
       'region'],
      dtype='object')

In [12]:
# Flag rows whose customer has a max_order below 5 as 'low activity'
df.loc[df['max_order'].lt(5), 'activity_flag'] = 'low activity'

In [13]:
# Flag rows whose customer has a max_order of 5 or more as 'high activity'
df.loc[df['max_order'].ge(5), 'activity_flag'] = 'high activity'

In [14]:
# Check the activity flag counts; dropna=False would reveal any rows left unflagged
df['activity_flag'].value_counts(dropna=False)

activity_flag
high activity    30959687
low activity      1440040
Name: count, dtype: int64

### *Create subset dataframes for 'low activity' and 'high activity' customers*

In [15]:
# Subset containing only the rows flagged 'low activity'
df_low_activity = df.loc[df['activity_flag'].eq('low activity')]

In [16]:
# Subset containing only the rows flagged 'high activity'
df_high_activity = df.loc[df['activity_flag'].eq('high activity')]

In [17]:
# Row count of this subset should equal 1440040 (the 'low activity' count from value_counts above)
# NOTE(review): this displays the whole frame just to read its row count;
# df_low_activity.shape would confirm the number directly.

df_low_activity

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,order_frequency_flag,gender,state,age,date_joined,number_of_dependents,family_status,income,region,activity_flag
340,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,Frequent customer,Female,Wisconsin,40,2/9/2020,3,married,31308,Midwest,low activity
341,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,Frequent customer,Female,Wisconsin,40,2/9/2020,3,married,31308,Midwest,low activity
342,3260,Chips Ahoy!/Nutter Butter/Oreo Cookies,61,19,1.7,652770,764,1,3,13,...,Frequent customer,Female,Wisconsin,40,2/9/2020,3,married,31308,Midwest,low activity
343,3260,Chips Ahoy!/Nutter Butter/Oreo Cookies,61,19,1.7,705212,764,2,2,10,...,Frequent customer,Female,Wisconsin,40,2/9/2020,3,married,31308,Midwest,low activity
344,3260,Chips Ahoy!/Nutter Butter/Oreo Cookies,61,19,1.7,1813452,764,3,4,17,...,Frequent customer,Female,Wisconsin,40,2/9/2020,3,married,31308,Midwest,low activity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404156,49235,Organic Half & Half,53,16,1.8,2542346,80734,3,3,8,...,Non-frequent customer,Male,Alabama,45,9/28/2019,1,married,41072,South,low activity
32404157,49520,Orange Sparkling Water,115,7,7.5,1363278,74506,1,1,14,...,Regular customer,Male,Vermont,44,4/16/2018,3,married,97209,Northeast,low activity
32404158,49520,Orange Sparkling Water,115,7,7.5,2008002,74506,2,0,15,...,Regular customer,Male,Vermont,44,4/16/2018,3,married,97209,Northeast,low activity
32404159,49520,Orange Sparkling Water,115,7,7.5,1595973,74506,3,1,17,...,Regular customer,Male,Vermont,44,4/16/2018,3,married,97209,Northeast,low activity


In [18]:
# Row count of this subset should equal 30959687 (the 'high activity' count from value_counts above)
# NOTE(review): this displays the whole frame just to read its row count;
# df_high_activity.shape would confirm the number directly.

df_high_activity

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,order_frequency_flag,gender,state,age,date_joined,number_of_dependents,family_status,income,region,activity_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Frequent customer,Male,Minnesota,81,8/1/2019,1,married,49620,Midwest,high activity
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Frequent customer,Male,Minnesota,81,8/1/2019,1,married,49620,Midwest,high activity
2,907,Premium Sliced Bacon,106,12,20.0,3160996,138,1,5,13,...,Frequent customer,Male,Minnesota,81,8/1/2019,1,married,49620,Midwest,high activity
3,907,Premium Sliced Bacon,106,12,20.0,2254091,138,10,5,14,...,Frequent customer,Male,Minnesota,81,8/1/2019,1,married,49620,Midwest,high activity
4,1000,Apricots,18,10,12.9,505689,138,9,6,12,...,Frequent customer,Male,Minnesota,81,8/1/2019,1,married,49620,Midwest,high activity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404145,49235,Organic Half & Half,53,16,1.8,3310701,27382,5,3,9,...,Regular customer,Female,Vermont,68,11/25/2019,3,married,37867,Northeast,high activity
32404146,49235,Organic Half & Half,53,16,1.8,527883,27382,6,1,12,...,Regular customer,Female,Vermont,68,11/25/2019,3,married,37867,Northeast,high activity
32404147,49235,Organic Half & Half,53,16,1.8,685496,27382,7,4,8,...,Regular customer,Female,Vermont,68,11/25/2019,3,married,37867,Northeast,high activity
32404148,49235,Organic Half & Half,53,16,1.8,1224680,27382,8,1,7,...,Regular customer,Female,Vermont,68,11/25/2019,3,married,37867,Northeast,high activity


In [19]:
# Save the 'low activity' customer subset to the Prepared Data folder as a pickle file

low_activity_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'df_low_activity_customers.pkl')
df_low_activity.to_pickle(low_activity_pickle)

In [20]:
# Save the 'high activity' customer subset to the Prepared Data folder as a pickle file

high_activity_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'df_high_activity_customers.pkl')
df_high_activity.to_pickle(high_activity_pickle)

# 5. The marketing and business strategy units at Instacart want to create more-relevant marketing strategies for different products and are, thus, curious about customer profiling in their database. Create a profiling variable based on age, income, certain goods in the “department_id” column, and number of dependents. You might also use the “orders_day_of_week” and “order_hour_of_day” columns if you can think of a way they would impact customer profiles. (Hint: As an example, try thinking of what characteristics would lead you to the profile “Single adult” or “Young parent.”)

### *Moving forward, we will continue all marketing strategies using only the 'high_activity_customers' as Instacart CFO has requested.*

In [21]:
# Reload the high-activity customer subset from its pickle file.
# Note: this rebinds `df`, which from here on holds only high-activity rows.

path = r'C:\Users\natha\OneDrive\Desktop\Data Analytics\Jupyter\09-2023 Instacart Basket Analysis'
high_activity_source = os.path.join(path, '02 Data', 'Prepared Data', 'df_high_activity_customers.pkl')
df = pd.read_pickle(high_activity_source)

## Creating Age Groups

In [22]:
# Show the lowest and highest customer ages

print(df['age'].min())
print(df['age'].max())

18
81


In [23]:
# Check summary statistics for age (the quartiles guide the age-group cut-offs below)

df['age'].describe()

count    3.095969e+07
mean     4.946803e+01
std      1.848527e+01
min      1.800000e+01
25%      3.300000e+01
50%      4.900000e+01
75%      6.500000e+01
max      8.100000e+01
Name: age, dtype: float64

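*Based on these percentiles, I created three age groups with cut-offs close to the quartiles. The first group runs from the minimum age (18) to 34, just above the 25th percentile (33); the middle group (35–64) covers roughly the middle half of the data; and the last group starts at the 75th percentile (65).*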

In [24]:
# Label each row with an age group: young adults, middle-aged adults, or elderly

customer_age = df['age']
df.loc[customer_age.le(34), 'age_group'] = 'Young Adults (34 and below)'
df.loc[customer_age.ge(35) & customer_age.le(64), 'age_group'] = 'Middle-Aged Adults (35-64)'
df.loc[customer_age.ge(65), 'age_group'] = 'Elderly (65 and above)'

In [25]:
# Check the age group counts; dropna=False would reveal any rows left without a group
df['age_group'].value_counts(dropna = False)

age_group
Middle-Aged Adults (35-64)     14524452
Young Adults (34 and below)     8240968
Elderly (65 and above)          8194267
Name: count, dtype: int64

## Creating Income Groups

In [26]:
# Show the lowest and highest customer incomes

print(df['income'].min())
print(df['income'].max())

25903
593901


In [27]:
# Check summary statistics for income (reference for the income-group thresholds below)

df['income'].describe()

count    3.095969e+07
mean     9.965502e+04
std      4.309143e+04
min      2.590300e+04
25%      6.728600e+04
50%      9.676100e+04
75%      1.280810e+05
max      5.939010e+05
Name: income, dtype: float64

In [28]:
# Label each row with an income group based on fixed income thresholds

customer_income = df['income']

# Low Income: 40,000 and below
df.loc[customer_income.le(40000), 'income_group'] = 'Low Income'

# Middle Income: above 40,000, up to and including 100,000
df.loc[customer_income.gt(40000) & customer_income.le(100000), 'income_group'] = 'Middle Income'

# Upper Middle Income: above 100,000, up to and including 250,000
df.loc[customer_income.gt(100000) & customer_income.le(250000), 'income_group'] = 'Upper Middle Income'

# High Income (including very high income): above 250,000
df.loc[customer_income.gt(250000), 'income_group'] = 'High Income'

In [29]:
# Check the income group counts; dropna=False would reveal any rows left without a group
df['income_group'].value_counts(dropna = False)

income_group
Middle Income          15145248
Upper Middle Income    14024442
Low Income              1612025
High Income              177972
Name: count, dtype: int64

## Creating Labels based on 'department_id'

In [30]:
# Load the wrangled departments table from its CSV file

path = r'C:\Users\natha\OneDrive\Desktop\Data Analytics\Jupyter\09-2023 Instacart Basket Analysis'
departments_csv = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_departments = pd.read_csv(departments_csv)

In [31]:
# Inspect the departments table as loaded
df_departments

Unnamed: 0,department
0,frozen
1,other
2,bakery
3,produce
4,alcohol
5,international
6,beverages
7,pets
8,dry goods pasta
9,bulk


In [32]:
# Listing the distinct 'department_id' values present in the main dataframe

unique_department_ids = df['department_id'].unique()
print(unique_department_ids)

[19 12 10  4 14 13 16  1  7 20 17 11 15  3 21  6  9  8  5  2 18]


In [33]:
# Give 'df_departments' a 'department_id' column so it can be merged with the main dataframe.
# The ids are derived from row position (index + 1).
# NOTE(review): assumes the CSV rows are in department-id order starting at 1 - confirm against the source file.

df_departments = df_departments.assign(department_id=df_departments.index + 1)

In [34]:
# Confirming 'df_departments' now carries the 'department_id' column:

df_departments

Unnamed: 0,department,department_id
0,frozen,1
1,other,2
2,bakery,3
3,produce,4
4,alcohol,5
5,international,6
6,beverages,7
7,pets,8
8,dry goods pasta,9
9,bulk,10


In [35]:
# Merging the primary high activity customers df with the departments df
# to attach the department name to every row.
# Merge utilizes 'department_id' as the key; how='inner' keeps only rows whose
# department_id appears in df_departments.
# validate='many_to_one' makes pandas raise a MergeError if a department_id is
# duplicated in df_departments, which would otherwise silently duplicate rows.

merged_df = df.merge(df_departments, on='department_id', how='inner', validate='many_to_one')

# An inner merge drops unmatched rows without warning, so confirm none were lost
assert len(merged_df) == len(df), 'Rows were dropped: some department_id values are missing from df_departments'

In [36]:
# Preview the merged dataframe to confirm the 'department' column was added
merged_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,age,date_joined,number_of_dependents,family_status,income,region,activity_flag,age_group,income_group,department
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,81,8/1/2019,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,81,8/1/2019,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks
2,4913,Table Water Crackers,78,19,4.4,894221,138,23,5,13,...,81,8/1/2019,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks
3,11759,Organic Simply Naked Pita Chips,107,19,4.4,1986630,138,7,0,12,...,81,8/1/2019,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks
4,13424,Almonds,45,19,4.2,3139998,138,28,6,11,...,81,8/1/2019,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks


In [37]:
# List the merged dataframe's columns for reference
merged_df.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'is_first_order',
       'add_to_cart_order', 'reordered', 'price_range_loc', 'busiest_days',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'user_mean_price',
       'spending_flag', 'order_frequency_flag', 'gender', 'state', 'age',
       'date_joined', 'number_of_dependents', 'family_status', 'income',
       'region', 'activity_flag', 'age_group', 'income_group', 'department'],
      dtype='object')

## Creating Number of Dependents Groups

### Creation of dependents flag

In [38]:
# Rows where the customer has zero dependents
merged_df.loc[merged_df['number_of_dependents'].eq(0), 'dependents_group'] = 'No dependents'

In [39]:
# Rows where the customer has at least one dependent
merged_df.loc[merged_df['number_of_dependents'].gt(0), 'dependents_group'] = 'Has dependents'

In [40]:
# Raw distribution of number_of_dependents, for comparison with the derived 'dependents_group'
merged_df['number_of_dependents'].value_counts()

number_of_dependents
3    7771309
0    7738508
2    7731924
1    7717946
Name: count, dtype: int64

In [41]:
# Checking 'dependents_group'; dropna=False would reveal any rows left without a group

merged_df['dependents_group'].value_counts(dropna = False)

dependents_group
Has dependents    23221179
No dependents      7738508
Name: count, dtype: int64

## Family Profile Grouping

In [42]:
# Examining the distinct values of the 'family_status' column and their row counts

merged_df['family_status'].value_counts(dropna = False)

family_status
married                             21740200
single                               5093677
divorced/widowed                     2644831
living with parents and siblings     1480979
Name: count, dtype: int64

In [43]:
# Examining the distinct values of the 'gender' column and their row counts

merged_df['gender'].value_counts(dropna = False)

gender
Male      15584279
Female    15375408
Name: count, dtype: int64

In [44]:
# Build 'family_profile' by combining marital status, dependents group and gender.
# The three non-married statuses are grouped together as "Single".

family_status = merged_df['family_status']
married_mask = family_status == 'married'
single_mask = family_status.isin(['single', 'living with parents and siblings', 'divorced/widowed'])

# Each rule: (status mask, dependents_group value, gender value, resulting profile label)
family_profile_rules = [
    # Married male / female without children
    (married_mask, 'No dependents', 'Male', 'Married male without children'),
    (married_mask, 'No dependents', 'Female', 'Married female without children'),
    # Married male / female with children
    (married_mask, 'Has dependents', 'Male', 'Married male with children'),
    (married_mask, 'Has dependents', 'Female', 'Married female with children'),
    # Single male / female without children
    (single_mask, 'No dependents', 'Male', 'Single male without children'),
    (single_mask, 'No dependents', 'Female', 'Single female without children'),
    # Single male / female with children
    (single_mask, 'Has dependents', 'Male', 'Single male with children'),
    (single_mask, 'Has dependents', 'Female', 'Single female with children'),
]

for status_mask, dependents_value, gender_value, profile_label in family_profile_rules:
    matching_rows = (
        status_mask
        & (merged_df['dependents_group'] == dependents_value)
        & (merged_df['gender'] == gender_value)
    )
    merged_df.loc[matching_rows, 'family_profile'] = profile_label


In [45]:
# Value counts show that no 'Married male/female without children' profiles exist in our data
# (the two married profiles together account for every 'married' row)
merged_df['family_profile'].value_counts(dropna = False)

family_profile
Married male with children        10946010
Married female with children      10794190
Single male without children       3880754
Single female without children     3857754
Single male with children           757515
Single female with children         723464
Name: count, dtype: int64

In [46]:
# Preview the first rows to confirm 'dependents_group' and 'family_profile' were added
merged_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,number_of_dependents,family_status,income,region,activity_flag,age_group,income_group,department,dependents_group,family_profile
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks,Has dependents,Married male with children
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks,Has dependents,Married male with children
2,4913,Table Water Crackers,78,19,4.4,894221,138,23,5,13,...,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks,Has dependents,Married male with children
3,11759,Organic Simply Naked Pita Chips,107,19,4.4,1986630,138,7,0,12,...,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks,Has dependents,Married male with children
4,13424,Almonds,45,19,4.2,3139998,138,28,6,11,...,1,married,49620,Midwest,high activity,Elderly (65 and above),Middle Income,snacks,Has dependents,Married male with children


In [47]:
# Preview the last rows as a second spot check of the new profile columns
merged_df.tail()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,number_of_dependents,family_status,income,region,activity_flag,age_group,income_group,department,dependents_group,family_profile
30959682,39680,AdvanceCare Oral Electrolyte Solution - Cherry...,92,18,8.0,3060277,137486,1,4,8,...,2,married,64469,South,high activity,Young Adults (34 and below),Middle Income,babies,Has dependents,Married female with children
30959683,49215,Kids Sensible Foods Broccoli Littles,92,18,2.0,458558,168325,4,4,15,...,2,married,43709,Northeast,high activity,Middle-Aged Adults (35-64),Middle Income,babies,Has dependents,Married male with children
30959684,44787,Stage 1 - Just Prunes,92,18,7.2,1865563,160916,2,1,13,...,2,living with parents and siblings,43619,South,high activity,Young Adults (34 and below),Middle Income,babies,Has dependents,Single male with children
30959685,44787,Stage 1 - Just Prunes,92,18,7.2,195067,160916,6,4,7,...,2,living with parents and siblings,43619,South,high activity,Young Adults (34 and below),Middle Income,babies,Has dependents,Single male with children
30959686,44755,Training Pants Learning Designs 4T-5T - 18 CT,56,18,3.1,710384,175617,4,6,15,...,0,divorced/widowed,45515,Northeast,high activity,Elderly (65 and above),Middle Income,babies,No dependents,Single male without children


In [48]:
# Confirm the final row and column counts before exporting
merged_df.shape

(30959687, 36)

In [49]:
# Save the high-activity dataframe, including the new 'family_profile' column, as a pickle file

final_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'merged_df_final.pkl')
merged_df.to_pickle(final_pickle)