# Tables of Contents

#### 1. Import libraries and data
#### 2. Data privacy
    a. Remove customer names
    b. Create age groups to remove individual ages    
#### 3. Customer profiling
    a. Create regional segmentation 
    b. Create income segmentation
    c. Create number of dependents segmentation
    d. Department id segmentation preparation
    e. Customer profile integration 
#### 4. Exclusion flag for low-activity customers
    a. Remove customers with less than 5 orders    
#### 5. Export data

## Step 1: Import libraries and data

In [3]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [4]:
# Importing data

path = r'C:\Users\nodnarb\Documents\IC Basket Analysis'
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers.pkl'))
departments = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'))

In [5]:
# Display all data

pd.set_option('display.max_columns', None)

In [6]:
# Display data

df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [7]:
# Data shape

df.shape

(32404859, 31)

## Step 2: Data Privacy

In [9]:
# Remove first_name and last name columns

df = df.drop(columns = ['first_name','last_name'])

In [10]:
# Check

df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423


In [11]:
# Create age groups to eliminate individual age groups

## Conditions:
## Young adult: ages 0-29
## Middle aged adult: ages 30-59
## Old adult: ages 60+

# Create result list for age column

result = []

for age in df['age']:
    if age < 30:
        result.append('young adult')
    elif age >= 30 and age < 60:
        result.append('middle aged adult')
    elif age >= 60:
        result.append('old adult')
    else: result.append('nan')

In [12]:
# Create age column for result

df['age_group'] = result

In [13]:
# Check

df['age_group'].value_counts()

age_group
middle aged adult    15247163
old adult            11067255
young adult           6090441
Name: count, dtype: int64

In [14]:
# Drop age column

df = df.drop(columns = ['age'])

In [15]:
# Check 

df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,date_joined,n_dependants,fam_status,income,age_group
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult


## Step 3: Customer profiling

#### a. Regional segmentation

In [18]:
# Create regions based on wikipedia article: https://simple.wikipedia.org/wiki/List_of_regions_of_the_United_States

northeast = ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey']
midwest = ['Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri']
south = ['Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky', 'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas', 'Louisiana'] 
west = ['Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico', 'Alaska','Washington', 'Oregon', 'California', 'Hawaii']

In [19]:
# Create results list for region column

result = []

for state in df['state']:
    if state in northeast:
        result.append('northeast')
    elif state in midwest:
        result.append('midwest')
    elif state in south:
        result.append('south')
    elif state in west:
        result.append('west')
    else: result.append('nan')

In [20]:
# Create region column for result

df['region'] = result 

In [21]:
# Check 

df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,date_joined,n_dependants,fam_status,income,age_group,region
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south


#### b. Income segmentation

In [23]:
# Create income groups

## Conditions:
## Lower class: $0 - $59,999
## Middle class: $60,000 - $119,999
## Upper class: $120,000+

result = []

for income in df['income']:
    if income < 60000:
        result.append('lower class')
    elif income >= 60000 and income < 120000:
        result.append('middle class')
    elif income >= 120000:
        result.append('upper class')
    else: result.append('nan')   

In [24]:
# Create income bracket column for result

df['income_bracket'] = result

In [25]:
# Check

df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,date_joined,n_dependants,fam_status,income,age_group,region,income_bracket
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class


#### c. Number of dependents segmentation

In [27]:
# Create dependent flag

def dependents(n_dependants):
    if n_dependants == 0:
        return 'no dependents'
    else: return 'has dependents'

df['dependents_flag'] = df['n_dependants'].apply(dependents)

In [28]:
# Check 

df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,date_joined,n_dependants,fam_status,income,age_group,region,income_bracket,dependents_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents


#### d. Department id segmentation preparation

In [30]:
# Identify department id

departments.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [31]:
# Merge department id to original data

# Create string type for merge
departments['department_id'] = departments['department_id'].astype('str')
df['department_id'] = df['department_id'].astype('str')

# Merge
df_new = df.merge(departments, on = 'department_id', indicator = True)

In [44]:
# Check

df_new.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,date_joined,n_dependants,fam_status,income,age_group,region,income_bracket,dependents_flag,department,_merge
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,beverages,both
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,dairy eggs,both
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,snacks,both
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,snacks,both
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,household,both


In [33]:
# Remove _merge column

df_new.drop(columns = ['_merge'])

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,date_joined,n_dependants,fam_status,income,age_group,region,income_bracket,dependents_flag,department
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,beverages
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,dairy eggs
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,snacks
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,snacks
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,household
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,2977660,206209,13,1,12,7.0,14197,5,1,Tomato Paste,9,9,5.6,Mid-range product,Busiest days,Most orders,13,Regular customer,7.058915,Low spender,18.0,Regular customer,Female,Iowa,9/14/2019,3,married,137969,old adult,midwest,upper class,has dependents,dry goods pasta
32404855,2977660,206209,13,1,12,7.0,38730,6,0,Brownie Crunch High Protein Bar,3,19,5.9,Mid-range product,Busiest days,Most orders,13,Regular customer,7.058915,Low spender,18.0,Regular customer,Female,Iowa,9/14/2019,3,married,137969,old adult,midwest,upper class,has dependents,snacks
32404856,2977660,206209,13,1,12,7.0,31477,7,0,High Protein Bar Chunky Peanut Butter,3,19,4.2,Low-range product,Busiest days,Most orders,13,Regular customer,7.058915,Low spender,18.0,Regular customer,Female,Iowa,9/14/2019,3,married,137969,old adult,midwest,upper class,has dependents,snacks
32404857,2977660,206209,13,1,12,7.0,6567,8,0,Chocolate Peanut Butter Protein Bar,3,19,4.9,Low-range product,Busiest days,Most orders,13,Regular customer,7.058915,Low spender,18.0,Regular customer,Female,Iowa,9/14/2019,3,married,137969,old adult,midwest,upper class,has dependents,snacks


#### e. Customer profile integration 

In [56]:
# Combing family status and age group

def create_profile(row):
    return f"{row['fam_status']}, {row['age_group']}"

In [58]:
# Applying the define function

df_new['customer_profile'] = df_new.apply(create_profile, axis = 1)

## Step 4. Low activity customer excursion

In [60]:
# Create excursion flag

df_new.loc[df_new['max_orders'] < 5, 'customer_activity'] = 'Low activity'
df_new.loc[df_new['max_orders'] >= 5, 'customer_activity'] = 'Normal activity'

In [62]:
# Check

df_new.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_times,max_orders,loyalty_flag,average_spending,spending_flag,median_days_of_prior_order,frequency_flag,gender,state,date_joined,n_dependants,fam_status,income,age_group,region,income_bracket,dependents_flag,department,_merge,customer_activity,customer_profile
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,beverages,both,Normal activity,"married, middle aged adult"
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,dairy eggs,both,Normal activity,"married, middle aged adult"
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,snacks,both,Normal activity,"married, middle aged adult"
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,snacks,both,Normal activity,"married, middle aged adult"
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.0,Regular customer,Female,Alabama,2/17/2019,3,married,40423,middle aged adult,south,lower class,has dependents,household,both,Normal activity,"married, middle aged adult"


In [64]:
# Check

df_new.shape

(32404859, 36)

In [66]:
# Remove low activity customers with new dataset

df_new_activity = df_new[df_new['customer_activity'] == 'Normal activity']

In [67]:
# Check

df_new_activity.shape

(30964564, 36)

## Step 5: Exporting data

In [70]:
# Export main dataframe

df_new_activity.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'final_dataset.pkl'))