# 1. Import Libraries and Data

In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [5]:
# Turn your project folder path into a string

path = r'/Users/peterreadman/Desktop/Python Projects/CareerFoundry/04-2020 Instacart Basket Analysis/'

In [6]:
# import customers.csv as df
df = pd.read_csv(os.path.join(path, '02 Data', '02a Original Data', 'customers.csv'))

# Part 1: Step 4
*Wrangle the data so that it follows consistent logic; for example, rename columns with illogical names and drop columns that don’t add anything to your analysis.*

In [7]:
df.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [8]:
df.shape

(206209, 10)

In [9]:
df.columns

Index(['user_id', 'First Name', 'Surnam', 'Gender', 'STATE', 'Age',
       'date_joined', 'n_dependants', 'fam_status', 'income'],
      dtype='object')

Data Wrangling Procedures

- Dropping columns
- Renaming columns
- Changing data types
- Transposing data

In [10]:
# Rename columns

df.rename(columns = {'user_id' : 'User_ID', 'STATE' : 'State', 'date_joined' : 'Date_Joined', 'n_dependants' : 'Dependants', 'fam_status' : 'Family_Status', 'income' : 'Income'}, inplace = True)

In [11]:
df.rename(columns = {'First Name' : 'First_Name', 'Surnam' : 'Last_Name'}, inplace = True)

In [12]:
df.head()

Unnamed: 0,User_ID,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [13]:
# check data types
df.dtypes

User_ID           int64
First_Name       object
Last_Name        object
Gender           object
State            object
Age               int64
Date_Joined      object
Dependants        int64
Family_Status    object
Income            int64
dtype: object

In [14]:
# Change 'Date_Joined' to datetime format 
df['Date_Joined'] = df['Date_Joined'].astype('datetime64[ns]')

In [15]:
df.head()

Unnamed: 0,User_ID,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
0,26711,Deborah,Esquivel,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,2017-01-01,1,married,40374


In [15]:
df.value_counts('Date_Joined')

Date_Joined
2018-09-17    213
2018-02-10    212
2019-04-01    211
2019-09-21    211
2017-12-19    210
             ... 
2018-09-01    141
2018-01-22    140
2017-11-24    139
2019-07-18    138
2018-08-06    128
Length: 1187, dtype: int64

In [16]:
df.head()

Unnamed: 0,User_ID,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
0,26711,Deborah,Esquivel,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,2017-01-01,1,married,40374


I'm not going to drop any columns at this stage. They could all be useful and there's not so many

# Part 1: Step 5
*Complete the fundamental data quality and consistency checks you’ve learned throughout this Achievement; for example, check for and address missing values and duplicates, and convert any mixed-type data.*

## 5.i Descriptive Statistics

In [17]:
# Return descriptive statistics for the numeric values: 'Age, 'Dependents', 'Income' 

df.describe()

Unnamed: 0,User_ID,Age,Dependants,Income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


- The min and max values for 'Age', 'Dependents' are whole numbers within a typical range, so look correct
</br>
- There's a wide spread of 'Income' from 25,903 – 593,901 so would be interesting to check outliers.

## 5.ii Find Missing Values

In [18]:
# Check for errors in 'Gender'

df.value_counts('Gender')

Gender
Male      104067
Female    102142
dtype: int64

In [19]:
# Check counts of each 'State'
df.value_counts('State')

State
Alabama                 4044
District of Columbia    4044
Iowa                    4044
Indiana                 4044
Illinois                4044
Idaho                   4044
Georgia                 4044
Florida                 4044
Hawaii                  4044
Delaware                4044
Connecticut             4044
Colorado                4044
California              4044
Arkansas                4044
Arizona                 4044
Alaska                  4044
South Dakota            4043
Ohio                    4043
Oklahoma                4043
Oregon                  4043
Pennsylvania            4043
Rhode Island            4043
South Carolina          4043
Wisconsin               4043
Tennessee               4043
Texas                   4043
Utah                    4043
Vermont                 4043
Virginia                4043
Washington              4043
West Virginia           4043
North Carolina          4043
North Dakota            4043
Missouri                4043
New York

In [20]:
# Check length of unique 'State' values to check for spelling errors (should be 51 unique state names)
len(pd.unique(df['State']))

51

Find Missing Values

In [21]:
df.isnull().sum()

User_ID              0
First_Name       11259
Last_Name            0
Gender               0
State                0
Age                  0
Date_Joined          0
Dependants           0
Family_Status        0
Income               0
dtype: int64

In [22]:
# Create a subset to view missin 'First_Name' values

df_nan = df[df['First_Name'].isnull() == True]

In [23]:
df_nan

Unnamed: 0,User_ID,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
53,76659,,Gilbert,Male,Colorado,26,2017-01-01,2,married,41709
73,13738,,Frost,Female,Louisiana,39,2017-01-01,0,single,82518
82,89996,,Dawson,Female,Oregon,52,2017-01-01,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,2017-01-01,1,married,155673
105,29778,,Dawson,Female,Utah,63,2017-01-01,3,married,151819
...,...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,2020-03-31,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,2020-04-01,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,2020-04-01,1,married,45275
206162,187532,,Floyd,Female,California,39,2020-04-01,0,single,56325


**Results**</br>
- there are 11,259 customers with no 'First_Name' entered. 
- it's worth checking to make sure these are not duplicated elsewhere with completed names

In [24]:
# Drop the First_Name column in a new dataframe:

df_nan_drop_First_Name = df_nan.drop(columns=['First_Name'])

In [25]:
# Create a new dataframe (df_dups) of only duplicate entries from new dataframe

df_dups = df_nan_drop_First_Name[df_nan_drop_First_Name.duplicated()]

In [26]:
df_dups

Unnamed: 0,User_ID,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income


**Results**</br>
- there are no duplicate rows, even ignoring the 'First_Name' column

In [27]:
# Check counts of 'Family_Status' for errors

In [28]:
df['Family_Status'].value_counts()

married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: Family_Status, dtype: int64

In [29]:
xs = [144906, 33962, 17640, 9701]
print(sum(xs))

206209


There are four categories of 'Family_Status' and no errors or null values

### Customer data frame looks good

In [30]:
df_cust = df

In [31]:
df_cust

Unnamed: 0,User_ID,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
0,26711,Deborah,Esquivel,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,2017-01-01,1,married,40374
...,...,...,...,...,...,...,...,...,...,...
206204,168073,Lisa,Case,Female,North Carolina,44,2020-04-01,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,2020-04-01,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,2020-04-01,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,2020-04-01,1,married,99799


# Part 1: Step 6
*Combine your customer data with the rest of your prepared Instacart data. (Hint: Make sure the key columns are the same data type!)*


In [32]:
pd.set_option('display.max_columns', 500)

In [33]:
# Import orders_products_spend_freq_flags.pkl

df_ords_prods_1 = pd.read_pickle(os.path.join(path, '02 Data', '02b Prepared Data', 'orders_products_spend_freq_flags.pkl'))

In [34]:
df_ords_prods_1.head()

Unnamed: 0.1,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,Unnamed: 0,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_days,frequency_flag
0,2539329,1,1,2,8,,196,1,0,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [35]:
# Clean up df_ords_prods_1 first

df_ords_prods_1.drop(columns='_merge')

Unnamed: 0.1,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,Unnamed: 0,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_days,frequency_flag
0,2539329,1,1,2,8,,196,1,0,195,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,195,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,195,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,195,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,195,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,1320836,202557,17,2,15,1.0,43553,2,1,43557,Orange Energy Shots,64,7,3.7,Low-range product,Regularly busy,Regularly busy,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404855,31526,202557,18,5,11,3.0,43553,2,1,43557,Orange Energy Shots,64,7,3.7,Low-range product,Regularly busy,Regularly busy,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404856,758936,203436,1,2,7,,42338,4,0,42342,"Zucchini Chips, Pesto",50,19,6.9,Mid-range product,Regularly busy,Regularly busy,Average orders,3,New customer,7.631579,Low spender,15.0,Regular customer
32404857,2745165,203436,2,3,5,15.0,42338,16,1,42342,"Zucchini Chips, Pesto",50,19,6.9,Mid-range product,Regularly busy,Least busy days,Fewest orders,3,New customer,7.631579,Low spender,15.0,Regular customer


### Merge dataframes on 'User_ID'

In [36]:
# Rename 'user_id' in df_ords_prods_1 to 'User_ID'

df_ords_prods_1 = df_ords_prods_1.rename(columns={'user_id' : 'User_ID'})

In [37]:
# Check dtype of 'User_ID' matches in each dataframe

df_ords_prods_1['User_ID'].dtype

dtype('int64')

In [38]:
df_cust['User_ID'].dtype

dtype('int64')

In [39]:
# Merge

df_ords_prods_cust = df_ords_prods_1.merge(df_cust, on = 'User_ID')

In [40]:
df_ords_prods_cust.shape

(32404859, 34)

In [41]:
df_ords_prods_cust.head()

Unnamed: 0.1,order_id,User_ID,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,Unnamed: 0,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_days,frequency_flag,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
0,2539329,1,1,2,8,,196,1,0,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423


In [42]:
# Clean up column names of new merged dataframe

df_ords_prods_cust = df_ords_prods_cust.rename(
    columns={
        'order_id' : 'Order_ID',
        'order_number' : 'Order_Number',
        'orders_day_of_the_week' : 'Orders_day_of_the_week',
        'order_hour_of_day' : 'Order_hour_of_day',
        'days_since_last_order' : 'Days_Since_Prior_Order',
        'product_id' : 'Product_ID',
        'add_to_cart_order' : 'Add_To_Cart_Order',
        'reordered' :'Reordered',
        'product_name' : 'Product_Name',
        'aisle_id' : 'Aisle_ID',
        'department_id' : 'Department_ID',
        'prices' : 'Prices',
        'price_range_loc' : 'Rrice_Range_Loc',
        'busiest_day' : 'Busiest_Day',
        'busiest_days' : 'Busiest_Days',
        'busiest_period_of_day' :'Busiest_Period_of_Day',
        'max_order' : 'Max_Order',
        'loyalty_flag' : 'Loyalty_Flag',
        'mean_prices' : 'Mean_Prices',
        'spending_flag' : 'Spending_Flag',
        'median_days' : 'Median_Days',
        'frequency_flag' : 'Frequency_Flag'
    })

In [43]:
df_ords_prods_cust.head()

Unnamed: 0.1,Order_ID,User_ID,Order_Number,Orders_day_of_the_week,Order_hour_of_day,Days_Since_Prior_Order,Product_ID,Add_To_Cart_Order,Reordered,Unnamed: 0,Product_Name,Aisle_ID,Department_ID,Prices,_merge,Rrice_Range_Loc,Busiest_Day,Busiest_Days,Busiest_Period_of_Day,Max_Order,Loyalty_Flag,Mean_Prices,Spending_Flag,Median_Days,Frequency_Flag,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
0,2539329,1,1,2,8,,196,1,0,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,195,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423


In [44]:
# drop 'Unnamed: 0' column

df_ords_prods_cust.drop(columns=['Unnamed: 0'])

Unnamed: 0,Order_ID,User_ID,Order_Number,Orders_day_of_the_week,Order_hour_of_day,Days_Since_Prior_Order,Product_ID,Add_To_Cart_Order,Reordered,Product_Name,Aisle_ID,Department_ID,Prices,_merge,Rrice_Range_Loc,Busiest_Day,Busiest_Days,Busiest_Period_of_Day,Max_Order,Loyalty_Flag,Mean_Prices,Spending_Flag,Median_Days,Frequency_Flag,First_Name,Last_Name,Gender,State,Age,Date_Joined,Dependants,Family_Status,Income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2019-02-17,3,married,40423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,156685,106143,26,4,23,5.0,19675,1,1,Organic Raspberry Black Tea,94,7,10.7,both,Mid-range product,Least busy,Least busy days,Average orders,26,Regular customer,10.700000,High spender,7.0,Frequent customer,Gerald,Yates,Male,Hawaii,25,2017-05-26,0,single,53755
32404855,484769,66343,1,6,11,,47210,1,0,Fresh Farmed Tilapia Fillet,15,12,8.1,both,Mid-range product,Regularly busy,Regularly busy,Most orders,4,New customer,8.100000,Low spender,30.0,Non-frequent customer,Jacqueline,Arroyo,Female,Tennessee,22,2017-09-12,3,married,46151
32404856,1561557,66343,2,1,11,30.0,47210,1,1,Fresh Farmed Tilapia Fillet,15,12,8.1,both,Mid-range product,Regularly busy,Busiest days,Most orders,4,New customer,8.100000,Low spender,30.0,Non-frequent customer,Jacqueline,Arroyo,Female,Tennessee,22,2017-09-12,3,married,46151
32404857,276317,66343,3,6,15,19.0,47210,1,1,Fresh Farmed Tilapia Fillet,15,12,8.1,both,Mid-range product,Regularly busy,Regularly busy,Most orders,4,New customer,8.100000,Low spender,30.0,Non-frequent customer,Jacqueline,Arroyo,Female,Tennessee,22,2017-09-12,3,married,46151


# Supplemental - Clean the outliers in 'Prices'

The 'Prices' column has some extremely high values as identifed previously. Find all prices higher than USD100 and change to nan

In [49]:
df_ords_prods_cust.loc[df_ords_prods_cust['Prices'] > 100, 'Prices'] = np.nan

In [51]:
df_ords_prods_cust['Prices'].max()

25.0

# Part 1: Step 8
*Export this new dataframe as a pickle file so you can continue to use it in the second part of this task.*

In [52]:
# Export as pickle for Part 2

df_ords_prods_cust.to_pickle(os.path.join(path, '02 Data', '02b Prepared Data', 'df_ords_prods_cust.pkl'))