# 4.9: Data Visualization with Python Part 1

### This script contains the following points:
#### 1. Check the dimensions of the imported dataframe
#### 2. Make column names consistent
#### 3. Check basic statistics for irregularities
#### 4. Check for missing values (NaN)
#### 5. Check for any mixed-type data
#### 6. Check for any duplicates
#### 7. Import orders_products_merged_v3
#### 8. Merge dataframes: orders_products_merged and df_customers
#### 9. Final changes and checks before exporting
#### 10. Export dataframe as pkl file


In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Path to main project folder
# Defaults to the original local folder; set the INSTACART_PROJECT_PATH environment
# variable to run the notebook from another machine or folder without editing this cell.

path = os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\Mark\_Instacart Basket Analysis')

In [3]:
# Load the raw customer data (customers.csv) from the "Original Data" folder
# index_col = False stops pandas from using the first column as the index

customers_csv = os.path.join(path, '03 Scripts', 'Original Data', 'customers.csv')
df_customers = pd.read_csv(customers_csv, index_col = False)

#### 1. Check the dimensions of the imported dataframe

In [4]:
# Ensure nothing looks out of place with the imported dataframe

df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# Ensure nothing looks out of place with the imported dataframe

df_customers.tail()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095


In [6]:
# See if the data set is large, small, wide, or long

df_customers.shape

(206209, 10)

#### 2. Make column names consistent

In [7]:
# Ensure nothing looks out of place with the imported dataframes

df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [8]:
# Give every customer column a consistent lower-case snake_case name
# The mapping is old name -> new name; the dataframe is updated in place (inplace = True)

customer_column_names = {
    'First Name' : 'first_name',
    'Surnam' : 'last_name',
    'Gender' : 'gender',
    'STATE' : 'state',
    'Age' : 'age',
    'n_dependants' : 'number_of_dependants',
    'fam_status' : 'family_status',
}
df_customers.rename(columns = customer_column_names, inplace = True)

In [9]:
# Review updated column names

df_customers.columns

Index(['user_id', 'first_name', 'last_name', 'gender', 'state', 'age',
       'date_joined', 'number_of_dependants', 'family_status', 'income'],
      dtype='object')

#### 3. Check basic statistics for irregularities

In [10]:
# Summarise income, age and number of dependants per state (min, max, mean)
# to spot any obvious irregularities

summary_stats = ['min', 'max', 'mean']
df_customers_stats1 = (
    df_customers
    .groupby('state')
    .agg({column: list(summary_stats) for column in ('income', 'age', 'number_of_dependants')})
)

In [11]:
# Sort df_customers_stats1 to make it easier to gauge the largest and smallest income values

df_customers_stats1.reset_index().sort_values([('income', 'max')], ascending = False)

Unnamed: 0_level_0,state,income,income,income,age,age,age,number_of_dependants,number_of_dependants,number_of_dependants
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,min,max,mean,min,max,mean
47,Washington,26475,593901,94980.567648,18,81,49.875835,0,3,1.488746
18,Louisiana,26045,591089,96357.803611,18,81,49.410586,0,3,1.499629
50,Wyoming,26439,590790,95287.963394,18,81,49.122186,0,3,1.511007
38,Pennsylvania,26146,584097,95100.476132,18,81,49.44744,0,3,1.500866
14,Indiana,26348,579397,95488.914688,18,81,49.705737,0,3,1.48541
13,Illinois,26197,579169,94384.55094,18,81,49.31998,0,3,1.487883
44,Utah,26306,578551,94842.953995,18,81,49.845164,0,3,1.512738
33,North Carolina,26543,578251,94611.829087,18,81,49.279001,0,3,1.513975
24,Mississippi,25999,577728,94357.693792,18,81,49.514717,0,3,1.506307
45,Vermont,26411,576876,95767.368786,18,81,49.938412,0,3,1.498887


#### 4. Check for missing values (NaN)

In [12]:
# Check whether the df_customers dataframe has any missing values
# and count how many there are in each column

df_customers.isnull().sum()

user_id                     0
first_name              11259
last_name                   0
gender                      0
state                       0
age                         0
date_joined                 0
number_of_dependants        0
family_status               0
income                      0
dtype: int64

In [13]:
# Identify any missing values in the "first_name" column, represented as NaN

df_customers['first_name'].value_counts(dropna = False)

NaN        11259
Marilyn     2213
Barbara     2154
Todd        2113
Jeremy      2104
           ...  
Eugene       197
Merry        197
Garry        191
Ned          186
David        186
Name: first_name, Length: 208, dtype: int64

In [14]:
# Identify and retrieve the rows whose "first_name" value is missing (NaN)

missing_first_name = df_customers['first_name'].isnull()
df_customers_nan = df_customers[missing_first_name]

In [15]:
# Review "first_name" column NaN values

df_customers_nan.head(10)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,family_status,income
53,76659,,Gilbert,Male,Colorado,26,1/1/2017,2,married,41709
73,13738,,Frost,Female,Louisiana,39,1/1/2017,0,single,82518
82,89996,,Dawson,Female,Oregon,52,1/1/2017,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1/1/2017,1,married,155673
105,29778,,Dawson,Female,Utah,63,1/1/2017,3,married,151819
128,8562,,Oconnor,Male,Utah,46,1/1/2017,1,married,134898
140,149267,,Hutchinson,Male,South Carolina,20,1/1/2017,0,single,86778
149,82632,,Orr,Male,Hawaii,61,1/1/2017,1,married,118130
155,172331,,Williamson,Female,Alaska,27,1/1/2017,0,single,55047
236,182963,,Nicholson,Female,New Mexico,58,1/2/2017,1,married,163391


In [16]:
# Calculate the percentage of rows with a missing "first_name"
# The counts are read from the dataframe itself (instead of being typed in by hand),
# so the figure stays correct if the data changes.
# With the current data (11259 of 206209 rows) this is about 5.5% of the total,
# so, the "first_name" column will be kept.

missing_first_names = int(df_customers['first_name'].isnull().sum())
total_customers = len(df_customers)
# Guard against an empty dataframe to avoid a ZeroDivisionError
x = round((missing_first_names / total_customers * 100), 2) if total_customers else 0.0
y = '%'
print("{0}{1}".format(x, y))

5.46%


#### 5. Check for any mixed-type data

In [17]:
# Check whether the df_customers dataframe contains any mixed-type columns
# This code checks whether the data types within the column are consistent or not.
# If not, it prints the problematic column/s
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated since pandas 2.1

for col in df_customers.columns.tolist():
    # Python type of every value in the column
    value_types = df_customers[col].map(type)
    # Nothing to compare in an empty dataframe
    if value_types.empty:
        continue
    # A row is "weird" when its value type differs from the type of the column's first value
    weird = value_types != value_types.iloc[0]
    if weird.any(): print (col)

first_name


#### 6. Check for any duplicates

In [18]:
# Collect any rows that are exact copies of an earlier row (full-row duplicates)

duplicate_mask = df_customers.duplicated()
df_customers_duplicates = df_customers[duplicate_mask]

In [19]:
# Check the "shape" to see if there are any duplicate rows

df_customers_duplicates.shape

(0, 10)

In [20]:
# Review the main dataframe "df_customers", before combining with "orders_products_merged"

df_customers.head(10)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Cynthia,Noble,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Chris,Walton,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Joseph,Hickman,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Jeremy,Vang,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Shawn,Chung,Male,Virginia,26,1/1/2017,2,married,32072


In [21]:
# Final check of "df_customers" dataframe before merging

df_customers.shape

(206209, 10)

#### 7. Import orders_products_merged_v3 

In [21]:
# Load the prepared orders/products data (orders_products_merged_v3.pkl) from the "Prepared Data" folder

orders_products_pkl = os.path.join(path, '03 Scripts', 'Prepared Data', 'orders_products_merged_v3.pkl')
orders_products_merged = pd.read_pickle(orders_products_pkl)

In [22]:
# Ensure nothing looks out of place with the imported dataframe

orders_products_merged.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,...,match,price_range_loc,busiest day,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spend_flag,mean_days_since_prior,frequency_flag
0,1,2398795,1,prior,2,3,7,15.0,196,1,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low spender,20.259259,Regular customer
1,2,473747,1,prior,3,3,12,21.0,196,1,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low spender,20.259259,Non-frequent customer
2,3,2254736,1,prior,4,4,7,29.0,196,1,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low spender,20.259259,Non-frequent customer
3,4,431534,1,prior,5,4,15,28.0,196,1,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low spender,20.259259,Non-frequent customer
4,5,3367565,1,prior,6,2,7,19.0,196,1,...,both,Mid-range product,Regularly busy,Average orders,10,New customer,6.372222,Low spender,20.259259,Regular customer


In [23]:
# Remove the two "Unnamed: 0" columns and "eval_set", which are not needed for the analysis
# The dataframe is modified in place (inplace = True)

unneeded_columns = ['Unnamed: 0_x', 'Unnamed: 0_y', 'eval_set']
orders_products_merged.drop(columns = unneeded_columns, inplace = True)

In [24]:
# Give the abbreviated columns more descriptive names (old name -> new name), in place

order_column_names = {
    'order_dow' : 'orders_day_of_the_week',
    'price_range_loc' : 'price_range',
}
orders_products_merged.rename(columns = order_column_names, inplace = True)

In [25]:
# Review column names

orders_products_merged.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', '_merge', 'product_name', 'aisle_id',
       'department_id', 'prices', 'match', 'price_range', 'busiest day',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'mean_prices',
       'spend_flag', 'mean_days_since_prior', 'frequency_flag'],
      dtype='object')

In [26]:
# Review the column count to see that the deleted columns have been dropped

orders_products_merged.shape

(30328763, 24)

In [27]:
# After reviewing the orders_products_merged dataframe, I noticed an unnecessary column but had to be sure before dropping it

orders_products_merged.groupby(['_merge']).agg({'_merge': ['count']})

Unnamed: 0_level_0,_merge
Unnamed: 0_level_1,count
_merge,Unnamed: 1_level_2
left_only,0
right_only,0
both,30328763


In [32]:
# Drop column that is not needed
# "_merge" holds only the value 'both' (see the count above), so it carries no further information

orders_products_merged.drop(columns=['_merge'], inplace = True)

In [33]:
orders_products_merged.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'match', 'price_range', 'busiest day',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'mean_prices',
       'spend_flag', 'mean_days_since_prior', 'frequency_flag'],
      dtype='object')

#### 8. Merge dataframes: orders_products_merged and df_customers

In [34]:
# Merge the orders_products_merged dataframe with the df_customers dataframe
# They have a different shape but share the "user_id" column
# The indicator = 'combined' argument adds a column recording where each row's key was found;
# it is named "combined" to keep it distinct from the "_merge" column of the previous merge
# NOTE: with how = 'inner' only matched rows are kept, so "combined" can only ever be 'both'.
# The checks that can actually fail are therefore:
#   - validate = 'many_to_one' raises a MergeError if a user_id occurs more than once in df_customers
#   - the row-count assertion fails if any order row found no matching customer

df_merged_all = orders_products_merged.merge(df_customers, how = 'inner', on = 'user_id', indicator = 'combined', validate = 'many_to_one')

assert len(df_merged_all) == len(orders_products_merged), 'Some order rows have no matching customer'

In [35]:
# The value_counts() function counts how many rows fall into each category of the "combined" column
# to see whether there is a full match or not

df_merged_all['combined'].value_counts()

both          30328763
right_only           0
left_only            0
Name: combined, dtype: int64

In [36]:
# Ensure nothing looks out of place with the merged dataframes

df_merged_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,first_name,last_name,gender,state,age,date_joined,number_of_dependants,family_status,income,combined
0,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,473747,1,3,3,12,21.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,431534,1,5,4,15,28.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [37]:
# Review the column count: 23 order/product columns + 9 customer columns ("user_id" is shared) + the "combined" indicator = 33

df_merged_all.shape

(30328763, 33)

In [38]:
# Ensure they are the correct columns

df_merged_all.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'match', 'price_range', 'busiest day',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'mean_prices',
       'spend_flag', 'mean_days_since_prior', 'frequency_flag', 'first_name',
       'last_name', 'gender', 'state', 'age', 'date_joined',
       'number_of_dependants', 'family_status', 'income', 'combined'],
      dtype='object')

#### 9. Final changes and checks before exporting

In [39]:
# After reviewing the combined dataframes again, I noticed one unnecessary column but had to be sure before dropping it

df_merged_all.groupby(['combined']).agg({'combined': ['count']})

Unnamed: 0_level_0,combined
Unnamed: 0_level_1,count
combined,Unnamed: 1_level_2
left_only,0
right_only,0
both,30328763


In [40]:
# Remove the "combined" merge-indicator column, which is no longer needed (in place)

df_merged_all.drop('combined', axis = 'columns', inplace = True)

In [41]:
# A final review of the columns

df_merged_all.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'match', 'price_range', 'busiest day',
       'busiest_period_of_day', 'max_order', 'loyalty_flag', 'mean_prices',
       'spend_flag', 'mean_days_since_prior', 'frequency_flag', 'first_name',
       'last_name', 'gender', 'state', 'age', 'date_joined',
       'number_of_dependants', 'family_status', 'income'],
      dtype='object')

In [42]:
# A final review of the shape

df_merged_all.shape

(30328763, 32)

#### 10. Export dataframe as pkl file

In [43]:
# Save the merged orders/products/customers dataframe as a pickle file in the "Prepared Data" folder

export_pkl = os.path.join(path, '03 Scripts', 'Prepared Data', 'orders_products_customers_merged.pkl')
df_merged_all.to_pickle(export_pkl)