# 4.9: Intro to Data Visualization with Python

# Part 1

## This scripts contains the following:
### 1. importing customer data & wrangling
### 2. data quality and consistency checks
### 3. merging customer data and prepared data
### 4. exporting merged dataframe

In [5]:
#importing libraries and analysis tools
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### 1. importing customer data and wrangling

In [14]:
#import customer original data

In [3]:
path = r'C:/Users/LocalAdmin/Instacart Basket Analysis'

In [19]:
path

'C:/Users/LocalAdmin/Instacart Basket Analysis'

In [5]:
customers = pd.read_csv(os.path.join(path,'02 Data', 'Original Data', 'customers.csv'))

In [7]:
#Shape for the customer dataframe
customers.shape

(206209, 10)

In [21]:
#checking the head for customers dataframe
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [34]:
#Question 4: Wrangle the data so that it follows consistent logic; for example, rename columns with illogical names and drop columns that don’t add anything to your analysis.

In [9]:
#check the datatype for each column
customers.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

User_id must bechnaged to an object as its summary stats are meaningless as the user_id is only an identifier for a user

In [None]:
#changing datatype for user_id to string (object)


In [11]:
# Changing datatype for user_id from integer to string or object
customers['user_id'] = customers['user_id'].astype('str')

In [13]:
#checking if user_id is now a string
customers.dtypes

user_id         object
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [15]:
# Checking if there is mixed datatypes in the different columns
for col in customers.columns.tolist():
  weird = (customers[[col]].map(type) != customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (customers[weird]) > 0:
    print (col)

First Name


In [23]:
# Proceed to make 'first_name' a string
customers['First Name'] = customers['First Name'].astype('str')

In [25]:
# Checking if there is mixed datatypes in the different columns
for col in customers.columns.tolist():
  weird = (customers[[col]].map(type) != customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (customers[weird]) > 0:
    print (col)

No mixed datatypes for each column

In [29]:
#I use head to see how the data looks
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


'First Name' must be captured as 'first_name', 'Surnam' must be captured as 'surname', 'Gender' must be captured as 'gender', 'STATE' must be captured as 'state' and 'Age' must be captured as 'age', 'family_status' must be captured as 'marital_status'

In [31]:
# Renaming the columns identifed above
customers.rename(columns = {'First Name' : 'first_name'}, inplace = True)

In [33]:
customers.rename(columns = {'Surnam' : 'surname'}, inplace = True)

In [35]:
customers.rename(columns = {'Gender' : 'gender'}, inplace = True)

In [37]:
customers.rename(columns = {'Age' : 'age'}, inplace = True)

In [39]:
customers.rename(columns = {'STATE' : 'state'}, inplace = True)

In [41]:
customers.rename(columns = {'fam_status' : 'marital_status'}, inplace = True)

In [43]:
#Checking head again to see if the column titles have changed
customers.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


All columns are okay now

In [45]:
#check the shape for customers dataframe
customers.shape

(206209, 10)

In [47]:
#Getting summary stats for the customers dataframe
customers.describe()

Unnamed: 0,age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


Summary stats are done for variables or columns whose datatype is numerical, hence all is okay here.

####  2. data quality and consistency checks 

In [51]:
# Check for missing data across the various columns
customers.isnull().sum()

user_id           0
first_name        0
surname           0
gender            0
state             0
age               0
date_joined       0
n_dependants      0
marital_status    0
income            0
dtype: int64

No data is missing for all the columns

In [53]:
# Checking the dataset for any duplicates
customers_dups = customers[customers.duplicated()]

In [55]:
#calling out customers_dups to see duplicates
customers_dups

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,marital_status,income


No duplicates across all columns, thus all is okay with the dataframe.

### 3. merging customer data and prepared Instacart data 

In [57]:
# Import the latest prepared data from Ex 4.8
orders_products = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_new.pkl'))

In [59]:
# checking the head of orders_products
orders_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_product_price,spending_flag,median_days_between_orders,order_frequency
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Mid-range product,Regularly busy,Regularly busy,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Mid-range product,Regularly busy,Regularly busy,Average orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,Mid-range product,Busiest day,Busiest days,Average orders,5,New customer,7.930208,Low spender,8.0,Frequent customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,Mid-range product,Regularly busy,Slowest days,Most orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,Mid-range product,Least busy,Slowest days,Average orders,3,New customer,4.972414,Low spender,9.0,Frequent customer


In [61]:
# check the head for customers dataframe
customers.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


Both dataframes got 'user_id' as a common variable and can be used to merge the dataframes

In [146]:
#checking if the datatype for 'user_id' is tyhe same for the two dataframes
customers.dtypes['user_id']

dtype('O')

In [155]:
orders_products.dtypes['user_id']

dtype('O')

In [63]:
# Convert 'user_id' under orders_products to string
orders_products['user_id'] = orders_products['user_id'].astype('str')

In [64]:
#Check again the data type for 'user_id' under orders_products
orders_products.dtypes['user_id']

dtype('O')

In [67]:
# Doing away with'_merge' column from previous merging exercises
orders_products = orders_products.drop(columns=['_merge'])

In [173]:
# Using inner join to merge the two dataframes using the column 'user_id' as the key


In [69]:
# Using inner join to merge the two dataframes using the column 'user_id' as the key.
customers_ords_prods = orders_products.merge(customers, on = 'user_id', indicator = True)

In [73]:
#Checking the shape for the customers_ords_prods
customers_ords_prods.shape

(32434212, 33)

### 4. Export this new dataframe as a pickle file

In [170]:
customers_ords_prods.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'customers_ords_prods.pkl'))