# 4.9. Data Wrangling and Merging Customer Dataframe

Contents
1.  Import Libraries
2.  Import Dataframes
3.  Check Data Consistencies
4.  Data Wrangling
5.  Joining Dataframes
6.  Exporting Dataframe


# 01. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Import Dataframes

In [None]:
# root folder of the project; every import/export below is built relative to
# this path with os.path.join
path = r'C:\Users\mngun\Documents\11_2023_InstaCart_asket_Analysis'

In [None]:
# load the raw customer data from the 'Original Data' folder
customer_csv = os.path.join(path, '02_Data', 'Original Data', 'customers.csv')
customer = pd.read_csv(customer_csv)

# 03. Check Data Consistencies

In [None]:
# reviewing dataframe shape as (number of rows, number of columns)
customer.shape

In [None]:
# previewing the first rows to review the columns and their values
customer.head()

In [None]:
# checking descriptive statistics to spot implausible values
customer.describe()

In [None]:
# counting the missing values in each column
customer.isna().sum()

- I'm going to drop the 'First Name' and 'Surnam' columns. The names will not be useful for the analysis and have too many missing values.
- Also, for data protection purposes, their identities should be protected.

In [None]:
# checking how many customers there are in each state
customer['STATE'].value_counts()

In [None]:
# confirming the number of distinct states
customer['STATE'].nunique()

In [None]:
# checking the frequency of each family status value
customer['fam_status'].value_counts()

# 04. Data Wrangling

In [None]:
# remove the first and last name columns (not needed for the analysis and
# personally identifying); the result is bound back to `customer`
customer = customer.drop(['First Name', 'Surnam'], axis = 1)

In [None]:
# confirming the column count after dropping the name columns
customer.shape

In [None]:
# previewing the dataframe after dropping the name columns
customer.head()

In [None]:
# old column name -> new lower-case column name
column_mapping = dict(Gender = 'gender', STATE = 'state', Age = 'age')

In [None]:
# apply the column_mapping to the column labels, modifying `customer` in place
customer.rename(column_mapping, axis = 'columns', inplace = True)

In [None]:
# confirming the renamed columns
customer.head()

In [None]:
# confirming data type of user_id column - it is the merge key, so it must
# match the data type of user_id in the dataframe it is merged with
customer['user_id'].dtype

In [None]:
# report, for every column, whether its values are of more than one Python type
for column_name in customer.columns:
    distinct_types = customer[column_name].apply(type).nunique()
    verdict = ' has mixed datatype' if distinct_types > 1 else ' no mixed datatype'
    print(column_name + verdict)

In [None]:
# collect every row that is a full duplicate of an earlier row
duplicate_mask = customer.duplicated()
df_dups = customer.loc[duplicate_mask]

In [None]:
# displaying the duplicate rows - an empty dataframe means none are present
df_dups

# 05. Joining Dataframes

In [None]:
# load the previously prepared Instacart orders/products dataframe
merge_flags_pkl = os.path.join(path, '02_Data', 'Prepared Data', 'ords_prods_merge_flags.pkl')
ords_prods_merge_flags = pd.read_pickle(merge_flags_pkl)

In [None]:
# previewing the first rows of the imported dataframe
ords_prods_merge_flags.head()

In [None]:
# reviewing dataframe shape as (number of rows, number of columns)
ords_prods_merge_flags.shape

In [None]:
# listing the column names to decide which columns to drop before the merge
ords_prods_merge_flags.columns.to_list()

In [None]:
# drop irrelevant columns for a less memory-intensive merge
# NOTE: '_merge' is among them; the later merge is run with indicator = True,
# which adds its own '_merge' column
unneeded_columns = ['price_range_loc', '_merge', 'busiest_day', 'max_order']
ords_prods_new = ords_prods_merge_flags.drop(unneeded_columns, axis = 'columns')

In [None]:
# confirming the reduced column count in the new dataframe
ords_prods_new.shape

In [None]:
# confirm data type of user_id in 'ords_prods_new' dataframe - it must match
# the data type of user_id in 'customer' for the merge
ords_prods_new['user_id'].dtype

In [None]:
# left-join the customer attributes onto every order/product row by user_id;
# indicator = True adds a '_merge' column recording where each row matched
ords_prods_cust = pd.merge(
    ords_prods_new,
    customer,
    how = 'left',
    on = 'user_id',
    indicator = True,
)

In [None]:
# previewing the first rows of the merged dataframe
ords_prods_cust.head()

In [None]:
# confirming the success of the merge: 'both' counts rows that found a
# matching customer, 'left_only' counts rows with no customer match
ords_prods_cust['_merge'].value_counts()

# 06. Exporting Dataframe

In [None]:
# save the merged dataframe as a pickle file in the 'Prepared Data' folder
export_pkl = os.path.join(path, '02_Data', 'Prepared Data', 'ords_prods_cust.pkl')
ords_prods_cust.to_pickle(export_pkl)