### 🔹 Step 1: Setup & Imports

In [30]:
# Imports
import pandas as pd
import numpy as np
import os

# Set path to project directory
path = r'C:\Users\moein\anaconda3\4 - Python Fundamentals'

# Load customer dataset
df_customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

# Preview the dataset
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


### 🔹 Step 1.5: Inspect and align key column types

In [31]:
# Describe the customer dataframe to identify any potential issues
df_customers.describe(include='all')

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
count,206209.0,194950,206209,206209,206209,206209.0,206209,206209.0,206209,206209.0
unique,,207,1000,2,51,,1187,,4,
top,,Marilyn,Hamilton,Male,Florida,,9/17/2018,,married,
freq,,2213,252,104067,4044,,213,,144906,
mean,103105.0,,,,,49.501646,,1.499823,,94632.852548
std,59527.555167,,,,,18.480962,,1.118433,,42473.786988
min,1.0,,,,,18.0,,0.0,,25903.0
25%,51553.0,,,,,33.0,,0.0,,59874.0
50%,103105.0,,,,,49.0,,1.0,,93547.0
75%,154657.0,,,,,66.0,,3.0,,124244.0


## Issues Identified
- Inconsistent column names: Columns like "First Name" and "Surname" have spaces and inconsistent casing.

- Mixed data types (e.g., fam_status) might not show up clearly but must be checked.

- Missing values: Some categorical columns (like fam_status) show fewer unique values than expected and may have empty strings or NaNs.

- Key column user_id must match user_id type in ords_prods_merge (usually int).

- No duplicates detected yet, but best to check.

In [32]:
# Standardize column names: lower case, replace spaces with underscores
df_customers.columns = df_customers.columns.str.lower().str.replace(' ', '_')

# Check for missing values
print("Missing values:\n", df_customers.isnull().sum())

# Check for duplicates
print("\nDuplicate rows:", df_customers.duplicated().sum())

# Drop duplicates if any
df_customers = df_customers.drop_duplicates()

# Convert user_id to int to match ords_prods_merge key
df_customers['user_id'] = df_customers['user_id'].astype(int)

# Convert fam_status to string type
df_customers['fam_status'] = df_customers['fam_status'].astype(str)

# Replace empty strings with NaN in fam_status
df_customers['fam_status'].replace('', pd.NA, inplace=True)

# Confirm cleanup
print("\nData types:\n", df_customers.dtypes)

Missing values:
 user_id             0
first_name      11259
surnam              0
gender              0
state               0
age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

Duplicate rows: 0

Data types:
 user_id          int32
first_name      object
surnam          object
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_customers['fam_status'].replace('', pd.NA, inplace=True)


### 🔹 Step 2: Wrangle the data

In [33]:
# Rename incorrect column
df_customers.rename(columns={'Surnam': 'Surname'}, inplace=True)

# Drop 'gender' dolumn as one which is not needed
df_customers.drop(columns=['gender'], inplace=True)

# Confirm changes
df_customers.columns

Index(['user_id', 'first_name', 'surnam', 'state', 'age', 'date_joined',
       'n_dependants', 'fam_status', 'income'],
      dtype='object')

### 🔹 Step 5: Clean data

In [34]:
# Check for missing values
df_customers.isnull().sum()

# Drop rows or fill in missing values
df_customers.dropna(subset=['first_name'], inplace=True)

# Check for duplicates
df_customers.duplicated().sum()

# Convert 'date_joined' to datetime
df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'])

# Check column data types
df_customers.dtypes

user_id                  int32
first_name              object
surnam                  object
state                   object
age                      int64
date_joined     datetime64[ns]
n_dependants             int64
fam_status              object
income                   int64
dtype: object

### 🔹 Step 4: Load merged data and combine with customer data

In [35]:
# Define the path to the prepared data folder
prepared_path = os.path.join(path, '02 Data', 'Prepared Data')

# Load the previously merged Instacart data
file_path_2 = os.path.join(prepared_path, 'ords_prods_merge_enriched.pkl')
ords_prods_merge = pd.read_pickle(file_path_2)

# Ensure 'user_id' is the same data type in both dataframes
df_customers['user_id'] = df_customers['user_id'].astype(int)
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype(int)

# Merge customer data with Instacart data on 'user_id'
df_merged_final = ords_prods_merge.merge(df_customers, on='user_id', how='left')

# Check the result
print(df_merged_final.shape)
df_merged_final.head()

(32434212, 32)


Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,...,median_days_since_prior,frequency_flag,first_name,surnam,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,0.0,196,1,0,both,...,20.0,Regular customer,Linda,Nguyen,Alabama,31.0,2019-02-17,3.0,married,40423.0
1,2539329,1,1,2,8,0.0,14084,2,0,both,...,20.0,Regular customer,Linda,Nguyen,Alabama,31.0,2019-02-17,3.0,married,40423.0
2,2539329,1,1,2,8,0.0,12427,3,0,both,...,20.0,Regular customer,Linda,Nguyen,Alabama,31.0,2019-02-17,3.0,married,40423.0
3,2539329,1,1,2,8,0.0,26088,4,0,both,...,20.0,Regular customer,Linda,Nguyen,Alabama,31.0,2019-02-17,3.0,married,40423.0
4,2539329,1,1,2,8,0.0,26405,5,0,both,...,20.0,Regular customer,Linda,Nguyen,Alabama,31.0,2019-02-17,3.0,married,40423.0


In [36]:
# Export as pickle
df_merged_final.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust_merge.pkl'))