# 01 Importing libraries

In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02 Importing data

In [4]:
# Project root folder. Set the INSTACART_PATH environment variable to run the
# notebook on another machine; otherwise the original local folder is used.
path = os.environ.get('INSTACART_PATH', r'/Users/woodoooo/Desktop/Instacart Basket Analysis')

# Load the raw customer data from the 'Original Data' folder.
# index_col=False tells pandas not to use the first column as the index.
df_customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index_col=False)

In [5]:
# Preview the first rows to confirm the customer file loaded as expected
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Number of rows and columns in the customer dataframe
df_customers.shape

(206209, 10)

# 04 Wrangling the data -- renaming columns

In [7]:
# Give every column a consistent lower-case snake_case name (and fix the
# misspelled 'Surnam') in one step instead of one cell per column.
# rename() leaves the column order unchanged and ignores keys that are absent.
column_renames = {
    'First Name': 'first_name',
    'Surnam': 'last_name',
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'fam_status': 'marital_status',
}
df_customers = df_customers.rename(columns=column_renames)

In [13]:
# Confirm that the renamed columns show up as expected
df_customers.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


# 05 Data quality and consistency checks

In [14]:
# Overview of column dtypes and non-null counts
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   user_id         206209 non-null  int64 
 1   first_name      194950 non-null  object
 2   last_name       206209 non-null  object
 3   gender          206209 non-null  object
 4   state           206209 non-null  object
 5   age             206209 non-null  int64 
 6   date_joined     206209 non-null  object
 7   n_dependants    206209 non-null  int64 
 8   marital_status  206209 non-null  object
 9   income          206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [15]:
# Changing the type of the 'date_joined' column to datetime type.
# The raw values look like '1/1/2017' (month/day/year), so the format is stated
# explicitly rather than inferred; a value that does not match raises an error
# instead of being parsed with a guessed day/month order.

df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'], format='%m/%d/%Y')

In [16]:
# Collect every column that holds values of more than one Python type,
# then print the offending column names one per line.
mixed_type_columns = [
    column_name
    for column_name in df_customers.columns
    if df_customers[column_name].apply(type).nunique() > 1
]
for column_name in mixed_type_columns:
    print(column_name)

first_name


In [17]:
# Downcast the integer columns to int32
int32_columns = ['user_id', 'age', 'n_dependants', 'income']
df_customers = df_customers.astype(dict.fromkeys(int32_columns, 'int32'))

In [18]:
# Count the missing values in the 'first_name' column
df_customers['first_name'].isna().sum()

np.int64(11259)

There are missing values in **first_name**: only **194,950** of the 206,209 rows are non-null, so **11,259** values are missing.

In [19]:
# Show the first ten customers whose first name is missing
missing_first_name = df_customers['first_name'].isna()
df_customers.loc[missing_first_name].head(10)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,marital_status,income
53,76659,,Gilbert,Male,Colorado,26,2017-01-01,2,married,41709
73,13738,,Frost,Female,Louisiana,39,2017-01-01,0,single,82518
82,89996,,Dawson,Female,Oregon,52,2017-01-01,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,2017-01-01,1,married,155673
105,29778,,Dawson,Female,Utah,63,2017-01-01,3,married,151819
128,8562,,Oconnor,Male,Utah,46,2017-01-01,1,married,134898
140,149267,,Hutchinson,Male,South Carolina,20,2017-01-01,0,single,86778
149,82632,,Orr,Male,Hawaii,61,2017-01-01,1,married,118130
155,172331,,Williamson,Female,Alaska,27,2017-01-01,0,single,55047
236,182963,,Nicholson,Female,New Mexico,58,2017-01-02,1,married,163391


In [20]:
# The name columns are not needed for the analysis and are personal data,
# so both are removed from the dataframe.
name_columns = ['first_name', 'last_name']
df_customers = df_customers.drop(columns=name_columns)

In [21]:
# Missing-value count for each remaining column
df_customers.isna().sum()

user_id           0
gender            0
state             0
age               0
date_joined       0
n_dependants      0
marital_status    0
income            0
dtype: int64

In [22]:
# Checking for duplicates
# NOTE: duplicated() without `subset` only flags rows that repeat in every column;
# on its own it does not prove that user_id is unique.

df_dup = df_customers.duplicated()

In [23]:
# Number of rows flagged as full-row duplicates
df_dup.sum()

np.int64(0)

# 06 Importing the Instacart data  (4.8_ords_prods_merge_final.pkl)

In [24]:
# Load the orders/products dataframe from the 'Prepared Data' folder.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df_ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', '4.8_ords_prods_merge_final.pkl'))

In [25]:
# Preview the first rows of the orders/products dataframe
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,price_range_loc,Busiest day,Busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,avg_spending_flag,user_median_order_frequency,order_frequency_flag
0,2539329,1,1,2,8,,True,196,1,0,...,Mid-range,Regularly busy,Regularly busy day,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2539329,1,1,2,8,,True,14084,2,0,...,Mid-range,Regularly busy,Regularly busy day,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,2539329,1,1,2,8,,True,12427,3,0,...,Low-range,Regularly busy,Regularly busy day,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2539329,1,1,2,8,,True,26088,4,0,...,Low-range,Regularly busy,Regularly busy day,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,2539329,1,1,2,8,,True,26405,5,0,...,Low-range,Regularly busy,Regularly busy day,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [26]:
# Overview of the columns and dtypes of the orders/products dataframe
df_ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 25 columns):
 #   Column                       Dtype   
---  ------                       -----   
 0   order_id                     int32   
 1   user_id                      int32   
 2   order_number                 int8    
 3   order_day_of_week            int8    
 4   order_hour_of_day            int8    
 5   days_since_prior_order       float32 
 6   new_customer                 bool    
 7   product_id                   int32   
 8   add_to_cart_order            int32   
 9   reordered                    int8    
 10  product_name                 object  
 11  aisle_id                     int32   
 12  department_id                int32   
 13  prices                       float32 
 14  match                        category
 15  price_range_loc              object  
 16  Busiest day                  object  
 17  Busiest_days                 object  
 18  busiest_period_of_da

In [27]:
# Number of rows and columns in the orders/products dataframe
df_ords_prods_merge.shape

(32404859, 25)

In [28]:
# Inspect the 'match' column; its categories (left_only/right_only/both) suggest it is
# a merge indicator left over from an earlier merge -- TODO confirm
df_ords_prods_merge['match'].head()

0    both
1    both
2    both
3    both
4    both
Name: match, dtype: category
Categories (3, object): ['left_only', 'right_only', 'both']

In [29]:
# Remove the 'match' column before merging with the customer data
df_ords_prods_merge = df_ords_prods_merge.drop(columns='match')

In [30]:
# Confirm that the 'match' column is gone
df_ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 24 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   order_id                     int32  
 1   user_id                      int32  
 2   order_number                 int8   
 3   order_day_of_week            int8   
 4   order_hour_of_day            int8   
 5   days_since_prior_order       float32
 6   new_customer                 bool   
 7   product_id                   int32  
 8   add_to_cart_order            int32  
 9   reordered                    int8   
 10  product_name                 object 
 11  aisle_id                     int32  
 12  department_id                int32  
 13  prices                       float32
 14  price_range_loc              object 
 15  Busiest day                  object 
 16  Busiest_days                 object 
 17  busiest_period_of_day        object 
 18  max_order                    int8   
 19

# 07 Merging the dataframes

In [31]:
# The key column shared by both dataframes is user_id

In [32]:
# Attach each customer's attributes to every order/product row.
# - how='inner' keeps only rows whose user_id exists in BOTH dataframes.
# - validate='many_to_one' makes pandas raise a MergeError if user_id is not
#   unique in df_customers, which would otherwise silently duplicate order rows.
# - indicator=True adds the '_merge' column showing the merge result.
df_ords_prods_customers = pd.merge(df_ords_prods_merge,
                                  df_customers,
                                  on='user_id',             # common key column
                                  how='inner',
                                  validate='many_to_one',
                                  indicator=True)

In [33]:
# Row count should equal that of df_ords_prods_merge if every order row found its customer
df_ords_prods_customers.shape

(32404859, 32)

In [34]:
# Preview the first ten rows of the merged dataframe
df_ords_prods_customers.head(10)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,user_median_order_frequency,order_frequency_flag,gender,state,age,date_joined,n_dependants,marital_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2539329,1,1,2,8,,True,14084,2,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,2539329,1,1,2,8,,True,12427,3,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2539329,1,1,2,8,,True,26088,4,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,2539329,1,1,2,8,,True,26405,5,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
5,2398795,1,2,3,7,15.0,False,196,1,1,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
6,2398795,1,2,3,7,15.0,False,10258,2,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
7,2398795,1,2,3,7,15.0,False,12427,3,1,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
8,2398795,1,2,3,7,15.0,False,13176,4,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
9,2398795,1,2,3,7,15.0,False,26088,5,1,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both


In [35]:
# Summary statistics of the merged dataframe, used as a sanity check on value ranges
df_ords_prods_customers.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id,prices,max_order,avg_price,user_median_order_frequency,age,date_joined,n_dependants,income
count,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,30328760.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404850.0,32404860.0,32404859,32404860.0,32404860.0
mean,1710745.0,102937.2,17.1423,2.738867,13.42515,11.10409,25598.66,8.352547,0.5895873,71.19612,9.919792,7.790887,33.05217,7.790884,10.39776,49.46527,2018-08-16 15:37:38.003034880,1.501896,99437.73
min,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,18.0,2017-01-01 00:00:00,0.0,25903.0
25%,855947.0,51422.0,5.0,1.0,10.0,5.0,13544.0,3.0,0.0,31.0,4.0,4.2,13.0,7.378478,6.0,33.0,2017-10-22 00:00:00,1.0,67004.0
50%,1711049.0,102616.0,11.0,3.0,13.0,8.0,25302.0,6.0,1.0,83.0,9.0,7.4,26.0,7.811882,8.0,49.0,2018-08-16 00:00:00,2.0,96618.0
75%,2565499.0,154389.0,24.0,5.0,16.0,15.0,37947.0,11.0,1.0,107.0,16.0,11.3,47.0,8.229228,13.0,65.0,2019-06-09 00:00:00,3.0,127912.0
max,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0,134.0,21.0,25.0,99.0,23.2,30.0,81.0,2020-04-01 00:00:00,3.0,593901.0
std,987298.8,59466.1,17.53532,2.090077,4.24638,8.37857,14084.0,7.127071,0.4919087,38.21139,6.281485,4.100591,25.15525,1.020555,7.032845,18.48558,,1.118865,43057.27


In [36]:
# Overview of the columns and dtypes of the merged dataframe
df_ords_prods_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 32 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   order_id                     int32         
 1   user_id                      int32         
 2   order_number                 int8          
 3   order_day_of_week            int8          
 4   order_hour_of_day            int8          
 5   days_since_prior_order       float32       
 6   new_customer                 bool          
 7   product_id                   int32         
 8   add_to_cart_order            int32         
 9   reordered                    int8          
 10  product_name                 object        
 11  aisle_id                     int32         
 12  department_id                int32         
 13  prices                       float32       
 14  price_range_loc              object        
 15  Busiest day                  object        
 16

# 08 Exporting the data

In [38]:
# Make sure the export folder exists before writing (no error if it is already there)
prepared_data_dir = os.path.join(path, '02 Data', 'Prepared Data')
os.makedirs(prepared_data_dir, exist_ok=True)

In [39]:
# Save the merged orders/products/customers dataframe as a pickle file
export_file = os.path.join(path, '02 Data', 'Prepared Data', '4.9_ords_prods_customers.pkl')
df_ords_prods_customers.to_pickle(export_file)