# 01. Cleaning up df_prods dataframe

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder. Defaults to the original local path; set the
# INSTACART_PROJECT_PATH environment variable to run the notebook from another location.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\jomok\Documents\Career Foundry\Achievement 4\07-2023 Instacart Basket Analysis',
)

In [3]:
# Load the original products table and the previously wrangled orders table.
# Both files live under "<path>/02 Data/<folder>/<file>" and are read with index_col=False.
df_prods, df_ords = (
    pd.read_csv(os.path.join(path, '02 Data', folder, filename), index_col=False)
    for folder, filename in [
        ('Original Data', 'products.csv'),
        ('Prepared Data', 'orders_wrangled.csv'),
    ]
)

In [4]:
# Create an empty dataframe to hold a small mixed-type example
df_test = pd.DataFrame()

In [5]:
# Create a mixed-type column: the values are two strings, an int and a bool
df_test['mix'] = ['a', 'b', 1, True]

In [6]:
# Preview the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [7]:
# Check for mixed types
# A column is reported when its values have more than one Python type.
# Series.map is used because DataFrame.applymap is deprecated (pandas >= 2.1);
# counting distinct types also avoids an IndexError on an empty dataframe.
for col in df_test.columns.tolist():
  if df_test[col].map(type).nunique() > 1:
    print (col)

mix


In [8]:
# Convert the mixed-type "mix" column to string so that every value shares one type
df_test['mix'] = df_test['mix'].astype('str')

In [9]:
# Find missing values: count the nulls in each column of df_prods
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [10]:
# Isolate the products whose product_name is missing
df_nan = df_prods.loc[df_prods['product_name'].isna()]

In [11]:
# Show the product rows that have no product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [12]:
# Row and column counts before removing rows with missing values
df_prods.shape

(49693, 5)

In [13]:
# Build a new dataframe that keeps only products with a name
# (rows where product_name is missing are dropped; df_prods itself is left untouched)
df_prods_clean = df_prods.dropna(subset=['product_name'])

In [14]:
# Row and column counts after removing rows with a missing product_name
df_prods_clean.shape

(49677, 5)

In [15]:
# Collect rows that exactly repeat an earlier row across every column
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [16]:
# Show the fully duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [17]:
# De-duplicate into a new dataframe, keeping the first occurrence of each repeated row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(subset=None, keep='first')

In [18]:
# Row and column counts after removing duplicates
df_prods_clean_no_dups.shape

(49672, 5)

In [19]:
# Write the products table (missing names and duplicate rows removed) to "Prepared Data".
# NOTE(review): to_csv writes the row index by default, which is read back as an extra
# 'Unnamed: 0' column; consider index=False once it is confirmed that downstream
# notebooks do not rely on that column.
df_prods_clean_no_dups.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
)

# 02. Cleaning up df_ords dataframe

In [20]:
# Step 2 - Run describe function for df_ords dataframe
# (summary statistics used to sanity-check the value ranges of each column)
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


One might expect the max for order_hour_of_day to be 24 instead of 23, but the min is 0, so hours are recorded on a 0–23 scale and 23 is a valid maximum.

In [21]:
# Step 3 - Check for mixed-type data
# A column is reported when its values have more than one Python type.
# Series.map is used because DataFrame.applymap is deprecated (pandas >= 2.1);
# counting distinct types also avoids an IndexError on an empty dataframe.
for col in df_ords.columns.tolist():
  if df_ords[col].map(type).nunique() > 1:
    print (col)

No mixed-type data

In [22]:
# Step 5 - Check for missing values: count the nulls in each column of df_ords
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

There are 206,209 missing values in days_since_prior_order, which probably means these rows are first-time orders.

In [23]:
# Isolate the orders that have no days_since_prior_order value
df_ords_nan = df_ords.loc[df_ords['days_since_prior_order'].isna()]

In [24]:
# Preview the orders with a missing days_since_prior_order
df_ords_nan.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,


In [25]:
# Row and column counts of df_ords
df_ords.shape

(3421083, 8)

In [26]:
# Step 6 - Replace missing values with zero
# The filled column is assigned back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, raises a
# FutureWarning in recent pandas, and leaves df_ords unchanged under Copy-on-Write.
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)

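Replaced NaN values with zero because they are probably first-time orders, so zero days since prior order is an appropriate value. That way the column can be analysed without having to worry about NaN values. Note that the column already contained genuine zeros before filling (its minimum in the describe output is 0), so after this step first-time orders can no longer be told apart from same-day reorders using this column alone.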

In [27]:
# Step 7 - Collect any orders that exactly repeat an earlier row across every column
df_ords_dups = df_ords.loc[df_ords.duplicated(keep='first')]

In [28]:
# Show any fully duplicated orders
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


No duplicates found

In [29]:
# Step 9 - Write the checked orders table to the "Prepared Data" folder.
# NOTE(review): to_csv writes the row index by default, and df_ords already carries an
# 'Unnamed: 0' column (presumably from an earlier export); consider index=False once it
# is confirmed that downstream notebooks do not rely on the extra column.
df_ords.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
)