# 4.5 Data Consistency Checks

## Import Libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

## Import df_prods

In [2]:
# Root folder of the project; the data files below are resolved against it.
path = r'C:\Users\natha\OneDrive\Desktop\Data Analytics\Jupyter\09-2023 Instacart Basket Analysis'

# Read the original products file. index_col=False stops pandas from using
# the first column of the file as the dataframe index.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [3]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


## Import df_ords

In [4]:
# Root folder of the project (same location the products file is read from).
path = r'C:\Users\natha\OneDrive\Desktop\Data Analytics\Jupyter\09-2023 Instacart Basket Analysis'

# Read the wrangled orders file. index_col=False stops pandas from using
# the first column of the file as the dataframe index.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

In [5]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [6]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


# Checking For Mixed Type Data

In [7]:
# Start from an empty dataframe; the mixed-type test column is added in the next cell
df_test = pd.DataFrame(data=None)

In [8]:
# Add a column named 'mix' that deliberately holds str, int and bool values together
df_test = df_test.assign(mix=['a', 'b', 1, True])

In [9]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [10]:
# **Check whether df_test contains any mixed-type columns**
# A column is printed when at least one of its values has a different Python type
# than the first value in that column.
# Series.map is used instead of DataFrame.applymap, which has been deprecated since pandas 2.1.

for col in df_test.columns.tolist():
    weird = df_test[col].map(type) != type(df_test[col].iloc[0])
    if weird.any():
        print(col)

mix


In [11]:
# **Check whether df_prods contains any mixed-type columns**
# A column is printed when at least one of its values has a different Python type
# than the first value in that column.
# Series.map is used instead of DataFrame.applymap, which has been deprecated since pandas 2.1.
# NOTE(review): product_name is probably reported only because its missing entries are
# float NaN sitting next to str names - confirm before treating it as a genuine type mix.

for col in df_prods.columns.tolist():
    weird = df_prods[col].map(type) != type(df_prods[col].iloc[0])
    if weird.any():
        print(col)

product_name


In [12]:
print(df_prods.dtypes)

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object


# Missing Values (df_prods)

In [13]:
# Checking for missing values

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [14]:
# Create a subset of df_prods holding only the rows whose product_name is missing.
# isnull() already returns a boolean mask, so it is used directly as the row filter.

df_nan = df_prods[df_prods['product_name'].isnull()]

In [15]:
df_nan.shape

(16, 5)

In [16]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


# Addressing Missing Values (df_prods)

In [17]:
# There are a few ways to deal with missing data:

# 1. Create a new variable that acts like a flag based on the missing value.
# 2. Impute the value with the mean or median of the column (if the variable is numeric).
# 3. ***Remove or filter out the missing data.***


### ***Remove or filter out the missing data.***


In [18]:
df_prods.shape

(49693, 5)

In [19]:
# Create a subset of df_prods again, this time keeping only the rows whose product_name is NOT missing.
# notnull() is the direct boolean mask for "has a value".

df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [20]:
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [21]:
df_prods_clean.shape

(49677, 5)

In [22]:
# df_prods_clean should have exactly 16 fewer rows than df_prods (the number of rows in df_nan)
# 49693 - 49677 = 16 !!!

In [23]:
# Confirming math: read the row counts from the dataframes themselves,
# so the check stays correct if the input data changes

df_prods.shape[0] - df_prods_clean.shape[0]

16

In [24]:
# Another way you can drop all missing values is via the following command:

# df_prods.dropna(inplace = True)
# If you wanted to use this command to drop only the NaNs from a particular column, the code would look like this:

# df_prods.dropna(subset = ['product_name'], inplace = True)
# In both cases, rather than creating an entirely new dataframe, you're overwriting df_prods with a new version of df_prods that doesn't contain the missing values. This is done by way of the inplace = True argument, which modifies the original dataframe in place. If you don't specify an inplace argument in your code, the function will take the default setting, which is inplace = False. When specified as False, the command returns a new dataframe with the rows removed, leaving the original dataframe untouched.

# Overwriting can be risky. Unless you’re absolutely sure it’s safe to drop the values in question, you should create a new dataframe instead.

# Duplicates (df_prods)

In [25]:
# Finding duplicates
# duplicated() returns a boolean mask over df_prods_clean that is True for each row
# repeating an earlier row. Selecting with that mask stores only the repeated rows in df_dups.

df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [26]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


## Addressing Duplicates (df_prods)

In [27]:
# Check current number of rows in our dataframe:

df_prods_clean.shape

(49677, 5)

In [28]:
# Build a new dataframe without the duplicate rows identified above:
# keep every row of df_prods_clean that is not flagged as a repeat of an earlier row

df_prods_clean_no_dups = df_prods_clean[~df_prods_clean.duplicated()]

In [29]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [30]:
# We notice our new dataframe has exactly 5 fewer rows than df_prods_clean
# This is exactly the same number of rows as df_dups
# This confirms our new dataframe has successfully removed the duplicates!

# Task 4.5 Questions

# Checking For Mixed Type Data (df_ords)

In [31]:
# **Check whether df_test contains any mixed-type columns**
# A column is printed when at least one of its values has a different Python type
# than the first value in that column.
# Series.map is used instead of DataFrame.applymap, which has been deprecated since pandas 2.1.

for col in df_test.columns.tolist():
    weird = df_test[col].map(type) != type(df_test[col].iloc[0])
    if weird.any():
        print(col)

mix


In [32]:
# **Check whether df_ords contains any mixed-type columns**
# A column is printed when at least one of its values has a different Python type
# than the first value in that column; no output means no column was flagged.
# Series.map is used instead of DataFrame.applymap, which has been deprecated since pandas 2.1.

for col in df_ords.columns.tolist():
    weird = df_ords[col].map(type) != type(df_ords[col].iloc[0])
    if weird.any():
        print(col)

In [33]:
print(df_ords.dtypes)

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object


# Missing Values (df_ords)

In [34]:
# Checking for missing values

df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [35]:
# Create a subset of df_ords holding only the rows whose days_since_prior_order is missing.
# isnull() already returns a boolean mask, so it is used directly as the row filter.

df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull()]

In [36]:
df_ords_nan

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [37]:
df_ords_nan.shape

(206209, 6)

In [38]:
# This shows there are 206209 rows with NaN values for days_since_prior_order
# This is most likely not an error: these are simply orders that have no prior order
# In other words, each of these rows is a customer's first order
# We need to create a new column to indicate whether it's the customer's first order

# Addressing Missing Values (df_ords)

In [39]:
# There are a few ways to deal with missing data:

# 1. ***Create a new variable that acts like a flag based on the missing value.***
# 2. Impute the value with the mean or median of the column (if the variable is numeric).
# 3. Remove or filter out the missing data.

### ***Create a new variable that acts like a flag based on the missing value.***

In [40]:
# Flag the rows that have no days_since_prior_order value:
#   1. isnull() yields True where 'days_since_prior_order' is NaN and False elsewhere.
#   2. astype(int) turns that mask into integers (True -> 1, False -> 0).
#   3. The result is stored in a new 'is_first_order' column of df_ords,
#      so 1 marks a row with a missing value (a first order) and 0 marks every other row.

df_ords['is_first_order'] = df_ords['days_since_prior_order'].isnull().astype(int)

In [41]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_first_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484,0.06027594
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737,0.2379974
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0,0.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0,0.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0,0.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0,1.0


# Duplicates (df_ords)

In [42]:
# Finding duplicates
# duplicated() returns a boolean mask over df_ords that is True for each row
# repeating an earlier row. Selecting with that mask stores only the repeated rows in df_ords_dups.

df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [43]:
df_ords_dups

# No duplicates found

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_first_order


# Prices anomaly in df_prods

In [44]:
# Identify prices anomaly:

df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [45]:
# Count the rows whose price equals the anomalous value 99999

count_of_99999 = (df_prods_clean_no_dups['prices'] == 99999).sum()

print(f"The value 99999 appears {count_of_99999} times in the 'prices' column.")

The value 99999 appears 1 times in the 'prices' column.


In [46]:
# Keep every row whose price is not the anomalous value 99999.
# NOTE(review): the describe() output after this cell still reports a maximum price of 14900.0,
# so at least one other extreme price survives this filter - confirm whether it is valid.
df_prods_clean_no_dups = df_prods_clean_no_dups.loc[df_prods_clean_no_dups['prices'].ne(99999)]

In [47]:
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49671.0,49671.0,49671.0,49671.0
mean,24850.172334,67.762115,11.728856,7.980256
std,14340.795118,38.3161,5.850806,66.952504
min,1.0,1.0,1.0,1.0
25%,12432.5,35.0,7.0,4.1
50%,24850.0,69.0,13.0,7.1
75%,37268.5,100.0,17.0,11.1
max,49688.0,134.0,21.0,14900.0


# Tidying Up and Exporting Changes

In [48]:
# Rebind df_prods to an independent copy of the checked dataframe
# (missing names, duplicate rows and the 99999 price removed)

df_prods = df_prods_clean_no_dups.copy(deep=True)

In [49]:
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49671.0,49671.0,49671.0,49671.0
mean,24850.172334,67.762115,11.728856,7.980256
std,14340.795118,38.3161,5.850806,66.952504
min,1.0,1.0,1.0,1.0
25%,12432.5,35.0,7.0,4.1
50%,24850.0,69.0,13.0,7.1
75%,37268.5,100.0,17.0,11.1
max,49688.0,134.0,21.0,14900.0


In [50]:
# Write the checked products dataframe to the Prepared Data folder.
# index=False leaves the dataframe index out of the file.

df_prods.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=False,
)

In [51]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_first_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484,0.06027594
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737,0.2379974
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0,0.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0,0.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0,0.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0,1.0


In [52]:
# Write the checked orders dataframe (now including is_first_order) to the Prepared Data folder.
# index=False leaves the dataframe index out of the file.

df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index=False,
)

In [53]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [54]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_first_order
0,2539329,1,1,2,8,,1
1,2398795,1,2,3,7,15.0,0
2,473747,1,3,3,12,21.0,0
3,2254736,1,4,4,7,29.0,0
4,431534,1,5,4,15,28.0,0
