# Task A4.05 Data Consistency Checks

### Table of Contents
##### 01. Import Libraries & Dataframe Creation
##### 02. Exercise 4.5 Practice: Consistency Checks
##### 03. Run df.describe on df_ords dataframe
##### 04. Check for mixed-type data in df_ords
##### 05. Missing values check for df_ords
##### 06. Duplicate values check
##### 07. Export dataframe

### 01. Import Libraries & Dataframe Creation

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# path shortcut
path = r'/Users/nicolechiu/OneDrive - InterVarsity Christian Fellowship USA/Documents/CF Data Analytics/Achievement 4/05-2023 Instacart Basket Analysis'

In [3]:
# products dataframe creation
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [4]:
# orders dataframe creation
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

In [5]:
# Create dataframe
df_test = pd.DataFrame()

### 02. Exercise 4.5 Practice: Consistency Checks

In [6]:
# Create a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [7]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [8]:
# Check for mixed types
for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [9]:
# Convert data type to string
df_test['mix'] = df_test['mix'].astype('str')

In [10]:
df_test['mix'].dtype

dtype('O')

In [11]:
# Finding missing values in df_prods
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [12]:
# Subset of df_prods dataframe where product_name is missing
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [13]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [14]:
df_prods.shape

(49693, 5)

In [15]:
# Removing rows with missing product_name value
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [16]:
df_prods_clean.shape

(49677, 5)

In [17]:
# Looking for full duplicates within the df
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [18]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [19]:
df_prods_clean.shape

(49677, 5)

In [20]:
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [21]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [22]:
# Exporting df_prods
df_prods.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'))

In [23]:
# Exporting clean no duplicates version of df_prods - correction
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'))

### 03. Run df.describe on df_ords dataframe

In [24]:
# Run df.describe on df_ords dataframe
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [25]:
# Anything in data look off or should be investigated?

In [26]:
# Would ask for clarification around numbering convention of orders_day_of_week, what day is 0 and what day is 6?
# Order numbers only go up to 100 - are numbers reused? What is the numbering convention?

### 04. Check for mixed-type data in df_ords

In [27]:
# Check for mixed types in df_ords dataframe
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

In [28]:
# No mixed types in df_ords dataframe

### 05. Missing values check for df_ords

In [29]:
# Finding missing values in df_ords
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [30]:
# Missing values findings
# 206,209 missing values from the days_since_prior_order. This could indicate that there are 206,209 one-time customers who did not put in another order
# Would propose creating a new variable that acts like a flag based on the missing value, indicating that these were one-time customers 

In [31]:
# Investigating the missing 'days_since_prior_order' variable
df_missing_days_since = df_ords[df_ords['days_since_prior_order']=='NaN']

In [32]:
df_missing_days_since

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [33]:
df_missing_days_since.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [34]:
# Investigating the missing 'days_since_prior_order' variable - attempt #2
df_missing_days_since = df_ords[df_ords['days_since_prior_order']==NaN]

NameError: name 'NaN' is not defined

In [35]:
# Investigating the missing 'days_since_prior_order' variable - attempt #3
df_missing_days_since = df_ords[df_ords['days_since_prior_order'].isnull()==True]

In [36]:
df_missing_days_since

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


In [37]:
# From the above, we see that the order number for all of these corresponding missing values is 1, which confirms the theory that these customers did not return to place another order
# As such, I would leave the missing values as is and simply remember that the missing values actually indicate that this is the only order of the customer

In [38]:
# Last note regarding missing values: 6% of the dataset is missing days_since_prior_order

### 06. Duplicate values check

In [39]:
# Looking for full duplicates within the df_ords
df_ords_dups = df_ords[df_ords.duplicated()]

In [40]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [41]:
# No full duplicates found. If duplicates had been found, would have deleted duplicates using drop duplicates function

### 07. Export dataframe

In [42]:
# Exporting clean no duplicates version of df_ords
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'))

In [43]:
df_ords.shape

(3421083, 7)