# Exercise 4.5 IC Data Consistency Checks

#This script contains the following:

#1 Import Libraries & datasets;

#2 Missing values/duplicates identified and addressed in `df_prods` and `df_ords` and explanations provided;

#3 Anomaly detected in `df_prods` using `df.describe()`;

#4 Mixed data type column addressed in `df_ords`;

#5 Dataframes exported as “.csv” files and renamed appropriately;

# 01 Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
# Import the original products and orders datasets.
# The r'' prefix already keeps backslashes literal, so they must not be
# doubled as well (doubling puts two separators between every folder).
path = r'C:\Users\User01\16.05.2023 Instacart Basket Analysis'
original_data_dir = os.path.join(path, '02 Data', 'Original Data')
# index_col=False tells pandas not to use the first CSV column as the index.
df_prods = pd.read_csv(os.path.join(original_data_dir, 'products.csv'), index_col=False)
df_ords = pd.read_csv(os.path.join(original_data_dir, 'orders.csv'), index_col=False)

In [5]:
# Summary statistics for the numeric columns of df_ords.
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [6]:
# Give the day-of-week column a more descriptive name (modifies df_ords in place).
column_renames = {'order_dow': 'orders_day_of_week'}
df_ords.rename(columns=column_renames, inplace=True)

In [7]:
# Start from an empty dataframe that is used to demonstrate the mixed-type check.
df_test = pd.DataFrame()

In [8]:
# Fill one column with two strings, an int and a bool so it has mixed types.
mixed_values = ['a', 'b', 1, True]
df_test['mix'] = mixed_values

In [9]:
# Preview the test dataframe.
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [10]:
# Check for mixed types: print the name of every column in which at least one
# value has a different Python type than the value in the column's first row.
for col in df_test.columns.tolist():
    # Series.map is used here because DataFrame.applymap is deprecated in
    # recent pandas releases and emits a FutureWarning.
    value_types = df_test[col].map(type)
    # True for each row whose type differs from the first row's type.
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

mix


In [11]:
# Cast every value in the column to str so it holds one consistent type.
mix_as_text = df_test['mix'].astype('str')
df_test['mix'] = mix_as_text

# 02 Finding missing values in df_prods

In [12]:
# Count the missing values in each column of df_prods.
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [13]:
# Collect the rows of df_prods whose product_name is missing.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
df_nan = df_prods[df_prods['product_name'].isnull()]

In [14]:
# Display the rows with a missing product_name.
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [15]:
# (rows, columns) of df_prods before any rows are removed.
df_prods.shape

(49693, 5)

In [16]:
# Create a new dataframe that keeps only the rows with a product_name.
# notnull() gives the keep-mask directly instead of comparing isnull() to False.
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [17]:
# (rows, columns) after removing rows with a missing product_name.
df_prods_clean.shape

(49677, 5)

# 03 Finding Duplicates in df_prods

In [18]:
# Flag rows that repeat an earlier row in every column, then keep only those.
is_repeat = df_prods_clean.duplicated()
df_dups = df_prods_clean[is_repeat]

In [19]:
# Display the duplicated product rows.
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [20]:
# Create a new dataframe that keeps the first occurrence of each row and
# leaves out the full-row repeats.
df_prods_clean_no_dups = df_prods_clean[~df_prods_clean.duplicated()]

In [21]:
# Display the products dataframe with missing names and duplicates removed.
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [22]:
# (rows, columns) after removing the duplicate rows.
df_prods_clean_no_dups.shape

(49672, 5)

# 04 Exporting cleaned products.csv

In [23]:
# Write the cleaned products table to the Prepared Data folder.
# NOTE(review): to_csv writes the dataframe index as an unnamed first column
# by default; confirm that downstream readers expect that extra column.
products_out = os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
df_prods_clean_no_dups.to_csv(products_out)

In [24]:
# Remove the eval_set column, then summarise the remaining numeric columns.
df_ords = df_ords.drop('eval_set', axis=1)
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


# 05 Checking mixed types in df_ords

In [25]:
# Print the name of every df_ords column in which at least one value has a
# different Python type than the value in the column's first row.
for col2 in df_ords.columns.tolist():
    # Series.map is used here because DataFrame.applymap is deprecated in
    # recent pandas releases and emits a FutureWarning.
    value_types2 = df_ords[col2].map(type)
    # True for each row whose type differs from the first row's type.
    # weird2 stays defined after the loop and then holds the mask for the
    # last column only.
    weird2 = value_types2 != value_types2.iloc[0]
    if weird2.any():
        print(col2)

In [26]:
# Show the boolean mask left over from the loop; it covers only the last column checked.
weird2

0          False
1          False
2          False
3          False
4          False
           ...  
3421078    False
3421079    False
3421080    False
3421081    False
3421082    False
Length: 3421083, dtype: bool

In [27]:
# There are no mixed data types in df_ords.

# 06 Finding missing values in df_ords

In [28]:
# Count the missing values in each column of df_ords.
df_ords.isna().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [29]:
# Create a subset of the orders whose days_since_prior_order is missing.
# isnull() already returns a boolean mask, so no "== True" comparison is needed.
df_nan2 = df_ords[df_ords['days_since_prior_order'].isnull()]

In [30]:
# Display the orders whose days_since_prior_order is missing.
df_nan2

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [31]:
# Last five rows of the missing-value subset.
df_nan2.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,
3421069,3154581,206209,1,3,11,


In [32]:
# First five rows of the missing-value subset.
df_nan2.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,


# After inspecting the first and last 5 rows of the null-value subset with head() and tail(), every row with a missing days_since_prior_order has order_number 1, i.e. it is that user's first order, so there is no prior order to count days from. The total number of user IDs is 206209 and the total number of missing values in days_since_prior_order is also 206209, meaning each user has exactly one NaN value, located in their first order.

# 07 Finding Duplicates in df_ords

In [33]:
# Select the rows of df_ords that repeat an earlier row in every column.
# NOTE(review): despite the "_clean" name, this variable holds ONLY the
# duplicated rows, not a de-duplicated copy of the orders.
df_ords_clean = df_ords[df_ords.duplicated()]

In [34]:
# Display the rows flagged as duplicates.
df_ords_clean

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


# There are no duplicate values in df_ords.

# 08 Exporting cleaned df_ords

In [35]:
# Export the checked orders data.
# df_ords_clean holds only the rows flagged by duplicated() (an empty
# selection), so exporting it would write a file with no orders. Export
# df_ords itself instead, with drop_duplicates() as a safeguard.
orders_out = os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv')
df_ords.drop_duplicates().to_csv(orders_out)

In [38]:
# (rows, columns) of the original df_prods; the cleaning steps built new dataframes and did not reassign it.
df_prods.shape

(49693, 5)