# 01. Import libraries

In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import os

# 02. Importing csv files

In [2]:
path = r'C:\Users\myra_\Documents\Instacart Basket Analysis 02-2023'

In [3]:
# Import the dataset 'products.csv'
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'products.csv'), index_col = False)

In [4]:
# Import the dataset 'orders_wrangled.csv'
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared data', 'orders_wrangled.csv'), index_col = False)

# 03. Consistency checks on 'df_prods'

## a) Checking for missing values

In [5]:
# Checking to see if there are any missing values
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [6]:
# Creating a dataframe, 'df_nan', that has only null values in'product_name'
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [7]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [8]:
# To check the number of rows and columns in df_prods
df_prods.shape

(49693, 5)

In [9]:
# Creating a dataframe that would have non-missing values 
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [10]:
df_prods_clean.shape

(49677, 5)

The new dataframe 'df_prods_clean' has 16 less rows than 'df_prods'. 

## b) Checking for duplicates

In [11]:
# Checking for duplicates
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [12]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [13]:
# To check the number of rows and columns in df_prods_clean
df_prods_clean.shape

(49677, 5)

In [14]:
# New dataframe that does not include the duplicates 
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [15]:
df_prods_clean_no_dups.shape

(49672, 5)

The dataframe 'df_prods_clean_no_dups' has no duplicates. The above function shows that the rows have been reduced by 5, showing that the 5 duplicate rows have been deleted. 

The only possible solution for these duplicates were to delete them since there was no other explanation than the fact that they were accidentally recorded twice. 

## c) Checking for mixed types

In [16]:
# Checking for mixed types
for col in df_prods_clean_no_dups.columns.tolist():
  weird_prods = (df_prods_clean_no_dups[[col]].applymap(type) != df_prods_clean_no_dups[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_prods_clean_no_dups[weird]) > 0:
    print (col)

NameError: name 'weird' is not defined

In [None]:
weird_prods

If weird is true, the command print(col) is executed, which prints the problematic column and because of the for-loop, this command will be executed on every column in the dataframe, printing every mixed-type column it finds. So, since it did not print anything, it shows that there is no mixed types in the columns.

In [17]:
df_prods_clean_no_dups.shape

(49672, 5)

## d) Making a csv file

In [None]:
# Making a csv file for df_prods_clean_no_dups
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared data', 'products_checked.csv'))

# 04. Consistency checks on 'df_ords'

## a) Renaming the column

In [None]:
# Renaming the column 'order_dow'
df_ords.rename(columns={'order_dow': 'order_day_of_week'}, inplace=True)

## b) Interpreting the function df.describe()

In [None]:
df_ords.describe()

The data from the df.describe function looks fine. Looking at the column for day of the week - the maximum value is 6 and the minimum value is 0, which indicates that all values in this column are between this range. The values in column 'order_hour_of_day' has a range from 0 to 23, which also indicates that the data does not have any values that can indicate that it is incorrect. All values are also positive, as it should be.

## c) Checking for mixed types

In [None]:
# Checking for mixed types
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

In [None]:
weird

If weird is true, the command print(col) is executed, which prints the problematic column and because of the for-loop, this command will be executed on every column in the dataframe, printing every mixed-type column it finds. So, since it did not print anything, it shows that there is no mixed types in the columns.

## d) Checking for missing values

In [None]:
df_ords.isnull().sum()

There are 206209 missing values in the column 'days_since_prior_order'

In [None]:
# Subset table of only those values that have missing values
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [None]:
df_ords_nan

In [None]:
df_ords_nan.describe()

There are 206209 missing values. For all these values the order number is 1. This means that these are new customers that are ordering for the first time, that is why the value in 'days_since_prior_order' is null, since it cannot be applied to these customers. 

I would not be deleting these rows with missing values because it is not an error in the data but these values are only there because those customers are ordering for the first time so that specific column is not applicable for them.

## e) Checking for duplicates

In [None]:
df_ords_dups = df_ords[df_ords.duplicated()]

In [None]:
df_ords_dups

There are no duplicate values found in df_ords. 

In [18]:
df_ords.shape

(3421083, 8)

## f) Making a csv file

In [None]:
# Making a csv file for df_ords
df_ords.to_csv(os.path.join(path, '02 Data','Prepared data', 'orders_checked.csv'))