In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# Load the input dataframes. `path` is the project root folder on the local machine.
path = r"D:\0 - Data Analytics\4 - Python fundamentals for Data Analysts\Instacart Basket Analisys"

# Orders are read from the "Prepared Data" folder, products from "Original Data".
orders_file = os.path.join(path, "02 Data", "Prepared Data", "orders_renamed_wrangled.csv")
products_file = os.path.join(path, "02 Data", "Original Data", "products.csv")

df_ords = pd.read_csv(orders_file)
df_prods = pd.read_csv(products_file)

# Data Consistency Checks


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df_ords
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


## MixedType Data


In [5]:
# Create an empty dataframe that is used below to demonstrate a mixed-type column
df_test = pd.DataFrame()

In [None]:
# Create a mixed-type column: it holds two strings, an integer and a boolean
df_test["mix"] = ["a", "b", 1, True]

In [7]:
# Display the first rows of the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


### Function to check for mixed-type columns in a Dataframe


In [9]:
# Check for mixed types: print the name of every column that holds values of more than one
# Python type. The element-wise type lookup uses .map because DataFrame.applymap is deprecated.
for col in df_test.columns:
    value_types = df_test[col].map(type)  # Python type of every value in the column
    # A row is "weird" when its type differs from the type of the first value in the column
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

mix


In [None]:
# Convert every value in the 'mix' column to a string so the column holds a single data type
df_test["mix"] = df_test["mix"].astype("str")

## Missing values

### Finding missing values


In [12]:
# isnull() flags every missing value; sum() then counts the flagged values per column
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

We can see that in the product_name column we are missing 16 values.


In [None]:
# Create a dataframe with only the rows whose product_name is missing.
# isnull() already returns a boolean mask, so it can be used directly as the row filter.
df_nan = df_prods[df_prods["product_name"].isnull()]

In [15]:
# Display the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


Here we identified the rows with missing values.


### Addressing missing values


Options:

- Create a new variable that acts like a flag based on the missing value.


- Impute the value with the mean or median of the column (only possible if the variable is numeric).

  Example using the mean. Use df.describe() to find the mean, then use df['column with missings'] = df['column with missings'].fillna(mean value).

  Example using the median. Use df['column with missings'].median() to find the median, then use df['column with missings'] = df['column with missings'].fillna(median value).

  Linear Interpolation
  An alternative way to impute missing values is to use linear interpolation, which is a special way of dealing with missing data in time-series data. Linear interpolation is a fancy way to say “connecting two points with a line,” and it involves finding the mean of the rows before the missing value occurs, finding the mean of the rows after the missing value occurs, and estimating where the missing value should fall between those two means.


- Remove or filter out the missing data.


In [21]:
# In this case we can't impute the values because they are strings; before removing the rows,
# let's check the shape. The number of missing values is acceptable for the size of the dataframe.
df_prods.shape

(49693, 5)

In [None]:
# Create a new dataframe df_prods_clean that keeps only the rows with a product_name.
# notnull() returns True for every non-missing value, so no comparison with False is needed.
df_prods_clean = df_prods[df_prods["product_name"].notnull()]
df_prods_clean.shape

(49677, 5)

In [None]:
# Another way to drop missing values: DataFrame.dropna().
# The default is inplace=False, which returns a NEW dataframe and leaves `df_prods` untouched.
# inplace=True would instead overwrite `df_prods` itself (and return None), so the outputs of
# earlier cells (e.g. the shape shown before) would no longer match the dataframe in memory.

# Drop every row that has a missing value in any column
df_prods_no_missing = df_prods.dropna()
# Drop only the rows that have a missing value in a specific column
df_prods_named = df_prods.dropna(subset=["product_name"])

## Duplicates

Duplicate values are a common issue in data analysis and must be carefully examined before removal.

#### **Key Points:**

- **Types of Duplicates:**

  - **Column-level duplicates** (e.g., repeated `product_id` values).
  - **Full row duplicates** (identical rows across all columns).

- **Handling Approach:**
  - Identify and analyze duplicates before removal.
  - Export and report duplicates to clients for clarification.
  - Clients may discover underlying issues in their data collection.


### Finding duplicates

In this case we are looking for fully duplicated rows, because individual columns legitimately contain many repeated values (e.g. `aisle_id` and `department_id`).


In [26]:
# Look for full-row duplicates: duplicated() returns a boolean mask over the rows of
# df_prods_clean, and the rows it flags are saved in the subset df_prods_dups.
df_prods_dups = df_prods_clean.loc[df_prods_clean.duplicated()]
df_prods_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


## Addressing duplicates

function to drop duplicates
`df.drop_duplicates()`


In [28]:
# Check the shape of the dataframe before dropping the duplicates
df_prods_clean.shape

(49677, 5)

In [29]:
# Create a new dataframe without the duplicated rows; df_prods_clean itself is not modified
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [30]:
# Check the shape again to see how many rows were removed
df_prods_clean_no_dups.shape

(49672, 5)

# Export Products


In [None]:
# Export the cleaned products dataframe (rows with a missing product_name and full duplicates removed).
# index=False keeps the row index out of the file, so re-importing it does not create an extra
# "Unnamed: 0" column; the folder name uses the same "Prepared Data" spelling as the import cell.
df_prods_clean_no_dups.to_csv(
    os.path.join(path, "02 Data", "Prepared Data", "products_cleaned.csv"),
    index=False,
)

# Task 4.5


## 2

Run the df.describe() function on your df_ords dataframe. Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.
Tip: Keep an eye on min and max values!


In [35]:
# Summary statistics for the numeric columns of df_ords; check min and max for implausible values
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


- First of all, the columns 'order_id', 'user_id' and 'order_number' should not appear in the output of 'describe()', because in the last exercise I converted them to strings. What is probably happening is that the .csv file doesn't store the data types, so we have to set the data type of these columns when importing the file.
  Since the first 3 columns should not be here, we can skip them.
- Looking at the count of 'days_since_last_order', it looks like we are missing some values in that column. This may be caused by some customers being first-time buyers, or the data might be incomplete or corrupted.
- In 'orders_day_of_week' the min and max values are 0 and 6: 7 values, one for each day of the week; the indexing simply starts at 0 (Sunday?).


## 3

Check for mixed-type data in your df_ords dataframe


In [None]:
# Run the for loop on df_ords to check for inconsistent data types: every column that holds
# values of more than one Python type is printed.
# .map is used because the applymap syntax suggested in the course is deprecated.
for col in df_ords.columns:
    value_types = df_ords[col].map(type)  # Python type of every value in the column
    # A row is "weird" when its type differs from the type of the first value in the column
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

There are no mixed-type values in the dataframe's columns.


## 4

If you find mixed-type data, fix it. The column in question should contain observations of a single data type.


In [41]:
# Check the data type of each column
df_ords.dtypes

order_id                   int64
user_id                    int64
order_number               int64
orders_day_of_week         int64
order_hour_of_day          int64
days_since_last_order    float64
dtype: object

## 5

Run a check for missing values in your df_ords dataframe.
In a markdown cell, report your findings and propose an explanation for any missing values you find.


In [43]:
# Count the missing values in each column of df_ords
df_ords.isnull().sum()

order_id                      0
user_id                       0
order_number                  0
orders_day_of_week            0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

'days_since_last_order' has a lot of missing values. We have to understand whether this is because some of these customers had never placed an order before, or because of a transcription error in the database.
This information should be stored in another dataframe and sent to the customer to be checked.


In [None]:
# Create a dataframe with only the rows whose days_since_last_order is missing.
# isnull() already returns a boolean mask, so it can be used directly as the row filter.
df_ords_missing = df_ords[df_ords["days_since_last_order"].isnull()]

## 6

Address the missing values using an appropriate method.
In a markdown cell, explain why you used your method of choice.


In [None]:
# The rows with a missing value are already in df_ords_missing; the helper below is used to
# check whether all of those rows share the same order_number.
def is_column_constant(df: pd.DataFrame, col_name: str) -> tuple:
    """Check whether a column holds a single distinct non-null value.

    Missing values are dropped before comparing, so a column with one distinct value
    plus some missing entries still counts as constant, while a column with no
    non-null values at all does not.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        col_name (str): The column name to check.

    Returns:
        tuple: (True, value) if all non-null values are the same, (False, None) otherwise.
    """
    unique_values = df[col_name].dropna().unique()  # distinct non-null values
    if len(unique_values) == 1:
        return True, unique_values[0]  # exactly one distinct value: return it
    return False, None  # zero or several distinct values


# Check whether every row of df_ords_missing has the same order_number and report the outcome
result, value = is_column_constant(df_ords_missing, "order_number")

print(
    f"'order_number' is constant with value: {value}"
    if result
    else "'order_number' is NOT constant"
)

'order_number' is constant with value: 1


It looks like all the rows with missing values have one thing in common: they are the customer's first order.
Deleting these rows would be the wrong move. Here the only solution is to contact the client and tell them that they probably have a problem in their data collection, since customers that have placed only one order should have a placeholder in 'days_since_last_order' until they make a second order.


In [None]:
# Work on a copy so the original df_ords stays unchanged
df_ords_modified = df_ords.copy()

# Flag column 'first_order': True where days_since_last_order is missing (first-time order),
# False otherwise (repeat order)
df_ords_modified["first_order"] = df_ords_modified["days_since_last_order"].isnull()
df_ords_modified.head()


Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


Now we have our new dataframe with an added column that shows whether each order was the customer's first order or not.


## 7

Run a check for duplicate values in your df_ords data.
In a markdown cell, report your findings and propose an explanation for any duplicate values you find.


In [52]:
# Look for full-row duplicates: duplicated() returns a boolean mask over the rows of df_ords,
# and the rows it flags are saved in df_ords_dups.
df_ords_dups = df_ords.loc[df_ords.duplicated()]
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order


It looks like there are no duplicates.


## 9

Export your final, cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folder and give them appropriate, succinct names.


In [None]:
# Export df_ords_modified (the orders plus the 'first_order' flag column).
# index=False keeps the row index out of the file, so re-importing it does not create an extra
# "Unnamed: 0" column; the folder name uses the same "Prepared Data" spelling as the import cell.
df_ords_modified.to_csv(
    os.path.join(path, "02 Data", "Prepared Data", "orders_cleaned.csv"),
    index=False,
)