In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Set the path to the main project folder
path = r'C:\Users\luis\Desktop\Carrer Foundry boot camp\Python Fundamentals for Data Analysts'

# Import the orders data
df_ords = pd.read_csv(os.path.join(path, 'Data', 'Original Data', '4.3_orders_products', 'orders.csv'))

# Import the products data
df_prods = pd.read_csv(os.path.join(path, 'Data', 'Original Data', '4.3_orders_products', 'products.csv'))


In [13]:
# Run df.describe() on df_prods
df_prods.describe()


Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


In [14]:
# Check for mixed-type data in df_prods
mixed_found = False

for col in df_prods.columns:
    weird = (df_prods[col].map(type) != df_prods[col].map(type).iloc[0]).any()
    if weird:
        print(col)
        mixed_found = True

if not mixed_found:
    print("No mixed values")


product_name


## It looks like the column product_name contains mixed data types. Let's fix this by converting the product_name column to a single data type, in this case, string. After that, we can continue with the consistency checks.

In [15]:
# Fix mixed-type data in product_name column
df_prods['product_name'] = df_prods['product_name'].astype('str')


In [16]:
# Check for mixed-type data in df_prods
mixed_found = False

for col in df_prods.columns:
    weird = (df_prods[col].map(type) != df_prods[col].map(type).iloc[0]).any()
    if weird:
        print(col)
        mixed_found = True

if not mixed_found:
    print("No mixed values")

No mixed values


In [17]:
# Check for missing values in df_prods
missing_values = df_prods.isnull().sum()
print(missing_values)


product_id       0
product_name     0
aisle_id         0
department_id    0
prices           0
dtype: int64


In [29]:
# Check for duplicate values in df_prods
duplicate_values = df_prods.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_values}")


Number of duplicate rows: 5


In [30]:
# Remove duplicate rows in df_prods
df_prods_clean_no_dups = df_prods.drop_duplicates()

In [31]:
# Check the shape to ensure duplicates are removed
print("Shape before removing duplicates:", df_prods.shape)
print("Shape after removing duplicates:", df_prods_clean_no_dups.shape)

Shape before removing duplicates: (49693, 5)
Shape after removing duplicates: (49688, 5)


In [32]:
# Check for duplicate values again to confirm removal
duplicate_values_after = df_prods_clean_no_dups.duplicated().sum()
print(f"Number of duplicate rows after cleaning: {duplicate_values_after}")

Number of duplicate rows after cleaning: 0


In [33]:
# Export the final cleaned df_prods dataframe
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'products_checked.csv'), index=False)

# DF_ORDS

In [3]:
# Run df.describe() on df_ords
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [4]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


### Analysis of df_ords.describe() Output
Based on the output of the `df_ords.describe()` function, the following observations are made:

- `order_hour_of_day` has a min value of 0 and a max value of 23, which is expected as there are 24 hours in a day.
- `days_since_prior_order` has a min value of 0, which is reasonable, but the presence of NaN values should be further investigated.
- `order_dow` ranges from 0 to 6, which is consistent with the seven days of the week.

There are no immediate red flags in the min and max values, but the missing values in `days_since_prior_order` should be examined.


In [9]:
# Check for mixed-type data in df_ords
mixed_found = False

for col in df_ords.columns:
    weird = (df_ords[col].map(type) != df_ords[col].map(type).iloc[0]).any()
    if weird:
        print(col)
        mixed_found = True

if not mixed_found:
    print("No mixed values")


No mixed values


# Check for Missing Values in df_ords:

In [10]:
# Check for missing values in df_ords
df_ords.isnull().sum()


order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

### Findings on Missing Values
The `df_ords` dataframe has missing values in the `days_since_prior_order` column. These missing values likely indicate that this is the first order for the user, so there is no prior order to calculate the days since.

### Proposed Method for Addressing Missing Values
We will replace the missing values in the `days_since_prior_order` column with 0, indicating that this is the first order.


In [11]:
# Addressing missing values
df_ords['days_since_prior_order'].fillna(0, inplace=True)


### Method Explanation for Addressing Missing Values
We used the fillna method with 0 because it logically indicates that there is no prior order. This preserves the integrity of the data while ensuring there are no NaN values that could disrupt analysis.


In [12]:
# Check for duplicate values in df_ords
df_ords.duplicated().sum()


0

### Findings on Duplicate Values
The `df_ords` dataframe has 0 duplicate values. Therefore, no further action is required for duplicate values.


In [34]:
df_ords.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'orders_cleaned.csv'), index=False)