<a href="https://colab.research.google.com/github/nesredingebeyehu15-dot/My-AI-Engineering-Journey/blob/main/Module-02-Data-Science-Foundations/Day-22_and_23-Pandas-Missing-Values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# ======================================================================
# PROJECT: Cleaning Product Data for a Fictional Online Store
# GOAL: Handle missing values in the price and units_sold columns.
# ======================================================================
#
# Walkthrough: build a tiny product table containing two NaN cells,
# count the gaps per column, fill them with a per-column rule, and
# confirm that nothing is missing afterwards.

# --- 1. SETUP: Build the raw table (one NaN price, one NaN units_sold) ---
product_data = {
    'product_name': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'price_usd': [1200, 25, 75, np.nan, 40],
    'units_sold': [150, 200, 180, 120, np.nan],
}
products_df = pd.DataFrame(product_data)

print("--- STEP 1: Initial Messy Data ---", products_df, "\n", sep="\n")


# --- 2. INVESTIGATION: Count the missing cells per column ---
print("--- STEP 2: Investigating Missing Values ---")
print("Count of missing values in each column:")
missing_per_column = products_df.isna().sum()
print(missing_per_column, "\n", sep="\n")


# --- 3. CLEANING: Decide on a replacement value for each column ---
print("--- STEP 3: Cleaning the Data ---")

# price_usd: substitute the average of the known prices (NaN is skipped
# by .mean()).
mean_price = products_df['price_usd'].mean()
print(f"Decision for 'price_usd': Filling NaN with the mean price of ${mean_price:.2f}")

# units_sold: an unknown count is treated as zero sales rather than estimated.
print("Decision for 'units_sold': Filling NaN with 0.")

# fillna() returns a new DataFrame, so products_df keeps its original NaNs.
fill_values = {'price_usd': mean_price, 'units_sold': 0}
cleaned_df = products_df.fillna(fill_values)
print("\n")


# --- 4. VERIFICATION: Display the result and re-count the gaps ---
print("--- STEP 4: Final Cleaned Data ---")
print(cleaned_df)

# Every per-column count printed here should be zero.
print("\nFinal check for missing values:")
print(cleaned_df.isna().sum())
print("\nMission Accomplished. The data is now ready for analysis!")

--- STEP 1: Initial Messy Data ---
  product_name  price_usd  units_sold
0       Laptop     1200.0       150.0
1        Mouse       25.0       200.0
2     Keyboard       75.0       180.0
3      Monitor        NaN       120.0
4       Webcam       40.0         NaN


--- STEP 2: Investigating Missing Values ---
Count of missing values in each column:
product_name    0
price_usd       1
units_sold      1
dtype: int64


--- STEP 3: Cleaning the Data ---
Decision for 'price_usd': Filling NaN with the mean price of $335.00
Decision for 'units_sold': Filling NaN with 0.


--- STEP 4: Final Cleaned Data ---
  product_name  price_usd  units_sold
0       Laptop     1200.0       150.0
1        Mouse       25.0       200.0
2     Keyboard       75.0       180.0
3      Monitor      335.0       120.0
4       Webcam       40.0         0.0

Final check for missing values:
product_name    0
price_usd       0
units_sold      0
dtype: int64

Mission Accomplished. The data is now ready for analysis!
