# In [None]:
## Data Cleaning & EDA: handling missing values, outliers, exploratory analysis

# --- Import libraries ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- Load dataset ---
# The diamonds dataset has both numerical and categorical variables.
df = sns.load_dataset("diamonds")

print("Dataset loaded! Shape:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell (and never in a plain script); print it explicitly so the
# preview of the first rows is not silently discarded.
print(df.head())

# --- Inspect the data ---
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nSummary:")
# include="all" summarizes non-numeric columns too; transposing puts one
# row per column of df, which is easier to read for many statistics.
print(df.describe(include="all").T)

# --- Handling Missing Values ---
# Baseline: count of missing entries in every column before any change.
print("\nMissing values per column:")
print(df.isna().sum())

# For demonstration only: blank out 'depth' in 50 randomly sampled rows
# (fixed seed so the same rows are chosen on every run).
nan_rows = df.sample(50, random_state=42).index
df.loc[nan_rows, 'depth'] = np.nan

print("\nAfter adding NaNs:")
print(df.isna().sum())

# Impute the gaps with the median of the remaining 'depth' values.
depth_median = df['depth'].median()
df['depth'] = df['depth'].fillna(depth_median)

print("\nMissing values handled:")
print(df.isna().sum())

# --- Outlier Detection ---
# Visual check first: a box plot of 'carat' shows points beyond the whiskers.
carat_ax = sns.boxplot(x=df['carat'])
carat_ax.set_title("Carat Value Distribution (Outliers Detection)")
plt.show()

# Define IQR method for outliers
def detect_outliers_iqr(series, k=1.5):
    """Return the values of *series* that fall outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of *series* and ``IQR = Q3 - Q1``.

    Args:
        series: Numeric pandas Series to inspect.
        k: Non-negative fence multiplier. The default of 1.5 is the
            conventional rule; larger values flag fewer points.

    Returns:
        The subset of *series* (original index labels preserved) whose
        values are strictly below the lower fence or strictly above the
        upper fence.

    Raises:
        ValueError: If *k* is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k!r}")

    q1, q3 = series.quantile([0.25, 0.75])
    spread = k * (q3 - q1)
    lower, upper = q1 - spread, q3 + spread

    # Strict comparisons: values sitting exactly on a fence are kept, and
    # NaN compares False on both sides so it is never reported.
    return series[(series < lower) | (series > upper)]

# Flag the 'carat' values lying outside the IQR fences.
carat = df['carat']
outliers = detect_outliers_iqr(carat)
print("\nNumber of outliers in 'carat':", outliers.size)

# Remove outliers (optional): keep only rows whose carat value is not one
# of the flagged values; the original df is left untouched.
keep_mask = ~carat.isin(outliers)
df_no_outliers = df.loc[keep_mask]
print("After removing outliers: ", df_no_outliers.shape)

# --- Exploratory Data Analysis ---
# Histogram of 'price' (50 bins) with a kernel density estimate overlaid.
prices = df['price']
price_ax = sns.histplot(prices, bins=50, kde=True)
price_ax.set_title("Distribution of Diamond Prices")
plt.show()

# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap on a dedicated 8x6 figure.
plt.figure(figsize=(8, 6))
corr_matrix = df.corr(numeric_only=True)
heat_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
heat_ax.set_title("Correlation Heatmap")
plt.show()

# Categorical vs numerical: one box of 'price' per 'cut' category.
cut_ax = sns.boxplot(data=df, x='cut', y='price')
cut_ax.set_title("Diamond Price by Cut Quality")
plt.show()

# --- Mission Task ---
# Follow-up exercises for the reader, printed one per line after a header.
mission_steps = (
    "1. Detect outliers in 'price' using the IQR method.",
    "2. Remove them and compare the average price before vs. after.",
    "3. Explore relationships between 'carat' and 'price' using a scatter plot.",
)
print("\nMISSION: Now it's your turn!")
for step in mission_steps:
    print(step)
