In [None]:
# Day 2 - Exploratory Data Analysis (EDA)

# Step 1: Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the water-quality CSV (path is relative to the notebook's folder).
csv_location = "../data/synthetic_water_quality.csv"
df = pd.read_csv(csv_location)

# Sanity check: report the table's dimensions, then its column names.
table_shape = df.shape
print("Shape:", table_shape)
column_names = df.columns.tolist()
print("\nColumns:", column_names)

# Step 2: Visualize distributions
# DataFrame.hist() opens its own figure (sized by its figsize argument) and
# draws one histogram per numeric column, so no plt.figure() call is made
# beforehand -- doing so would only leave an empty extra figure behind.
df.hist(bins=20, figsize=(12, 8), color='skyblue', edgecolor='black')
# The histogram grid is now the current figure, so suptitle lands on it.
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

# Step 3: Boxplots to check outliers
# One box per column on shared axes; the Water_Safe column is left out so
# only the remaining columns are drawn.
plt.figure(figsize=(10, 6))
columns_without_target = df.drop(columns=["Water_Safe"])
sns.boxplot(data=columns_without_target)
plt.title("Boxplots - Outlier Check", fontsize=14)
plt.show()

# Step 4: Correlation heatmap
# Correlate only numeric/boolean columns. pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead, so the
# selection is made explicit; for an all-numeric table the result is unchanged.
numeric_df = df.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(8, 6))
# annot=True with fmt=".2f" prints each coefficient to two decimals in its cell.
sns.heatmap(numeric_df.corr(), annot=True, cmap="Blues", fmt=".2f")
plt.title("Feature Correlation Heatmap", fontsize=14)
plt.show()

# Step 5: Relation of each feature with Water_Safe
# Pick the feature columns by name rather than by position: slicing off the
# last column only works if Water_Safe happens to be last, and otherwise
# plots the target against itself while skipping a real feature.
feature_columns = df.columns.drop("Water_Safe")
for col in feature_columns:
    # A separate small figure per feature, grouped by the Water_Safe value.
    plt.figure(figsize=(6, 4))
    sns.boxplot(x="Water_Safe", y=col, data=df)
    plt.title(f"{col} vs Water_Safe")
    plt.show()


"Step 1 – Importing & Loading Data"

We load the dataset and check the shape and columns.

"Step 2 – Feature Distributions"

Histograms show how each feature is distributed and whether its distribution is skewed.

"Step 3 – Outlier Detection"

Boxplots help identify outliers in each numeric feature.

"Step 4 – Correlation Matrix"

The correlation heatmap shows the pairwise correlation between features.

"Step 5 – Feature vs Target"

We visualize how each parameter differs between Safe and Unsafe water samples.