In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cleaned training CSV (path is relative to the notebook's
# working directory) into a DataFrame used by every cell below.
train_df = pd.read_csv(filepath_or_buffer="../data/train_cleaned.csv")

In [None]:
# 1️⃣ Summary statistics
# Emit the heading and the describe() table in one call; sep="\n" puts
# each on its own line.
print("📊 Summary Statistics:", train_df.describe(), sep="\n")

In [None]:
# 2️⃣ Class distribution
# Heading followed by the per-value row counts of the "income" column.
print("\n🔍 Class Distribution:", train_df["income"].value_counts(), sep="\n")

In [None]:
# 3️⃣ Feature correlations (numeric columns only)
# Drop non-numeric columns first, then compute the pairwise correlation matrix.
numeric_df = train_df.select_dtypes(include=["number"])
corr_matrix = numeric_df.corr()

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=corr_ax)
corr_ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# 4️⃣ Visualizing distributions
# Columns to plot as histograms with a KDE overlay.
numerical_cols = [
    "age",
    "education.num",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
]

dist_fig = plt.figure(figsize=(12, 8))
for slot, col in enumerate(numerical_cols, start=1):
    # Panels fill a 2x3 grid in order; with five columns the sixth
    # slot is simply never created.
    panel = dist_fig.add_subplot(2, 3, slot)
    sns.histplot(train_df[col], kde=True, bins=30, ax=panel)
    panel.set_title(f"Distribution of {col}")

dist_fig.tight_layout()
plt.show()


In [None]:
# 5️⃣ Bar charts for categorical features
# Columns to plot as horizontal count bars.
categorical_cols = [
    "workclass",
    "marital.status",
    "occupation",
    "relationship",
    "sex",
]

count_fig = plt.figure(figsize=(12, 10))
for slot, col in enumerate(categorical_cols, start=1):
    panel = count_fig.add_subplot(3, 2, slot)
    values = train_df[col]
    # Bars are ordered by descending frequency (value_counts() order).
    by_frequency = values.value_counts().index
    # hue mirrors y so the palette colours each bar; the legend would
    # only repeat the axis labels, so it is turned off.
    sns.countplot(
        y=values,
        order=by_frequency,
        hue=values,
        palette="viridis",
        legend=False,
        ax=panel,
    )
    panel.set_title(f"Distribution of {col}")

count_fig.tight_layout()
plt.show()