In [None]:
# =============================================================================
# 1. LOAD LIBRARIES AND DATASET
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import os
from pathlib import Path

# Inline plot settings
%matplotlib inline
plt.style.use("default")

# Path to your file
DATA_PATH      = Path(os.getenv(
    "DATASET_PATH",
    "data/processed_data/case_3.csv",
))

# Load the CSV
df = pd.read_csv(DATA_PATH)

print(f"Dataset shape: {df.shape}")
df.head()



In [None]:
# =============================================================================
# 2. OVERVIEW: DATA TYPES AND MISSING VALUES
# =============================================================================
# Column dtypes rendered as a one-column table
display(df.dtypes.to_frame(name="dtype"))

# Per-column count of missing values and that count as a percentage of all rows
missing_cnt = df.isna().sum()
missing_pct = 100 * missing_cnt / len(df)
missing_tbl = missing_cnt.to_frame(name="missing_cnt").assign(missing_pct=missing_pct)

# Show only the columns that actually have gaps, worst offenders first
display(
    missing_tbl
    .loc[lambda tbl: tbl["missing_cnt"] > 0]
    .sort_values(by="missing_pct", ascending=False)
)


In [None]:
# =============================================================================
# 3. DESCRIPTIVE STATISTICS
# =============================================================================
# Partition the columns into numeric and everything else; both names are
# reused by the plotting and correlation cells
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns

# Summary statistics, transposed so each feature is one row
print("Numeric features")
numeric_summary = df[num_cols].describe()
display(numeric_summary.transpose())

# Ten most frequent levels per non-numeric column; NaN is counted as a level
print("\nCategorical features - top 10 levels")
for col in cat_cols:
    print(f"\n{col}:")
    level_counts = df[col].value_counts(dropna=False)
    display(level_counts.head(10))


In [None]:
# =============================================================================
# 4. HISTOGRAMS AND BOX PLOTS (NUMERIC)
# =============================================================================
# One figure per numeric column: histogram on the left, horizontal box plot on
# the right. Missing values are removed before plotting.
for col in num_cols:
    observed = df[col].dropna()

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 3.5))
    ax_hist, ax_box = axes

    # Left panel: distribution over 30 bins
    ax_hist.hist(observed, bins=30)
    ax_hist.set_title(f"{col} - histogram")

    # Right panel: box plot laid out horizontally
    ax_box.boxplot(observed, vert=False)
    ax_box.set_title(f"{col} - box plot")

    fig.tight_layout()
    plt.show()


In [None]:
# =============================================================================
# 5. CORRELATION HEATMAP
# =============================================================================
# Absolute pairwise correlation of the numeric columns. The sign is dropped so
# strong negative relationships rank alongside strong positive ones.
corr = df[num_cols].corr().abs()

# `fmt` only formats cell annotations, so it has no effect unless `annot` is
# enabled. Annotations are turned on only for matrices small enough to stay
# legible at this figure size.
MAX_ANNOTATED_FEATURES = 20

plt.figure(figsize=(10, 8))
sns.heatmap(corr,
            cmap="viridis",
            square=True,
            annot=len(corr) <= MAX_ANNOTATED_FEATURES,
            fmt=".2f",
            cbar_kws={"shrink": .8})
plt.title("Absolute Correlation Matrix")
plt.tight_layout()
plt.show()

# High-correlation pairs (|r| > 0.80)
THRESHOLD = 0.80

# Keep the strict upper triangle (k=1) so every unordered pair appears exactly
# once and self-correlations on the diagonal are excluded; all other cells
# become NaN.
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))

# One vectorised comparison instead of a Python loop over every cell. NaN
# compares False, so masked cells and undefined correlations drop out here.
# np.nonzero returns positions in row-major order, matching a row-by-row scan.
row_pos, col_pos = np.nonzero(upper.to_numpy() > THRESHOLD)
high_corr = [(upper.index[i], upper.columns[j], upper.iat[i, j])
             for i, j in zip(row_pos, col_pos)]

# Tabulate the pairs, strongest first
high_corr_df = (pd.DataFrame(high_corr,
                             columns=["Feature 1", "Feature 2", "Correlation"])
                .sort_values("Correlation", ascending=False))
display(high_corr_df)


In [None]:
# TODO: add a short summary of correlation findings