
# 07 — Outliers: Z-score, Robust Z (MAD), IQR Rule 

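Work with two datasets: A (no outlier) and B (the same values plus one extreme value, 500). Compare the standard deviation (SD) with the median absolute deviation (MAD), apply the IQR fences, and visualize both datasets with boxplots.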


In [None]:

import numpy as np, pandas as pd
import matplotlib.pyplot as plt

A = np.array([25,30,35,40,45,50,55,60,65])
B = np.array([25,30,35,40,45,50,55,60,65,500])

def mad(arr):
    """Return the median absolute deviation (MAD) of ``arr``.

    The MAD is the median of the absolute distances between each value
    and the median of the data. No scaling constant is applied.
    """
    center = np.median(arr)
    abs_dev = np.abs(arr - center)
    return np.median(abs_dev)

def robust_z(x):
    """Return MAD-based robust z-scores for ``x``.

    Each value is centered on the median and scaled by the MAD, then
    multiplied by 0.6745 (the conventional modified z-score constant) so
    the result is on roughly the same scale as an ordinary z-score.

    When the MAD is exactly zero the scores are undefined, so an array
    of float zeros shaped like ``x`` is returned instead.
    """
    center = np.median(x)
    scale = mad(x)
    if scale != 0:
        return 0.6745 * (x - center) / scale
    # Degenerate spread: avoid dividing by zero and report "no deviation".
    return np.zeros_like(x, dtype=float)

def iqr_flags(x):
    """Flag values of ``x`` that fall outside the 1.5 * IQR fences.

    Returns a pair ``(flags, (lo, hi))`` where ``flags`` is a boolean
    mask that is True for values strictly below ``lo`` or strictly above
    ``hi``, and ``lo`` / ``hi`` are the lower and upper fences computed
    from the 25th and 75th percentiles.
    """
    lower_q, upper_q = np.percentile(x, [25, 75])
    reach = 1.5 * (upper_q - lower_q)
    lo = lower_q - reach
    hi = upper_q + reach
    too_low = x < lo
    too_high = x > hi
    return too_low | too_high, (lo, hi)

# Print classical (mean/SD) and robust (MAD) summaries for each dataset,
# followed by how many points each outlier rule flags.
for name, x in zip(("A", "B"), (A, B)):
    flags, (lo, hi) = iqr_flags(x)
    rz = robust_z(x)
    # ddof=1 gives the sample standard deviation.
    print(f"{name}: mean={np.mean(x):.2f}, sd={np.std(x, ddof=1):.2f}, MAD={mad(x):.2f}")
    print(f"  robust-Z>3 count: {(np.abs(rz)>3).sum()}, IQR flags: {flags.sum()}, fences={lo:.2f},{hi:.2f}")

# Simple plot (no seaborn, single axes, no color set):
# side-by-side boxplots of datasets A and B.
plt.figure()
plt.boxplot([A, B])
# Label the ticks explicitly instead of passing `labels=` to boxplot: that
# keyword was deprecated in Matplotlib 3.9 (renamed `tick_labels`). Boxes are
# drawn at x = 1, 2 by default, so this works on both old and new releases.
plt.xticks([1, 2], ["A", "B"])
plt.title("Boxplots (IQR) — Dataset A vs B")
plt.xlabel("Dataset")
plt.ylabel("Value")
plt.show()

# ---- Classwork ----
# 1) Winsorize dataset B at the IQR fences (clip values) and recompute mean and SD.
# 2) Compare the |robust-Z| > 3 rule with the IQR rule for detecting the 500 outlier. Which is more stable here?
