In [50]:
# Generate a modern pedagogical SVG diagram of the FeatureEngineer pipeline with pastel colors
# The SVG is saved to feature_engineer_pipeline_modern.svg for download.

from math import ceil
from textwrap import wrap

SVG_PATH = "feature_engineer_pipeline_modern.svg"

# ----------------------------
# Modern Pastel Color Palette
# ----------------------------
PALETTE = {
    "bg": "#FCFCFD",
    "lane_lbl": "#334155",
    "stroke": "#CBD5E1",
    "text": "#0F172A",
    "muted": "#475569",
    "title": "#0B1020",
    
    # Category fills (cards)
    "num1": "#E0F2FE",  # math blue
    "num2": "#EDE9FE",  # interactions purple
    "num3": "#FEF3C7",  # binning amber
    "num4": "#CCFBF1",  # clustering teal
    "num5": "#FEF9C3",  # stats yellow
    "num6": "#FCE7F3",  # fourier pink
    "corr": "#F1F5F9",  # corr removal gray
    "imp": "#FAE8FF",   # imputation violet
    "scale": "#E2F7F0", # scaling mint
    "ae": "#FFE4E6",    # autoencoder rose
    "cat1": "#DCFCE7",  # rare handling green
    "cat2": "#ECFCCB",  # target enc light green
    "cat3": "#E5E7EB",  # one-hot neutral
    "cat4": "#FFF7ED",  # freq enc peach
    "bool": "#F1F5F9",
    "sel1": "#E0E7FF",  # AdaptiveMI indigo
    "sel2": "#E9D5FF",  # SHAP lavender
    "join": "#F8FAFC",
    "out": "#F0FDFA",
    
    # Lane backgrounds
    "lane1": "#F8FAFC",
    "lane2": "#F9FAFB",
    "lane3": "#F8FAFC",
    "lane4": "#F9FAFB",
    "lane5": "#F8FAFC",
    
    "optional_dash": "#9CA3AF",
}

FONT = "Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, Noto Sans, sans-serif"

# ----------------------------
# Layout
# ----------------------------
# All values are SVG user units (the viewBox is "0 0 W H").
W, H = 1420, 1280   # canvas size; also used for the <svg> width/height
MARGIN = 32         # inset of the title and of every swimlane from the canvas edge
COL_W = 280         # default card width (copied into b_w further down)
ROW_H = 120         # NOTE(review): not referenced by the drawing code in this cell
GAP_X = 36          # horizontal gap between neighbouring cards
GAP_Y = 28          # vertical gap between card rows inside the numerical lane
RX = 16  # Slightly more rounded corners for modern look

# Top edge (y) of each swimlane; "header" is the baseline offset used by the
# title text and the legend card rather than a lane of its own.
LANE_Y = {
    "header": 20,
    "ingest": 110,
    "numerical": 250,
    "categorical": 720,
    "boolean": 900,
    "selection": 1060,
}

# Height of each swimlane rectangle, keyed like LANE_Y (no "header" entry).
LANE_H = {
    "ingest": 130,
    "numerical": 455,
    "categorical": 150,
    "boolean": 140,
    "selection": 160,
}

def esc(s: str) -> str:
    """Escape ``&``, ``<`` and ``>`` so *s* can be embedded as SVG text content."""
    # One translate pass maps every source character exactly once, which is
    # equivalent to replacing "&" first and then "<" and ">".
    return s.translate(str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"}))

parts = []
parts.append(f'<svg xmlns="http://www.w3.org/2000/svg" width="{W}" height="{H}" '
             f'viewBox="0 0 {W} {H}" style="background:{PALETTE["bg"]};">')

# --------- Defs ----------
parts.append(f"""
  <defs>
    <filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
      <feDropShadow dx="0" dy="2" stdDeviation="3" flood-color="#000000" flood-opacity="0.06"/>
    </filter>
    <filter id="glow" x="-50%" y="-50%" width="200%" height="200%">
      <feGaussianBlur stdDeviation="2" result="coloredBlur"/>
      <feMerge> 
        <feMergeNode in="coloredBlur"/>
        <feMergeNode in="SourceGraphic"/>
      </feMerge>
    </filter>
    <marker id="arrow" viewBox="0 0 10 10" refX="8.5" refY="5" markerWidth="8" markerHeight="8" orient="auto-start-reverse">
      <path d="M 0 0 L 10 5 L 0 10 z" fill="{PALETTE["stroke"]}" />
    </marker>
    <linearGradient id="titleGrad" x1="0%" y1="0%" x2="100%" y2="0%">
      <stop offset="0%" style="stop-color:{PALETTE["title"]};stop-opacity:1" />
      <stop offset="100%" style="stop-color:#475569;stop-opacity:0.8" />
    </linearGradient>
  </defs>
""")

# --------- Title ----------
title = "Feature Engineering Pipeline (2025+)"
subtitle = "Modern pedagogical diagram • enriched with SOTA blocks & pastel aesthetics"
parts.append(f'''
  <g>
    <text x="{MARGIN}" y="{LANE_Y["header"]+10}" font-family="{FONT}" font-size="28" font-weight="800" fill="url(#titleGrad)">{esc(title)}</text>
    <text x="{MARGIN}" y="{LANE_Y["header"]+38}" font-family="{FONT}" font-size="15" fill="{PALETTE["muted"]}" font-weight="500">{esc(subtitle)}</text>
  </g>
''')

# --------- Swimlanes ----------
def lane(y, h, label, fill):
    """Append one swimlane (background rectangle plus caption) to ``parts``.

    y, h  -- top edge and height of the lane
    label -- caption text; escaped with ``esc`` before being embedded
    fill  -- fill colour of the lane rectangle
    """
    lane_width = W - 2 * MARGIN
    corner = RX + 2
    border = PALETTE["stroke"]
    caption_fill = PALETTE["lane_lbl"]
    background = (
        f'<rect x="{MARGIN}" y="{y}" width="{lane_width}" height="{h}" rx="{corner}" '
        f'fill="{fill}" stroke="{border}" stroke-width="1.5" opacity="0.7"/>'
    )
    caption = (
        f'<text x="{MARGIN + 16}" y="{y + 26}" font-family="{FONT}" font-size="15" '
        f'fill="{caption_fill}" font-weight="700">{esc(label)}</text>'
    )
    parts.extend((background, caption))

lane(LANE_Y["ingest"], LANE_H["ingest"], "🔄 Ingestion & Type Inference", PALETTE["lane1"])
lane(LANE_Y["numerical"], LANE_H["numerical"], "🔢 Numerical Feature Engineering", PALETTE["lane2"])
lane(LANE_Y["categorical"], LANE_H["categorical"], "🏷️ Categorical Processing", PALETTE["lane3"])
lane(LANE_Y["boolean"], LANE_H["boolean"], "✅ Boolean Handling", PALETTE["lane4"])
lane(LANE_Y["selection"], LANE_H["selection"], "🎯 Selection • Scaling • Output", PALETTE["lane5"])

# --------- Block helpers ----------
def block(x, y, w, h, title, lines=None, optional=False, dashed=False, small=False, color="join"):
    """Append one rounded card (frame, title, optional body lines) to ``parts``.

    x, y, w, h -- top-left corner and size of the card
    title      -- bold heading drawn 24 units below the top edge
    lines      -- optional sequence of body strings, stacked 18 units apart
    optional   -- dashed frame with the glow filter plus an "optional" pill
    dashed     -- dashed frame without the pill
    small      -- smaller title/body fonts and a tighter body offset
    color      -- ``PALETTE`` key for the fill; unknown keys fall back to "join"
    """
    card_fill = PALETTE.get(color, PALETTE["join"])
    border = PALETTE["stroke"]
    dash_attr = ' stroke-dasharray="8,4"' if (optional or dashed) else ""
    effect = "url(#glow)" if optional else "url(#shadow)"
    if small:
        title_px, body_px, body_top = 13, 12, y + 34
    else:
        title_px, body_px, body_top = 15, 13, y + 40

    # Frame and heading are always drawn.
    svg = [
        f'<rect x="{x}" y="{y}" width="{w}" height="{h}" rx="{RX}" fill="{card_fill}" '
        f'stroke="{border}" stroke-width="1.5"{dash_attr} filter="{effect}"/>',
        f'<text x="{x+16}" y="{y+24}" font-family="{FONT}" font-size="{title_px}" '
        f'font-weight="700" fill="{PALETTE["text"]}">{esc(title)}</text>',
    ]

    # Body lines, one <text> element each.
    svg.extend(
        f'<text x="{x+16}" y="{body_top + row*18}" font-family="{FONT}" font-size="{body_px}" '
        f'fill="{PALETTE["muted"]}" font-weight="400">{esc(text)}</text>'
        for row, text in enumerate(lines or ())
    )

    # "optional" pill straddling the top-right edge of the card.
    if optional:
        pill_ink = PALETTE["optional_dash"]
        svg.append(
            f'<rect x="{x+w-76}" y="{y-12}" rx="12" ry="12" width="68" height="24" '
            f'fill="{PALETTE["bg"]}" stroke="{pill_ink}" stroke-width="1.5"/>'
        )
        svg.append(
            f'<text x="{x+w-42}" y="{y+4}" text-anchor="middle" font-family="{FONT}" '
            f'font-size="11" fill="{pill_ink}" font-weight="600">optional</text>'
        )

    parts.extend(svg)

def arrow(x1, y1, x2, y2, curved=False):
    """Append a connector from (x1, y1) to (x2, y2), ending in the "arrow" marker.

    With ``curved`` set, a quadratic Bézier is drawn whose control point sits
    at the horizontal midpoint, 20 units above the higher endpoint; otherwise
    a straight line is drawn.
    """
    ink = PALETTE["stroke"]
    if not curved:
        parts.append(f'<line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}" stroke="{ink}" stroke-width="2.5" marker-end="url(#arrow)"/>')
        return

    ctrl_x = (x1 + x2) / 2
    ctrl_y = min(y1, y2) - 20
    parts.append(f'<path d="M {x1} {y1} Q {ctrl_x} {ctrl_y} {x2} {y2}" stroke="{ink}" stroke-width="2.5" fill="none" marker-end="url(#arrow)"/>')

# --------- Ingestion ----------
x0 = MARGIN + 16
b_w = COL_W
b_h = 90

block(x0, LANE_Y["ingest"]+36, b_w, b_h, "📊 Input Data X, y", [
    "• X ∈ ℝⁿˣᵖ, y ∈ ℝⁿ",
    "• Mixed dtypes, NaNs handled",
    "• Robust data validation"
], color="imp")

block(x0 + (b_w+GAP_X), LANE_Y["ingest"]+36, b_w, b_h, "🔍 Type Inference", [
    "• Detect {ℝ, Cat, Bool, Time}",
    "• Smart dtype parsing",
    "• DateTime recognition"
], color="corr")

arrow(x0 + b_w, LANE_Y["ingest"]+36+b_h/2, x0+(b_w+GAP_X), LANE_Y["ingest"]+36+b_h/2)

# --------- Numerical ----------
nx = MARGIN + 16
ny = LANE_Y["numerical"] + 36

# Row 1
block(nx, ny, b_w, 120, "⚡ Math Transforms", [
    "• log, √, 1/x, Box-Cox/Yeo-Johnson",
    "• Cyclical encodings (sin, cos)",
    "• Power transformations"
], color="num1")

block(nx+(b_w+GAP_X), ny, b_w, 120, "🔗 Smart Interactions", [
    "• Polynomial {x², x·y}, ratios",
    "• HSIC / MI gating ≤50 kept",
    "• Automated feature crossing"
], color="num2")

block(nx+2*(b_w+GAP_X), ny, b_w, 120, "📊 Adaptive Binning", [
    "• Bayesian blocks, FD rule",
    "• Auto choice by entropy",
    "• Equal-width/frequency bins"
], color="num3")

# Row 2
ny2 = ny+120+GAP_Y
block(nx, ny2, b_w, 110, "🎯 Clustering Features", [
    "• KMeans/GMM cluster IDs",
    "• Distances, silhouette scores",
    "• Cluster membership probs"
], optional=True, color="num4")

block(nx+(b_w+GAP_X), ny2, b_w, 110, "📈 Row-wise Statistics", [
    "• μ, σ, min, max, median, skew",
    "• Non-null count, null ratio",
    "• Percentile features"
], optional=True, color="num5")

block(nx+2*(b_w+GAP_X), ny2, b_w, 110, "🌊 Fourier & Wavelet", [
    "• FFT |X(f)| top-k frequencies",
    "• DWT multi-resolution coeffs",
    "• Spectral density features"
], optional=True, color="num6")

# Row 3 - SOTA
ny3 = ny2+110+GAP_Y
block(nx, ny3, b_w, 100, "🕸️ Graph Features", [
    "• kNN/correlation graphs",
    "• Node2vec embeddings",
    "• Graph Laplacian features"
], optional=True, dashed=True, color="ae")

block(nx+(b_w+GAP_X), ny3, b_w, 100, "⚠️ Drift Detection", [
    "• KS/PSI tests train vs test",
    "• Population stability index",
    "• Drop unstable features"
], optional=True, dashed=True, color="corr")

# --------- Categorical ----------
cx = MARGIN+16
cy = LANE_Y["categorical"]+36

block(cx, cy, b_w, 100, "🧹 Rare Category Handling", [
    "• p < 1% → 'OTHER'",
    "• Mode imputation",
    "• Smart grouping"
], color="cat1")

block(cx+(b_w+GAP_X), cy, b_w, 100, "🎯 Target Encoding", [
    "• μc = (nc·ȳc + α·ȳ)/(nc+α)",
    "• Regularized smoothing",
    "• Cross-validation aware"
], color="cat2")

block(cx+2*(b_w+GAP_X), cy, b_w, 100, "🔢 One-Hot Encoding", [
    "• Ignore unknown categories",
    "• Drop binary redundancy",
    "• Sparse matrix optimization"
], color="cat3")

block(cx+3*(b_w+GAP_X), cy, b_w, 100, "🤖 Embedding Models", [
    "• Tab-transformer, cat2vec",
    "• LLM embeddings for text",
    "• Neural categorical encoding"
], optional=True, dashed=True, color="cat4")

# --------- Boolean ----------
bx = MARGIN+16
by = LANE_Y["boolean"]+36
block(bx, by, b_w*1.5, 88, "✅ Boolean Processing", [
    "• fillna(mode) → {0,1}",
    "• Logical feature combinations"
], color="bool")

# --------- Selection ----------
sx = MARGIN+16
sy = LANE_Y["selection"]+32

block(sx, sy, b_w, 100, "🧠 Adaptive MI", [
    "• MI(x;y) KSG/Copula estimator",
    "• Retain MI > 10⁻³",
    "• Non-linear dependencies"
], dashed=True, color="sel1")

block(sx+(b_w+GAP_X), sy, b_w, 100, "🔍 SHAP Pruning", [
    "• Mean |φⱼ| > θ (θ≈10⁻³)",
    "• Model-agnostic importance",
    "• Interaction effects"
], dashed=True, color="sel2")

block(sx+2*(b_w+GAP_X), sy, b_w+100, 100, "🔗 Concatenate", [
    "• numerical + categorical + boolean",
    "• Smart dtype handling",
    "• → X_final ∈ ℝⁿˣᵈ"
], color="join")

block(sx+2*(b_w+GAP_X)+(b_w+100+GAP_X), sy, b_w, 100, "🎉 Final Output", [
    "• Clean, encoded matrix",
    "• Optimized for ML pipeline",
    "• Production-ready features"
], color="out")

# Curved inter-lane arrows for visual flow (currently disabled: the calls below are commented out)
# arrow(x0 + b_w/2, LANE_Y["ingest"]+36+b_h, nx + b_w/2, ny, curved=True)
# arrow(nx + b_w/2, ny + 120, cx + b_w/2, cy, curved=True)
# arrow(cx + b_w/2, cy + 100, bx + b_w*0.75, by, curved=True)
# arrow(bx + b_w*0.75, by + 88, sx + b_w/2, sy, curved=True)

# --------- Modern Legend ----------
# Legend card in the top-right corner, level with the title.
lgx = W-MARGIN-420
lgy = LANE_Y["header"]+10
parts.append(f'<g transform="translate({lgx},{lgy})">')
parts.append(f'<rect x="0" y="0" width="400" height="110" rx="{RX}" fill="{PALETTE["join"]}" stroke="{PALETTE["stroke"]}" stroke-width="1.5" filter="url(#shadow)"/>')
parts.append(f'<text x="16" y="24" font-family="{FONT}" font-size="15" font-weight="700" fill="{PALETTE["text"]}">🎨 Legend</text>')

# Each entry: (label, description, swatch kind, PALETTE key for the swatch fill).
# The swatch kind is stated explicitly so the drawing code does not depend on
# the wording of the label.
# NOTE: the curved inter-lane arrows are currently commented out earlier in
# this cell, so the last entry describes a connector the diagram does not draw.
legend_items = [
    ("Solid block", "standard processing", "solid", "num1"),
    ("Dashed block", "optional/SOTA methods", "dashed", "sel2"),
    ("Curved arrow", "data flow between lanes", "arrow", None),
]

y_offset = 38
row_pitch = 22
for i, (label, desc, kind, color) in enumerate(legend_items):
    y_pos = y_offset + i * row_pitch
    if kind == "arrow":
        parts.append(f'<path d="M 16 {y_pos} Q 32 {y_pos-8} 48 {y_pos}" stroke="{PALETTE["stroke"]}" stroke-width="2.5" fill="none" marker-end="url(#arrow)"/>')
    else:
        dash = ' stroke-dasharray="8,4"' if kind == "dashed" else ""
        parts.append(f'<rect x="16" y="{y_pos-8}" width="32" height="16" rx="6" fill="{PALETTE[color]}" stroke="{PALETTE["stroke"]}" stroke-width="1.5"{dash}/>')

    # Label and description share one baseline. Stacking the description
    # 14 units below the label does not fit in a 22-unit row: it ran into the
    # label of the following entry.
    parts.append(
        f'<text x="56" y="{y_pos+4}" font-family="{FONT}">'
        f'<tspan font-size="13" fill="{PALETTE["text"]}" font-weight="600">{esc(label)}</tspan>'
        f'<tspan dx="8" font-size="11" fill="{PALETTE["muted"]}">{esc(desc)}</tspan>'
        f'</text>'
    )

parts.append('</g>')

# Add a subtle watermark
parts.append(f'<text x="{W-MARGIN-100}" y="{H-20}" font-family="{FONT}" font-size="10" fill="{PALETTE["optional_dash"]}" opacity="0.6">Generated with ❤️</text>')

parts.append("</svg>")

# Write the SVG file
with open(SVG_PATH, "w", encoding="utf-8") as f:
    f.write("".join(parts))

print(f"✨ Modern pastel Feature Engineering Pipeline diagram generated!")
print(f"📁 Saved to: {SVG_PATH}")
print(f"🎨 Features: Pastel colors, modern typography, enhanced visual hierarchy")
print(f"📏 Dimensions: {W}×{H}px")

✨ Modern pastel Feature Engineering Pipeline diagram generated!
📁 Saved to: feature_engineer_pipeline_modern.svg
🎨 Features: Pastel colors, modern typography, enhanced visual hierarchy
📏 Dimensions: 1420×1280px


In [47]:
# Generate an enhanced SOTA Feature Engineering Pipeline with 2024-2025 techniques
# Includes: Neural FE, AutoML, Foundation Models, Causal Discovery, etc.

from math import ceil
from textwrap import wrap

SVG_PATH = "feature_engineer_pipeline_sota.svg"

# ----------------------------
# Enhanced Style (modern)
# ----------------------------
# Colours used by the drawing helpers; keys are looked up by name.
PALETTE = {
    "bg": "#FEFEFE",
    "lane": "#F8FAFF",
    "lane2": "#FFF8F5",
    "lane3": "#F5FFF8",
    "lane4": "#FFF5F8",
    "card": "#FFFFFF",
    "neural": "#E0F2FE",
    "foundation": "#F3E8FF",
    "causal": "#ECFDF5",
    "stroke": "#B8C5D1",
    "text": "#0F172A",
    "muted": "#475569",
    "accent": "#7C3AED",
    "optional_dash": "#94A3B8",
    "join": "#E0E7FF",
    "title": "#0F172A",
    "sota": "#DC2626",
    "new": "#059669",
}

FONT = "Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, Noto Sans, sans-serif"

# ----------------------------
# Enhanced Layout
# ----------------------------
# All values are SVG user units. The canvas is 60 units taller than the
# original 1600 to make room for the enlarged numerical lane (see LANE_H).
W, H = 1600, 1660
MARGIN = 32
COL_W = 280
ROW_H = 120
GAP_X = 32
GAP_Y = 24
RX = 12

# Top edge of each swimlane. Every lane after "numerical" sits 60 units lower
# than before so the 20-unit gap between lanes is kept.
LANE_Y = {
    "header": 20,
    "ingest": 120,
    "neural": 280,
    "numerical": 460,
    "categorical": 860,
    "foundation": 1020,
    "causal": 1180,
    "selection": 1380,
}

# Lane heights. The numerical lane holds three card rows (100, 90 and 85 tall,
# GAP_Y apart, starting 40 below the lane top), i.e. 363 units of content.
# At the previous height of 320 the third row spilled into the categorical lane.
LANE_H = {
    "ingest": 140,
    "neural": 160,
    "numerical": 380,
    "categorical": 140,
    "foundation": 140,
    "causal": 140,
    "selection": 180,
}

def esc(s: str) -> str:
    """Return *s* with ``&``, ``<`` and ``>`` replaced by their XML entities."""
    # "&" goes first so the entities produced for "<" and ">" are not escaped again.
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        s = s.replace(raw, entity)
    return s

parts = []
parts.append(f'<svg xmlns="http://www.w3.org/2000/svg" width="{W}" height="{H}" '
             f'viewBox="0 0 {W} {H}" style="background:{PALETTE["bg"]};">')

# --------- Enhanced Defs ----------
parts.append(f"""
  <defs>
    <filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
      <feDropShadow dx="0" dy="2" stdDeviation="3" flood-color="#000000" flood-opacity="0.1"/>
    </filter>
    <filter id="glow" x="-50%" y="-50%" width="200%" height="200%">
      <feGaussianBlur stdDeviation="2" result="coloredBlur"/>
      <feMerge>
        <feMergeNode in="coloredBlur"/>
        <feMergeNode in="SourceGraphic"/>
      </feMerge>
    </filter>
    <marker id="arrow" viewBox="0 0 10 10" refX="8.5" refY="5" markerWidth="8" markerHeight="8" orient="auto-start-reverse">
      <path d="M 0 0 L 10 5 L 0 10 z" fill="{PALETTE["stroke"]}" />
    </marker>
    <linearGradient id="neural-grad" x1="0%" y1="0%" x2="100%" y2="100%">
      <stop offset="0%" style="stop-color:{PALETTE['neural']};stop-opacity:1" />
      <stop offset="100%" style="stop-color:#FFFFFF;stop-opacity:1" />
    </linearGradient>
    <linearGradient id="foundation-grad" x1="0%" y1="0%" x2="100%" y2="100%">
      <stop offset="0%" style="stop-color:{PALETTE['foundation']};stop-opacity:1" />
      <stop offset="100%" style="stop-color:#FFFFFF;stop-opacity:1" />
    </linearGradient>
  </defs>
""")

# --------- Enhanced Title ----------
title = "State-of-the-Art Feature Engineering Pipeline (2025)"
subtitle = "Neural FE • Foundation Models • Causal Discovery • AutoML Integration"
parts.append(f'''
  <g>
    <text x="{MARGIN}" y="{LANE_Y["header"]+12}" font-family="{FONT}" font-size="28" font-weight="800" fill="{PALETTE["title"]}">{esc(title)}</text>
    <text x="{MARGIN}" y="{LANE_Y["header"]+42}" font-family="{FONT}" font-size="15" fill="{PALETTE["accent"]}" font-weight="500">{esc(subtitle)}</text>
    <rect x="{W-MARGIN-80}" y="{LANE_Y["header"]}" width="70" height="25" rx="12" fill="{PALETTE["sota"]}" opacity="0.9"/>
    <text x="{W-MARGIN-45}" y="{LANE_Y["header"]+17}" text-anchor="middle" font-family="{FONT}" font-size="11" fill="white" font-weight="700">2025 SOTA</text>
  </g>
''')

# --------- Enhanced Swimlanes ----------
def lane(y, h, label, fill, icon=""):
    """Append one swimlane (background rectangle plus caption) to ``parts``.

    y, h  -- top edge and height of the lane
    label -- caption text; escaped with ``esc``
    fill  -- fill colour of the lane rectangle
    icon  -- text placed verbatim before the caption, separated by one space
    """
    lane_width = W - 2 * MARGIN
    border = PALETTE["stroke"]
    caption_fill = PALETTE["text"]
    caption = f"{icon} {esc(label)}"
    parts.extend([
        f'<rect x="{MARGIN}" y="{y}" width="{lane_width}" height="{h}" rx="{RX}" '
        f'fill="{fill}" stroke="{border}" opacity="0.6" stroke-width="1.5"/>',
        f'<text x="{MARGIN + 16}" y="{y + 28}" font-family="{FONT}" font-size="15" '
        f'fill="{caption_fill}" font-weight="700">{caption}</text>',
    ])

lane(LANE_Y["ingest"], LANE_H["ingest"], "Ingestion & Advanced Type Detection", PALETTE["lane3"], "📊")
lane(LANE_Y["neural"], LANE_H["neural"], "Neural Feature Engineering", PALETTE["neural"], "🧠")
lane(LANE_Y["numerical"], LANE_H["numerical"], "Enhanced Numerical Processing", PALETTE["lane"], "📈")
lane(LANE_Y["categorical"], LANE_H["categorical"], "Advanced Categorical Encoding", PALETTE["lane2"], "🏷️")
lane(LANE_Y["foundation"], LANE_H["foundation"], "Foundation Model Features", PALETTE["foundation"], "🤖")
lane(LANE_Y["causal"], LANE_H["causal"], "Causal & Graph Discovery", PALETTE["causal"], "🔗")
lane(LANE_Y["selection"], LANE_H["selection"], "Adaptive Selection & AutoML", PALETTE["lane4"], "⚡")

# --------- Enhanced Block helpers ----------
def block(x, y, w, h, title, lines=None, optional=False, dashed=False, small=False, join=False,
          special="", badge="", fill_override=None):
    """Append one rounded card (frame, badge, title, body lines) to ``parts``.

    x, y, w, h    -- top-left corner and size of the card
    title         -- bold heading drawn 24 units below the top edge
    lines         -- optional sequence of body strings, stacked 17 units apart
    optional      -- dashed frame plus an "optional" pill on the top-right edge
    dashed        -- dashed frame without the pill
    small         -- smaller title/body fonts and a tighter body offset
    join          -- use the "join" palette fill
    special       -- "neural" or "foundation" selects the matching gradient;
                     any non-empty value switches the filter from shadow to glow
    badge         -- short text shown in a small pill inside the top-right corner
    fill_override -- explicit fill; takes precedence over join/special
    """
    if fill_override:
        fill = fill_override
    elif join:
        fill = PALETTE["join"]
    elif special == "neural":
        fill = "url(#neural-grad)"
    elif special == "foundation":
        fill = "url(#foundation-grad)"
    else:
        fill = PALETTE["card"]

    stroke = PALETTE["stroke"]
    dash = ' stroke-dasharray="8,4"' if dashed or optional else ""
    filter_effect = ' filter="url(#glow)"' if special else ' filter="url(#shadow)"'

    parts.append(f'<rect x="{x}" y="{y}" width="{w}" height="{h}" rx="{RX}" fill="{fill}" stroke="{stroke}"{dash}{filter_effect} stroke-width="1.5"/>')

    if badge:
        # The "optional" pill covers y-12 .. y+12 in the same corner. When both
        # are requested the badge is moved below it so the two pills do not overlap.
        badge_top = y + (14 if optional else 4)
        parts.append(f'<rect x="{x+w-60}" y="{badge_top}" rx="8" ry="8" width="50" height="18" fill="{PALETTE["new"]}" opacity="0.9"/>')
        # Badge text is escaped like every other caller-supplied string.
        parts.append(f'<text x="{x+w-35}" y="{badge_top+12}" text-anchor="middle" font-family="{FONT}" font-size="10" fill="white" font-weight="700">{esc(badge)}</text>')

    parts.append(f'<text x="{x+16}" y="{y+24}" font-family="{FONT}" font-size="{13 if small else 15}" font-weight="700" fill="{PALETTE["text"]}">{esc(title)}</text>')

    if lines:
        start_y = y + (42 if not small else 36)
        fs = 12 if not small else 11
        for i, line in enumerate(lines):
            parts.append(f'<text x="{x+16}" y="{start_y + i*17}" font-family="{FONT}" font-size="{fs}" fill="{PALETTE["muted"]}">{esc(line)}</text>')

    if optional:
        parts.append(f'<rect x="{x+w-80}" y="{y-12}" rx="10" ry="10" width="70" height="24" fill="{PALETTE["bg"]}" stroke="{PALETTE["optional_dash"]}" stroke-width="1.5"/>')
        parts.append(f'<text x="{x+w-45}" y="{y+4}" text-anchor="middle" font-family="{FONT}" font-size="11" fill="{PALETTE["optional_dash"]}" font-weight="600">optional</text>')

def arrow(x1, y1, x2, y2, curved=False):
    """Append a connector from (x1, y1) to (x2, y2), ending in the "arrow" marker.

    ``curved`` selects a quadratic Bézier whose control point lies at the
    horizontal midpoint, 20 units above the higher endpoint; the default is a
    straight line.
    """
    ink = PALETTE["stroke"]
    if curved:
        control = ((x1 + x2) / 2, min(y1, y2) - 20)
        element = (
            f'<path d="M {x1} {y1} Q {control[0]} {control[1]} {x2} {y2}" '
            f'stroke="{ink}" stroke-width="2.5" fill="none" marker-end="url(#arrow)"/>'
        )
    else:
        element = (
            f'<line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}" '
            f'stroke="{ink}" stroke-width="2.5" marker-end="url(#arrow)"/>'
        )
    parts.append(element)

# --------- Enhanced Ingestion ----------
x0 = MARGIN + 20
b_w = COL_W
b_h = 100

block(x0, LANE_Y["ingest"]+40, b_w, b_h, "Smart Data Ingestion", [
    "• Streaming + batch unification",
    "• Auto-schema detection",
    "• Quality profiling (Great Expectations)"
])

block(x0 + (b_w+GAP_X), LANE_Y["ingest"]+40, b_w, b_h, "ML-Based Type Detection", [
    "• LLM semantic typing",
    "• Fuzzy datetime parsing", 
    "• Multi-modal detection"
], badge="NEW")

block(x0 + 2*(b_w+GAP_X), LANE_Y["ingest"]+40, b_w, b_h, "Data Drift Monitor", [
    "• Real-time distribution shift",
    "• Concept drift detection",
    "• Alert system integration"
], badge="2025")

arrow(x0 + b_w, LANE_Y["ingest"]+40+b_h/2, x0+(b_w+GAP_X), LANE_Y["ingest"]+40+b_h/2)
arrow(x0 + (b_w+GAP_X) + b_w, LANE_Y["ingest"]+40+b_h/2, x0+2*(b_w+GAP_X), LANE_Y["ingest"]+40+b_h/2)

# --------- Neural Feature Engineering ----------
nx_neural = MARGIN + 20
ny_neural = LANE_Y["neural"] + 40

block(nx_neural, ny_neural, b_w, 100, "Learned Embeddings", [
    "• TabNet feature selection",
    "• NODE/SAINT transformers",
    "• End-to-end learnable FE"
], special="neural", badge="SOTA")

block(nx_neural + (b_w+GAP_X), ny_neural, b_w, 100, "Neural Interactions", [
    "• Deep FM factorization",
    "• xDeepFM compressed interaction",
    "• Attention-based selection"
], special="neural")

block(nx_neural + 2*(b_w+GAP_X), ny_neural, b_w, 100, "Automated FE", [
    "• AutoML feature synthesis",
    "• Genetic programming",
    "• Neural architecture search"
], special="neural", badge="AUTO")

# --------- Enhanced Numerical ----------
nx = MARGIN + 20
ny = LANE_Y["numerical"] + 40

block(nx, ny, b_w, 100, "Advanced Transforms", [
    "• Quantile/Power transforms",
    "• Robust scalers (Huber, etc)",
    "• Seasonal decomposition"
])

block(nx+(b_w+GAP_X), ny, b_w, 100, "Interaction Discovery", [
    "• SHAP-based interaction ranking",
    "• Symbolic regression (PySR)",
    "• Automated polynomial search"
], badge="NEW")

block(nx+2*(b_w+GAP_X), ny, b_w, 100, "Time-Aware Features", [
    "• Lag features with auto-correlation",
    "• Rolling window statistics",
    "• Trend & seasonality extraction"
])

# Second row
ny2 = ny + 100 + GAP_Y
block(nx, ny2, b_w, 90, "Representation Learning", [
    "• t-SNE/UMAP embeddings",
    "• Variational autoencoders",
    "• Contrastive learning"
], optional=True)

block(nx+(b_w+GAP_X), ny2, b_w, 90, "Signal Processing", [
    "• Wavelet transforms",
    "• Spectral features (FFT)",
    "• Empirical mode decomposition"
], optional=True)

block(nx+2*(b_w+GAP_X), ny2, b_w, 90, "Physics-Informed", [
    "• Domain knowledge injection",
    "• Conservation law constraints",
    "• Invariance-aware features"
], optional=True, dashed=True, badge="2025")

# Third row
ny3 = ny2 + 90 + GAP_Y
block(nx, ny3, b_w, 85, "Graph-Based Features", [
    "• Network centrality measures",
    "• Community detection features",
    "• Graph neural net embeddings"
], optional=True, dashed=True)

block(nx+(b_w+GAP_X), ny3, b_w, 85, "Stability Analysis", [
    "• Permutation importance",
    "• Leave-one-covariate-out",
    "• Adversarial robustness"
], optional=True, dashed=True)

# --------- Advanced Categorical ----------
cx = MARGIN + 20
cy = LANE_Y["categorical"] + 40

block(cx, cy, b_w, 90, "LLM-Enhanced Encoding", [
    "• Semantic similarity grouping",
    "• Context-aware embeddings",
    "• Multi-language support"
], badge="LLM")

block(cx+(b_w+GAP_X), cy, b_w, 90, "Hierarchical Encoding", [
    "• Tree-structured categories",
    "• Multi-level target encoding",
    "• Taxonomy-aware features"
])

block(cx+2*(b_w+GAP_X), cy, b_w, 90, "Dynamic Encoding", [
    "• Online target encoding",
    "• Frequency-based adaptation",
    "• Cold-start handling"
])

block(cx+3*(b_w+GAP_X), cy, b_w, 90, "Entity Embeddings", [
    "• Deep learning categorical emb.",
    "• Learned similarity metrics",
    "• Transfer from pre-trained"
], optional=True)

# --------- Foundation Model Features ----------
fx = MARGIN + 20
fy = LANE_Y["foundation"] + 40

block(fx, fy, b_w+40, 90, "LLM Feature Extraction", [
    "• GPT/BERT text embeddings",
    "• Instruction-based feature generation",
    "• Few-shot feature engineering"
], special="foundation", badge="GPT-4")

block(fx+(b_w+40+GAP_X), fy, b_w+40, 90, "Multimodal Features", [
    "• CLIP vision-language features",
    "• Audio spectral embeddings",
    "• Cross-modal attention"
], special="foundation", badge="CLIP")

block(fx+2*(b_w+40+GAP_X), fy, b_w, 90, "Tool Integration", [
    "• Code generation for FE",
    "• Automated EDA insights",
    "• Natural language queries"
], special="foundation")

# --------- Causal & Graph Discovery ----------
gx = MARGIN + 20
gy = LANE_Y["causal"] + 40

block(gx, gy, b_w, 90, "Causal Discovery", [
    "• PC/GES/LiNGAM algorithms",
    "• Causal feature selection",
    "• Confounding adjustment"
], fill_override=PALETTE["causal"], badge="CAUSAL")

block(gx+(b_w+GAP_X), gy, b_w, 90, "Treatment Effects", [
    "• Propensity score features",
    "• Instrumental variables",
    "• Double/debiased ML"
], fill_override=PALETTE["causal"])

block(gx+2*(b_w+GAP_X), gy, b_w, 90, "Graph Neural Features", [
    "• Message passing aggregation",
    "• Graph attention networks",
    "• Heterogeneous graph emb."
], fill_override=PALETTE["causal"])

block(gx+3*(b_w+GAP_X), gy, b_w, 90, "Fairness Constraints", [
    "• Bias-aware feature selection",
    "• Demographic parity features",
    "• Counterfactual fairness"
], fill_override=PALETTE["causal"], optional=True)

# --------- Adaptive Selection & AutoML ----------
sx = MARGIN + 20
sy = LANE_Y["selection"] + 35

block(sx, sy, b_w, 100, "Multi-Objective Selection", [
    "• Pareto-optimal feature sets",
    "• Performance vs interpretability",
    "• Stability-aware selection"
], dashed=True, badge="MULTI")

block(sx+(b_w+GAP_X), sy, b_w, 100, "AutoML Integration", [
    "• AutoGluon/H2O.ai pipeline",
    "• Neural architecture search",
    "• Hyperparameter optimization"
], dashed=True, badge="AUTO")

block(sx+2*(b_w+GAP_X), sy, b_w+60, 100, "Smart Concatenation", [
    "• Learned feature fusion",
    "• Attention-weighted combination",
    "• Dynamic feature routing"
], join=True)

block(sx+2*(b_w+GAP_X)+(b_w+60+GAP_X), sy, b_w, 100, "Production Pipeline", [
    "• MLOps integration",
    "• A/B testing framework",
    "• Real-time inference"
])

# Enhanced arrows with curved connections
arrow(x0 + b_w/2, LANE_Y["ingest"] + 40 + b_h, nx_neural + b_w/2, LANE_Y["neural"] + 40, curved=True)
arrow(nx_neural + b_w/2, LANE_Y["neural"] + 40 + 100, nx + b_w/2, LANE_Y["numerical"] + 40, curved=True)

# --------- Enhanced Legend ----------
# The title block draws a 70-wide "2025 SOTA" pill at x = W - MARGIN - 80,
# spanning y = header .. header + 25. Anchoring this 400-wide card at
# W - MARGIN - 420 painted it over that pill, so the card is placed 16 units
# to the left of the pill instead.
lgx = W - MARGIN - 80 - 16 - 400
lgy = LANE_Y["header"] + 10
parts.append(f'<g transform="translate({lgx},{lgy})">')
parts.append(f'<rect x="0" y="0" width="400" height="120" rx="{RX}" fill="{PALETTE["card"]}" stroke="{PALETTE["stroke"]}" filter="url(#shadow)" stroke-width="1.5"/>')
parts.append(f'<text x="16" y="24" font-family="{FONT}" font-size="14" font-weight="700" fill="{PALETTE["text"]}">Enhanced Legend</text>')

# Legend items: (label, description, swatch style)
items = [
    ("Standard block", "", "solid"),
    ("Optional/SOTA", "(cutting-edge)", "dashed"),
    ("Neural FE", "(deep learning)", "neural"),
    ("Foundation", "(LLMs/multimodal)", "foundation")
]

# Swatch style -> (fill, stroke colour, extra attributes). The four swatches
# differ only in these values, so one template draws all of them.
swatch_styles = {
    "solid": (PALETTE["card"], PALETTE["stroke"], ""),
    "dashed": (PALETTE["card"], PALETTE["optional_dash"], ' stroke-dasharray="6,3"'),
    "neural": ("url(#neural-grad)", PALETTE["stroke"], ""),
    "foundation": ("url(#foundation-grad)", PALETTE["stroke"], ""),
}

y_offset = 35
for i, (label, desc, style) in enumerate(items):
    y_pos = y_offset + i * 18
    # An unrecognised style draws no swatch, only the caption.
    if style in swatch_styles:
        swatch_fill, swatch_stroke, swatch_extra = swatch_styles[style]
        parts.append(f'<rect x="16" y="{y_pos}" width="24" height="12" rx="3" fill="{swatch_fill}" stroke="{swatch_stroke}"{swatch_extra} stroke-width="1.5"/>')

    # Skip the empty description so no trailing space is emitted, and escape
    # the caption like the other text in the diagram.
    caption = " ".join(piece for piece in (label, desc) if piece)
    parts.append(f'<text x="48" y="{y_pos + 9}" font-family="{FONT}" font-size="12" fill="{PALETTE["muted"]}">{esc(caption)}</text>')

parts.append('</g>')

# Add version badge
parts.append(f'<rect x="{MARGIN}" y="{H-50}" width="200" height="30" rx="15" fill="{PALETTE["accent"]}" opacity="0.1" stroke="{PALETTE["accent"]}" stroke-width="1.5"/>')
parts.append(f'<text x="{MARGIN+100}" y="{H-32}" text-anchor="middle" font-family="{FONT}" font-size="12" fill="{PALETTE["accent"]}" font-weight="600">Enhanced Pipeline v2025.1</text>')

parts.append("</svg>")

# Save the enhanced pipeline
with open(SVG_PATH, "w", encoding="utf-8") as f:
    f.write("".join(parts))

print(f"Enhanced SOTA Feature Engineering Pipeline saved to: {SVG_PATH}")
SVG_PATH

Enhanced SOTA Feature Engineering Pipeline saved to: feature_engineer_pipeline_sota.svg


'feature_engineer_pipeline_sota.svg'

In [3]:
# Regenerate the SVG with a livelier, pastel scientific palette and improved spacing.
# Output path: feature_engineer_pipeline_colored.svg (see SVG_PATH below)

from math import ceil

SVG_PATH = "feature_engineer_pipeline_colored.svg"

# ----------------------------
# Style (pastel, accessible)
# ----------------------------
PALETTE = {
    "bg": "#FCFCFD",
    "lane_lbl": "#334155",
    "stroke": "#CBD5E1",
    "text": "#0F172A",
    "muted": "#475569",
    "title": "#0B1020",

    # Category fills (cards)
    "num1": "#E0F2FE",   # math blue
    "num2": "#EDE9FE",   # interactions purple
    "num3": "#FEF3C7",   # binning amber
    "num4": "#CCFBF1",   # clustering teal
    "num5": "#FEF9C3",   # stats yellow
    "num6": "#FCE7F3",   # fourier pink
    "corr": "#F1F5F9",   # corr removal gray
    "imp":  "#FAE8FF",   # imputation violet
    "scale":"#E2F7F0",   # scaling mint
    "ae":   "#FFE4E6",   # autoencoder rose

    "cat1": "#DCFCE7",   # rare handling green
    "cat2": "#ECFCCB",   # target enc light green
    "cat3": "#E5E7EB",   # one-hot neutral
    "cat4": "#FFF7ED",   # freq enc peach

    "bool": "#F1F5F9",

    "sel1": "#E0E7FF",   # AdaptiveMI indigo
    "sel2": "#E9D5FF",   # SHAP lavender
    "join": "#F8FAFC",
    "out":  "#F0FDFA",

    # lane backgrounds
    "lane1": "#F8FAFC",
    "lane2": "#F9FAFB",
    "lane3": "#F8FAFC",
    "lane4": "#F9FAFB",
    "lane5": "#F8FAFC",
}

FONT = "Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, Noto Sans, sans-serif"

# ----------------------------
# Layout (more space)
# ----------------------------
W, H = 1800, 1250
MARGIN = 36
COL_W = 310
ROW_H = 120
GAP_X = 48     # wider
GAP_Y = 32     # taller
RX = 14

LANE_Y = {
    "header": 20,
    "ingest": 120,
    "numerical": 280,
    "categorical": 570,
    "boolean": 820,
    "selection": 1000,
}
LANE_H = {
    "ingest": 110,
    "numerical": 240,
    "categorical": 200,
    "boolean": 130,
    "selection": 120,
}

def esc(s: str) -> str:
    """Return *s* with the XML text-node metacharacters ``&``, ``<``, ``>`` escaped.

    The ampersand is handled first so the entities produced for ``<`` and
    ``>`` are not escaped a second time. Quotes are left untouched.
    """
    escaped = s
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        escaped = escaped.replace(raw, entity)
    return escaped

# `parts` accumulates SVG markup fragments in paint order; the drawing helpers
# below append to it and the fragments are concatenated when the file is written.
# It starts with: the root <svg> element, shared <defs> (card drop shadow and
# arrowhead marker), and the title/subtitle group.
parts = [
    f'<svg xmlns="http://www.w3.org/2000/svg" width="{W}" height="{H}" viewBox="0 0 {W} {H}" style="background:{PALETTE["bg"]};">',
    f"""
  <defs>
    <filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
      <feDropShadow dx="0" dy="1" stdDeviation="2" flood-color="#000" flood-opacity="0.08"/>
    </filter>
    <marker id="arrow" viewBox="0 0 10 10" refX="8.5" refY="5" markerWidth="8" markerHeight="8" orient="auto-start-reverse">
      <path d="M 0 0 L 10 5 L 0 10 z" fill="{PALETTE["stroke"]}" />
    </marker>
  </defs>
""",
    # Title
    f'''
  <g>
    <text x="{MARGIN}" y="{LANE_Y["header"]+10}" font-family="{FONT}" font-size="28" font-weight="800" fill="{PALETTE["title"]}">Feature Engineering Pipeline</text>
    <text x="{MARGIN}" y="{LANE_Y["header"]+40}" font-family="{FONT}" font-size="14" fill="{PALETTE["muted"]}">Pastel scientific style • with math notation and improved spacing</text>
  </g>
''',
]

def lane(y, h, label, fill):
    """Append a full-width swim-lane background and its caption to ``parts``.

    The lane is a rounded rectangle inset by MARGIN on both sides, starting at
    vertical offset *y* with height *h* and painted with *fill* at 70% opacity.
    *label* is XML-escaped and drawn near the lane's top-left corner.
    """
    lane_width = W - 2 * MARGIN
    caption_x = MARGIN + 12
    caption_y = y + 26
    background = (
        f'<rect x="{MARGIN}" y="{y}" width="{lane_width}" height="{h}" rx="{RX}" '
        f'fill="{fill}" stroke="{PALETTE["stroke"]}" opacity="0.7"/>'
    )
    caption = (
        f'<text x="{caption_x}" y="{caption_y}" font-family="{FONT}" font-size="14" '
        f'fill="{PALETTE["lane_lbl"]}" font-weight="700">{esc(label)}</text>'
    )
    parts.extend((background, caption))

# Draw the five swim lanes top to bottom: (layout key, caption, palette key).
for _lane_key, _lane_caption, _lane_fill in (
    ("ingest", "Ingestion & Type Inference", "lane1"),
    ("numerical", "Numerical Path", "lane2"),
    ("categorical", "Categorical Path", "lane3"),
    ("boolean", "Boolean Path", "lane4"),
    ("selection", "Selection • Scaling • Output", "lane5"),
):
    lane(LANE_Y[_lane_key], LANE_H[_lane_key], _lane_caption, PALETTE[_lane_fill])

def block(x, y, w, h, title, lines=None, fill="#FFF", dashed=False, small=False):
    """Append a rounded, drop-shadowed card to ``parts``.

    The card is a *w* x *h* rectangle at (*x*, *y*) painted with *fill*; a bold
    *title* sits on its first text row and each entry of *lines* (if any) is
    drawn below it in the muted color, 18px apart. ``dashed=True`` gives the
    border a 6,6 dash pattern; ``small=True`` selects the compact font sizes
    and a tighter first body baseline. All text is XML-escaped.
    """
    if small:
        title_size, body_size, first_baseline = 12, 11, y + 34
    else:
        title_size, body_size, first_baseline = 15, 12, y + 40

    if dashed:
        border_extra = ' stroke-dasharray="6,6"'
    else:
        border_extra = ""

    text_x = x + 16
    border_color = PALETTE["stroke"]

    parts.append(
        f'<rect x="{x}" y="{y}" width="{w}" height="{h}" rx="{RX}" fill="{fill}" '
        f'stroke="{border_color}"{border_extra} filter="url(#shadow)"/>'
    )
    parts.append(
        f'<text x="{text_x}" y="{y+24}" font-family="{FONT}" font-size="{title_size}" '
        f'font-weight="800" fill="{PALETTE["text"]}">{esc(title)}</text>'
    )

    if not lines:
        return

    for row, body in enumerate(lines):
        parts.append(
            f'<text x="{text_x}" y="{first_baseline + row*18}" font-family="{FONT}" '
            f'font-size="{body_size}" fill="{PALETTE["muted"]}">{esc(body)}</text>'
        )

def arrow(x1, y1, x2, y2):
    """Append a straight 2px line from (x1, y1) to (x2, y2) ending in the arrowhead marker."""
    segment = (
        f'<line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}" '
        f'stroke="{PALETTE["stroke"]}" stroke-width="2" marker-end="url(#arrow)"/>'
    )
    parts.append(segment)

def v_connector(x, y1, y2):
    """Append a vertical 2px path at *x* running from *y1* to *y2*, with an arrowhead at *y2*."""
    path_data = f"M {x} {y1} L {x} {y2}"
    connector = (
        f'<path d="{path_data}" fill="none" '
        f'stroke="{PALETTE["stroke"]}" stroke-width="2" marker-end="url(#arrow)"/>'
    )
    parts.append(connector)

# ----------------------------
# Ingestion
# ----------------------------
# Two cards (input data -> type inference) followed by a split point from which
# vertical connectors fan out to the numerical, categorical and boolean lanes.
x0 = MARGIN + 16   # left edge of the first card column
b_w = COL_W        # standard card width, reused by every later section
b_h = 96           # height of the ingestion cards

block(x0, LANE_Y["ingest"]+36, b_w, b_h, "Input Data X, y",
      ["X ∈ ℝ^{n×p}, y ∈ ℝ^{n}", "NaNs and mixed dtypes handled"], fill="#FFFFFF")

block(x0 + (b_w + GAP_X), LANE_Y["ingest"]+36, b_w, b_h, "Type Inference",
      ["numerical / categorical / datetime / boolean", "robust dtype detection"], fill="#FFFFFF")

# Input card's right edge -> Type Inference card's left edge.
arrow(x0 + b_w, LANE_Y["ingest"]+36 + b_h/2, x0 + b_w + GAP_X, LANE_Y["ingest"]+36 + b_h/2)

# Split point: 16px past where a third card column would start, at the cards'
# vertical midline.
split_x = x0 + (b_w + GAP_X)*2 + 16
split_y = LANE_Y["ingest"]+36 + b_h/2
# Start at the Type Inference card's right edge (card x + b_w), like every
# other card-to-card arrow; starting at the next column's x left the arrow
# detached from the card by GAP_X.
arrow(x0 + (b_w + GAP_X) + b_w, split_y, split_x, split_y)
# One connector per downstream lane, each ending 10px above that lane's top.
v_connector(split_x, split_y, LANE_Y["numerical"]-10)
v_connector(split_x, split_y, LANE_Y["categorical"]-10)
v_connector(split_x, split_y, LANE_Y["boolean"]-10)

# ----------------------------
# Numerical Path
# ----------------------------
# Layout of this lane:
#   row 1 (y = ny, 120px tall):  six cards in columns 0..5
#       Math Transforms -> Smart Interactions -> Adaptive Binning
#       -> Corr Removal -> Imputation -> Scaling
#   row 2 (y = ny2, 112px tall): three cards under columns 0..2
#   Autoencoder card: under the Scaling card (column 5)
# Column i starts at nx + i*(b_w + GAP_X), so row 1 ends at
# nx + 5*(b_w + GAP_X) + b_w; the canvas width W must be at least that for the
# right-most cards to be inside the viewBox.
nx = MARGIN + 16
ny = LANE_Y["numerical"] + 36

block(nx, ny, b_w, 120, "Math Transforms",
      ["log(x+1), √x, 1/x", "cyclical time encodings", "domain-safe guards"],
      fill=PALETTE["num1"])

block(nx + (b_w + GAP_X), ny, b_w, 120, "Smart Interactions",
      ["x+y, x·y, x/y, x−y, x²", "XGBoost screening", "≤ 50 features"],
      fill=PALETTE["num2"])

block(nx + 2*(b_w + GAP_X), ny, b_w, 120, "Adaptive Binning",
      ["KBins (quantile, B=5)", "skip constant/ill-posed"],
      fill=PALETTE["num3"])

# Numerical row 2
# Starts GAP_Y below the bottom edge of the 120px-tall row-1 cards.
ny2 = ny + 120 + GAP_Y
block(nx, ny2, b_w, 112, "Clustering (KMeans)",
      ["variance-ranked features", "cluster_id, dist_min, dist_mean"],
      fill=PALETTE["num4"])

block(nx + (b_w + GAP_X), ny2, b_w, 112, "Row-wise Statistics",
      ["mean, std, min, max, median, range, skew", "nonnull count, null ratio"],
      fill=PALETTE["num5"])

block(nx + 2*(b_w + GAP_X), ny2, b_w, 112, "Fourier Features",
      ["FFT → top-|X(f)|", "sin(2πft), cos(2πft)"],
      fill=PALETTE["num6"])

# Row arrows
# Horizontal arrows along the vertical midline of row 1 (120 / 2 = 60),
# each from one card's right edge to the next card's left edge.
y_mid = ny + 60
arrow(nx + b_w, y_mid, nx + b_w + GAP_X, y_mid)
arrow(nx + (b_w + GAP_X) + b_w, y_mid, nx + 2*(b_w + GAP_X), y_mid)

# Vertical arrows from the bottom-center of each row-1 card (columns 0..2) to
# the top of the row-2 card beneath it.
# NOTE: `sx` is a module-level name here; the Selection section further down
# reassigns it for its own use.
for i in range(3):
    sx = nx + i*(b_w + GAP_X) + b_w/2
    arrow(sx, ny + 120, sx, ny2)

# Corr/Impute/Scale and AE
# nx_end is the left edge of column 3; these cards continue row 1 to the right.
nx_end = nx + 3*(b_w + GAP_X)
# Dashed border marks the card as optional/conditional (see the legend).
block(nx_end, ny, b_w, 120, "Corr Removal",
      ["|corr(xᵢ,xⱼ)|>τ ⇒ drop lower var", "τ = 0.95"], fill=PALETTE["corr"], dashed=True)
arrow(nx + 2*(b_w + GAP_X) + b_w, y_mid, nx_end, y_mid)

block(nx_end + (b_w + GAP_X), ny, b_w, 120, "Imputation",
      ["median/mean/mode by skew", "per-feature"], fill=PALETTE["imp"])
arrow(nx_end + b_w, y_mid, nx_end + (b_w + GAP_X), y_mid)

block(nx_end + 2*(b_w + GAP_X), ny, b_w, 120, "Scaling",
      ["QuantileTransformer → 𝒩(0,1)", "fallback: standardize"], fill=PALETTE["scale"])
arrow(nx_end + (b_w + GAP_X) + b_w, y_mid, nx_end + 2*(b_w + GAP_X), y_mid)

# Autoencoder card: 28px below the bottom of the Scaling card (ny + 120),
# 96px tall, dashed, compact fonts. The connector runs from the Scaling card's
# bottom-center to the top of this card. `ae_y` is read again by the Selection
# section to locate this card's bottom edge.
ae_y = ny + 148
block(nx_end + 2*(b_w + GAP_X), ae_y, b_w, 96, "Autoencoder Latents z",
      ["MaskedTabAE, d_token, heads", "z ∈ ℝ^k"], fill=PALETTE["ae"], dashed=True, small=True)
v_connector(nx_end + 2*(b_w + GAP_X) + b_w/2, ny + 120, ae_y)

# ----------------------------
# Categorical
# ----------------------------
# Four 106px-tall cards in one row, chained left to right by arrows drawn
# along the row's vertical midline.
cx = MARGIN + 16
cy = LANE_Y["categorical"] + 36
cy_mid = cy + 53

# (title, body lines, palette key) for each card, in column order.
_cat_cards = (
    ("Rare Category Handling", ["p<1% → OTHER", "mode imputation"], "cat1"),
    ("Target Encoding", ["μ_c = (n_c·ȳ_c + α·ȳ)/(n_c+α)", "high-card only"], "cat2"),
    ("One-Hot Encoding", ["ignore unknown", "drop if binary"], "cat3"),
    ("Frequency Encoding", ["p(x=c) per feature"], "cat4"),
)
_cat_step = b_w + GAP_X   # distance between the left edges of adjacent cards

# All cards first, then all arrows, so the paint order stays cards-before-arrows.
for _cat_col, (_cat_title, _cat_body, _cat_fill) in enumerate(_cat_cards):
    block(cx + _cat_col * _cat_step, cy, b_w, 106, _cat_title, _cat_body,
          fill=PALETTE[_cat_fill])

for _cat_col in range(len(_cat_cards) - 1):
    _cat_left = cx + _cat_col * _cat_step
    arrow(_cat_left + b_w, cy_mid, _cat_left + _cat_step, cy_mid)

# ----------------------------
# Boolean
# ----------------------------
# A single 96px-tall card in the first column of the boolean lane.
bx = MARGIN + 16
by = LANE_Y["boolean"] + 36
block(
    x=bx,
    y=by,
    w=b_w,
    h=96,
    title="Boolean Handling",
    lines=["fillna(mode) → {0,1}"],
    fill=PALETTE["bool"],
)

# ----------------------------
# Selection & Output
# ----------------------------
# One row of 100px-tall cards:
#   AdaptiveMI -> SHAP Pruning -> Concatenate Selected Features -> Output
# The two selection cards are dashed (optional/conditional, see the legend).
sx = MARGIN + 16
sy = LANE_Y["selection"] + 30

block(sx, sy, b_w, 100, "AdaptiveMI",
      ["I(x; y) adaptive estimator", "keep MI > 10⁻³"], fill=PALETTE["sel1"], dashed=True)

block(sx + (b_w + GAP_X), sy, b_w, 100, "SHAP Pruning",
      ["mean |ϕ_j| > θ", "θ = 10⁻³"], fill=PALETTE["sel2"], dashed=True)

# The join card is 110px wider than a standard card to fit its caption.
join_w = b_w + 110
join_x = sx + 2*(b_w + GAP_X)
block(join_x, sy, join_w, 100, "Concatenate Selected Features",
      ["numerical + z + categorical + boolean → X_final"], fill=PALETTE["join"])

out_x = join_x + (join_w + GAP_X)
block(out_x, sy, b_w, 100, "Output",
      ["X_final ∈ ℝ^{n×d}", "clean • encoded • scaled"], fill=PALETTE["out"])

# Connections to selection
# Numerical path: from the bottom-center of the Autoencoder card (96px tall,
# last column of the numerical lane) to the top-center of the AdaptiveMI card.
# The arrow ends on the card's top edge (sy) rather than its center so the
# line and arrowhead, which are painted after the card, do not cover its text.
numerical_exit_x = nx_end + 2*(b_w + GAP_X) + b_w/2
numerical_exit_y = ae_y + 96
arrow(numerical_exit_x, numerical_exit_y, sx + b_w/2, sy)

# Categorical path: from the right edge of the last categorical card, at its
# vertical midline, to the left edge of the join card.
categorical_exit_x = cx + 3*(b_w + GAP_X) + b_w
categorical_exit_y = cy + 53
arrow(categorical_exit_x, categorical_exit_y, join_x, sy + 50)

# Boolean path: from the right edge of the boolean card (midline of its 96px
# height) to the left edge of the join card.
arrow(bx + b_w, by + 48, join_x, sy + 50)

# Left-to-right chain inside the selection row, along the cards' midline.
arrow(sx + b_w, sy + 50, sx + (b_w + GAP_X), sy + 50)
arrow(sx + (b_w + GAP_X) + b_w, sy + 50, join_x, sy + 50)
arrow(join_x + join_w, sy + 50, out_x, sy + 50)

# ----------------------------
# Legend
# ----------------------------
# A 380x108 card in the top-right corner explaining the three visual
# conventions: filled card, dashed card, arrow. Child coordinates are relative
# to the translated group origin (lgx, lgy).
lgx = W - MARGIN - 400
lgy = LANE_Y["header"] + 6
parts.extend([
    f'<g transform="translate({lgx},{lgy})">',
    # Legend frame and heading
    f'<rect x="0" y="0" width="380" height="108" rx="{RX}" fill="#FFFFFF" stroke="{PALETTE["stroke"]}" filter="url(#shadow)"/>',
    f'<text x="14" y="24" font-family="{FONT}" font-size="13" font-weight="800" fill="{PALETTE["text"]}">Legend</text>',
    # Sample: category-colored card
    f'<rect x="14" y="34" width="28" height="14" rx="4" fill="{PALETTE["num1"]}" stroke="{PALETTE["stroke"]}"/>',
    f'<text x="50" y="46" font-family="{FONT}" font-size="12" fill="{PALETTE["muted"]}">Block (category-colored)</text>',
    # Sample: dashed (optional/conditional) card
    f'<rect x="14" y="56" width="28" height="14" rx="4" fill="#FFF" stroke="{PALETTE["stroke"]}" stroke-dasharray="6,6"/>',
    f'<text x="50" y="68" font-family="{FONT}" font-size="12" fill="{PALETTE["muted"]}">Dashed = optional/conditional</text>',
    # Sample: flow arrow
    f'<line x1="14" y1="84" x2="42" y2="84" stroke="{PALETTE["stroke"]}" stroke-width="2" marker-end="url(#arrow)"/>',
    f'<text x="50" y="88" font-family="{FONT}" font-size="12" fill="{PALETTE["muted"]}">Arrow = feature/data flow</text>',
    '</g>',
])

# Close the root element, concatenate every fragment in paint order and write
# the document to SVG_PATH as UTF-8 (the markup contains non-ASCII symbols).
parts.append("</svg>")
svg_document = "".join(parts)

with open(SVG_PATH, mode="w", encoding="utf-8") as out_file:
    out_file.write(svg_document)

print(f"SVG diagram saved to {SVG_PATH}")


SVG diagram saved to feature_engineer_pipeline_colored.svg
