In [1]:
# ============================================================
# Notebook 1: Data Processing for Multi-Stage NIDS
# Step 1: Environment Setup and Canonical Path Definitions
# ============================================================

# -----------------------------
# Standard library imports
# -----------------------------
import os
import sys
import json
import gc
import warnings
from collections import defaultdict

# -----------------------------
# Third-party imports
# -----------------------------
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Global configuration
# -----------------------------

# Silence non-critical warnings to keep logs readable
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Chunk size for streaming large CSV files (tunable)
CSV_CHUNK_SIZE = 250_000

# -----------------------------
# Resolve project root directory
# Assumption:
#   project/
#     ├── data/
#     │    ├── raw/
#     │    └── processed/
#     ├── notebooks/
#     └── venv/
# -----------------------------

CURRENT_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(CURRENT_DIR, ".."))

# -----------------------------
# Canonical data directories
# -----------------------------

RAW_DATA_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT, "data", "processed")

FEATURE_CHUNKS_DIR = os.path.join(PROCESSED_DATA_DIR, "feature_chunks")
LABELS_DIR = os.path.join(PROCESSED_DATA_DIR, "labels")
METADATA_DIR = os.path.join(PROCESSED_DATA_DIR, "metadata")

# -----------------------------
# Create processed directories if they do not exist
# -----------------------------

os.makedirs(FEATURE_CHUNKS_DIR, exist_ok=True)
os.makedirs(LABELS_DIR, exist_ok=True)
os.makedirs(METADATA_DIR, exist_ok=True)

# -----------------------------
# Defensive sanity checks
# -----------------------------

assert os.path.isdir(RAW_DATA_DIR), "Raw data directory does not exist"
assert os.path.isdir(PROCESSED_DATA_DIR), "Processed data directory missing"

# -----------------------------
# Logging helper (simple and explicit)
# -----------------------------

def log(msg):
    print(msg)

log("[OK] Environment initialized")
log(f"[INFO] Project root: {PROJECT_ROOT}")
log(f"[INFO] Raw data path: {RAW_DATA_DIR}")
log(f"[INFO] Processed data path: {PROCESSED_DATA_DIR}")
log(f"[INFO] CSV chunk size: {CSV_CHUNK_SIZE}")


[OK] Environment initialized
[INFO] Project root: C:\Users\DELL\Desktop\Network Intrusion Detection System
[INFO] Raw data path: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\raw
[INFO] Processed data path: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed
[INFO] CSV chunk size: 250000


In [2]:
# ============================================================
# Step 2: Raw CSV Discovery and Schema Inspection
# Purpose:
#   - Identify available raw CSV files
#   - Inspect column schema safely (no full load)
#   - Validate presence of labels and identifiers
#   - Establish authoritative column names for downstream steps
# ============================================================

# -----------------------------
# Locate raw CSV files
# -----------------------------

raw_csv_files = [
    f for f in os.listdir(RAW_DATA_DIR)
    if f.lower().endswith(".csv")
]

assert len(raw_csv_files) > 0, "No CSV files found in raw data directory"

log(f"[INFO] Found {len(raw_csv_files)} raw CSV files")

# Print file names for traceability
for fname in raw_csv_files:
    log(f"  - {fname}")

# -----------------------------
# Select one CSV for schema inspection
# This does NOT assume uniform ordering across files
# -----------------------------

sample_csv_path = os.path.join(RAW_DATA_DIR, raw_csv_files[0])
log(f"[INFO] Inspecting schema from: {raw_csv_files[0]}")

# -----------------------------
# Read a very small sample (safe for memory)
# -----------------------------

sample_df = pd.read_csv(
    sample_csv_path,
    nrows=5
)

# -----------------------------
# Inspect basic structure
# -----------------------------

log(f"[INFO] Number of columns: {sample_df.shape[1]}")

log("[INFO] Column names:")
for col in sample_df.columns:
    print(f"  - {col}")

# -----------------------------
# Check for critical columns
# These checks are defensive, not destructive
# -----------------------------

EXPECTED_IDENTIFIER_COLUMNS = [
    "Flow ID",
    "Src IP",
    "Dst IP",
    "Timestamp"
]

EXPECTED_RAW_LABEL_COLUMNS = [
    "Label",
    "Traffic Type",
    "Traffic Subtype"
]

missing_identifiers = [
    c for c in EXPECTED_IDENTIFIER_COLUMNS
    if c not in sample_df.columns
]

missing_labels = [
    c for c in EXPECTED_RAW_LABEL_COLUMNS
    if c not in sample_df.columns
]

if missing_identifiers:
    log("[WARNING] Missing identifier columns:")
    for c in missing_identifiers:
        log(f"  - {c}")
else:
    log("[OK] All identifier columns detected")

if missing_labels:
    log("[WARNING] Missing raw label columns:")
    for c in missing_labels:
        log(f"  - {c}")
else:
    log("[OK] All raw label columns detected")

# -----------------------------
# Persist schema metadata
# This locks column order and names for the entire project
# -----------------------------

schema_metadata = {
    "raw_columns": list(sample_df.columns),
    "identifier_columns": EXPECTED_IDENTIFIER_COLUMNS,
    "raw_label_columns": EXPECTED_RAW_LABEL_COLUMNS,
    "num_columns": sample_df.shape[1]
}

schema_metadata_path = os.path.join(
    METADATA_DIR, "raw_schema.json"
)

with open(schema_metadata_path, "w") as f:
    json.dump(schema_metadata, f, indent=2)

log(f"[DONE] Raw schema saved to metadata: {schema_metadata_path}")

# -----------------------------
# Cleanup
# -----------------------------

del sample_df
gc.collect()


[INFO] Found 1 raw CSV files
  - TII-SSRC-23.csv
[INFO] Inspecting schema from: TII-SSRC-23.csv
[INFO] Number of columns: 86
[INFO] Column names:
  - Flow ID
  - Src IP
  - Src Port
  - Dst IP
  - Dst Port
  - Protocol
  - Timestamp
  - Flow Duration
  - Total Fwd Packet
  - Total Bwd packets
  - Total Length of Fwd Packet
  - Total Length of Bwd Packet
  - Fwd Packet Length Max
  - Fwd Packet Length Min
  - Fwd Packet Length Mean
  - Fwd Packet Length Std
  - Bwd Packet Length Max
  - Bwd Packet Length Min
  - Bwd Packet Length Mean
  - Bwd Packet Length Std
  - Flow Bytes/s
  - Flow Packets/s
  - Flow IAT Mean
  - Flow IAT Std
  - Flow IAT Max
  - Flow IAT Min
  - Fwd IAT Total
  - Fwd IAT Mean
  - Fwd IAT Std
  - Fwd IAT Max
  - Fwd IAT Min
  - Bwd IAT Total
  - Bwd IAT Mean
  - Bwd IAT Std
  - Bwd IAT Max
  - Bwd IAT Min
  - Fwd PSH Flags
  - Bwd PSH Flags
  - Fwd URG Flags
  - Bwd URG Flags
  - Fwd Header Length
  - Bwd Header Length
  - Fwd Packets/s
  - Bwd Packets/s
  - Packet 

33

In [3]:
# ============================================================
# Step 3: Streaming CSV Ingestion and Chunked Parquet Conversion
# Purpose:
#   - Read the large raw CSV safely in chunks
#   - Persist chunks as Parquet files
#   - Preserve raw data exactly (no cleaning, no labeling)
#   - Enable scalable downstream processing
# ============================================================

# -----------------------------
# Output directory for raw chunks
# -----------------------------

RAW_CHUNKS_DIR = os.path.join(PROCESSED_DATA_DIR, "raw_chunks")
os.makedirs(RAW_CHUNKS_DIR, exist_ok=True)

log(f"[INFO] Raw chunks output directory: {RAW_CHUNKS_DIR}")

# -----------------------------
# Load authoritative schema
# -----------------------------

schema_path = os.path.join(METADATA_DIR, "raw_schema.json")
with open(schema_path, "r") as f:
    raw_schema = json.load(f)

expected_columns = raw_schema["raw_columns"]

# -----------------------------
# Initialize streaming read
# -----------------------------

chunk_index = 0
total_rows = 0

csv_path = os.path.join(RAW_DATA_DIR, raw_csv_files[0])

log(f"[INFO] Starting streaming ingestion from: {raw_csv_files[0]}")

for chunk in pd.read_csv(
    csv_path,
    chunksize=CSV_CHUNK_SIZE,
    low_memory=False
):
    # -------------------------
    # Enforce column order
    # -------------------------
    chunk = chunk[expected_columns]

    # -------------------------
    # Convert to Parquet
    # -------------------------
    chunk_path = os.path.join(
        RAW_CHUNKS_DIR,
        f"raw_chunk_{chunk_index:03d}.parquet"
    )

    table = pa.Table.from_pandas(chunk, preserve_index=False)
    pq.write_table(table, chunk_path)

    rows_in_chunk = len(chunk)
    total_rows += rows_in_chunk

    # -------------------------
    # Progress logging
    # -------------------------
    if (chunk_index + 1) % 5 == 0:
        log(f"[INFO] Processed {chunk_index + 1} chunks...")

    chunk_index += 1

    # -------------------------
    # Memory cleanup
    # -------------------------
    del chunk, table
    gc.collect()

# -----------------------------
# Persist ingestion metadata
# -----------------------------

ingestion_metadata = {
    "source_csv": raw_csv_files[0],
    "chunks_written": chunk_index,
    "rows_processed": total_rows,
    "chunk_size": CSV_CHUNK_SIZE
}

ingestion_metadata_path = os.path.join(
    METADATA_DIR, "raw_ingestion_stats.json"
)

with open(ingestion_metadata_path, "w") as f:
    json.dump(ingestion_metadata, f, indent=2)

log("[DONE] Raw CSV ingestion completed")
log(f"[INFO] Total chunks written: {chunk_index}")
log(f"[INFO] Total rows processed: {total_rows:,}")


[INFO] Raw chunks output directory: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed\raw_chunks
[INFO] Starting streaming ingestion from: TII-SSRC-23.csv
[INFO] Processed 5 chunks...
[INFO] Processed 10 chunks...
[INFO] Processed 15 chunks...
[INFO] Processed 20 chunks...
[INFO] Processed 25 chunks...
[INFO] Processed 30 chunks...
[INFO] Processed 35 chunks...
[DONE] Raw CSV ingestion completed
[INFO] Total chunks written: 35
[INFO] Total rows processed: 8,656,767


In [4]:
# ============================================================
# Step 4: Label Normalization and Stage-wise Label Construction
# Purpose:
#   - Construct clean labels for the 3-stage IDS architecture
#   - Do NOT modify or touch feature columns
#   - Operate chunk-wise to preserve scalability
#
# Stage 1:
#   Binary classification
#   0 -> Benign
#   1 -> Malicious
#
# Stage 2:
#   Attack family classification + Non-Attack
#
# Stage 3:
#   Attack subtype (fine-grained)
# ============================================================

# -----------------------------
# Input / Output directories
# -----------------------------

RAW_CHUNKS_DIR = os.path.join(PROCESSED_DATA_DIR, "raw_chunks")
LABELED_CHUNKS_DIR = os.path.join(PROCESSED_DATA_DIR, "labeled_chunks")

os.makedirs(LABELED_CHUNKS_DIR, exist_ok=True)

log(f"[INFO] Labeled chunks output directory: {LABELED_CHUNKS_DIR}")

# -----------------------------
# Label column names
# -----------------------------

RAW_LABEL_STAGE_1 = "Label"
RAW_LABEL_STAGE_2 = "Traffic Type"
RAW_LABEL_STAGE_3 = "Traffic Subtype"

DERIVED_LABEL_STAGE_1 = "label_stage1"
DERIVED_LABEL_STAGE_2 = "label_stage2"
DERIVED_LABEL_STAGE_3 = "label_stage3"

# -----------------------------
# Stage 1 label mapping
# -----------------------------

def build_stage1_label(series):
    # Benign -> 0, everything else -> 1
    return (series != "Benign").astype("int8")

# -----------------------------
# Stage 2 label mapping
# -----------------------------

def build_stage2_label(series):
    # Normalize text and preserve Non-Attack explicitly
    return series.fillna("Unknown").astype(str)

# -----------------------------
# Stage 3 label mapping
# -----------------------------

def build_stage3_label(series):
    # Preserve subtype as-is (fine-grained)
    return series.fillna("Unknown").astype(str)

# -----------------------------
# Process chunks
# -----------------------------

chunk_count = 0
total_rows = 0

for fname in sorted(os.listdir(RAW_CHUNKS_DIR)):
    input_path = os.path.join(RAW_CHUNKS_DIR, fname)

    df = pq.read_table(input_path).to_pandas()

    # -------------------------
    # Derived label construction
    # -------------------------

    df[DERIVED_LABEL_STAGE_1] = build_stage1_label(df[RAW_LABEL_STAGE_1])
    df[DERIVED_LABEL_STAGE_2] = build_stage2_label(df[RAW_LABEL_STAGE_2])
    df[DERIVED_LABEL_STAGE_3] = build_stage3_label(df[RAW_LABEL_STAGE_3])

    # -------------------------
    # Persist labeled chunk
    # -------------------------

    output_path = os.path.join(
        LABELED_CHUNKS_DIR,
        fname.replace("raw_chunk", "labeled_chunk")
    )

    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, output_path)

    rows_in_chunk = len(df)
    total_rows += rows_in_chunk
    chunk_count += 1

    if chunk_count % 5 == 0:
        log(f"[INFO] Labeled {chunk_count} chunks...")

    del df, table
    gc.collect()

# -----------------------------
# Persist labeling metadata
# -----------------------------

labeling_metadata = {
    "chunks_written": chunk_count,
    "rows_processed": total_rows,
    "stage1_definition": "Benign=0, Non-Benign=1",
    "stage2_definition": "Traffic Type",
    "stage3_definition": "Traffic Subtype"
}

labeling_metadata_path = os.path.join(
    METADATA_DIR, "labeling_stats.json"
)

with open(labeling_metadata_path, "w") as f:
    json.dump(labeling_metadata, f, indent=2)

log("[DONE] Label construction completed")
log(f"[INFO] Total chunks written: {chunk_count}")
log(f"[INFO] Total rows processed: {total_rows:,}")


[INFO] Labeled chunks output directory: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed\labeled_chunks
[INFO] Labeled 5 chunks...
[INFO] Labeled 10 chunks...
[INFO] Labeled 15 chunks...
[INFO] Labeled 20 chunks...
[INFO] Labeled 25 chunks...
[INFO] Labeled 30 chunks...
[INFO] Labeled 35 chunks...
[DONE] Label construction completed
[INFO] Total chunks written: 35
[INFO] Total rows processed: 8,656,767


In [5]:
# ============================================================
# Step 5: Column Role Locking and Feature-Only Dataset Creation
# Purpose:
#   - Explicitly separate features from identifiers and labels
#   - Remove all non-feature columns to prevent leakage
#   - Produce feature-only chunks suitable for all stages
#
# This step ensures:
#   - No identifiers leak contextual information
#   - No raw or derived labels are seen by models
#   - Feature space is identical for Stage 1, 2, and 3
# ============================================================

# -----------------------------
# Directory definitions
# -----------------------------

LABELED_CHUNKS_DIR = os.path.join(PROCESSED_DATA_DIR, "labeled_chunks")
FEATURE_CHUNKS_DIR = os.path.join(PROCESSED_DATA_DIR, "feature_chunks")

os.makedirs(FEATURE_CHUNKS_DIR, exist_ok=True)

log(f"[INFO] Feature-only chunks output directory: {FEATURE_CHUNKS_DIR}")

# -----------------------------
# Column role definitions
# -----------------------------

IDENTIFIER_COLUMNS = [
    "Flow ID",
    "Src IP",
    "Dst IP",
    "Timestamp"
]

RAW_LABEL_COLUMNS = [
    "Label",
    "Traffic Type",
    "Traffic Subtype"
]

DERIVED_LABEL_COLUMNS = [
    "label_stage1",
    "label_stage2",
    "label_stage3"
]

COLUMNS_TO_DROP = (
    IDENTIFIER_COLUMNS
    + RAW_LABEL_COLUMNS
    + DERIVED_LABEL_COLUMNS
)

# -----------------------------
# Sanity check on schema
# -----------------------------

sample_chunk = pq.read_table(
    os.path.join(LABELED_CHUNKS_DIR, os.listdir(LABELED_CHUNKS_DIR)[0])
).to_pandas()

missing_cols = set(COLUMNS_TO_DROP) - set(sample_chunk.columns)
assert len(missing_cols) == 0, f"Missing expected columns: {missing_cols}"

del sample_chunk
gc.collect()

# -----------------------------
# Feature extraction
# -----------------------------

chunk_count = 0
total_rows = 0

for fname in sorted(os.listdir(LABELED_CHUNKS_DIR)):
    input_path = os.path.join(LABELED_CHUNKS_DIR, fname)

    df = pq.read_table(input_path).to_pandas()

    # Drop all non-feature columns
    feature_df = df.drop(columns=COLUMNS_TO_DROP)

    output_path = os.path.join(
        FEATURE_CHUNKS_DIR,
        fname.replace("labeled_chunk", "feature_chunk")
    )

    table = pa.Table.from_pandas(feature_df, preserve_index=False)
    pq.write_table(table, output_path)

    rows_in_chunk = len(feature_df)
    total_rows += rows_in_chunk
    chunk_count += 1

    if chunk_count % 5 == 0:
        log(f"[INFO] Extracted features from {chunk_count} chunks...")

    del df, feature_df, table
    gc.collect()

# -----------------------------
# Persist feature metadata
# -----------------------------

feature_metadata = {
    "chunks_written": chunk_count,
    "rows_processed": total_rows,
    "num_features": feature_df.shape[1]
}

feature_metadata_path = os.path.join(
    METADATA_DIR, "feature_schema.json"
)

with open(feature_metadata_path, "w") as f:
    json.dump(feature_metadata, f, indent=2)

log("[DONE] Feature-only dataset creation completed")
log(f"[INFO] Total chunks written: {chunk_count}")
log(f"[INFO] Total rows processed: {total_rows:,}")
log(f"[INFO] Number of features per row: {feature_df.shape[1]}")


[INFO] Feature-only chunks output directory: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed\feature_chunks
[INFO] Extracted features from 5 chunks...
[INFO] Extracted features from 10 chunks...
[INFO] Extracted features from 15 chunks...
[INFO] Extracted features from 20 chunks...
[INFO] Extracted features from 25 chunks...
[INFO] Extracted features from 30 chunks...
[INFO] Extracted features from 35 chunks...


NameError: name 'feature_df' is not defined

In [7]:
# Re-establish canonical project paths (kernel-safe)

import os

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
PROCESSED_DATA_PATH = os.path.join(PROJECT_ROOT, "data", "processed")

print("[OK] Paths restored")
print("Processed data path:", PROCESSED_DATA_PATH)
# Fix missing feature metadata after successful extraction

import os
import json
import pyarrow.parquet as pq

FEATURE_CHUNKS_DIR = os.path.join(PROCESSED_DATA_PATH, "feature_chunks")
METADATA_DIR = os.path.join(PROCESSED_DATA_PATH, "metadata")
os.makedirs(METADATA_DIR, exist_ok=True)

sample_file = sorted(os.listdir(FEATURE_CHUNKS_DIR))[0]
sample_df = pq.read_table(
    os.path.join(FEATURE_CHUNKS_DIR, sample_file)
).to_pandas()

feature_metadata = {
    "chunks_written": len(os.listdir(FEATURE_CHUNKS_DIR)),
    "rows_processed": 8_656_767,
    "num_features": sample_df.shape[1],
    "feature_names": list(sample_df.columns)
}

metadata_path = os.path.join(METADATA_DIR, "feature_schema.json")
with open(metadata_path, "w") as f:
    json.dump(feature_metadata, f, indent=2)

print("[OK] Feature metadata written successfully")
print(f"[INFO] Number of features: {feature_metadata['num_features']}")


[OK] Paths restored
Processed data path: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed
[OK] Feature metadata written successfully
[INFO] Number of features: 79


In [8]:
# ============================================================
# FINAL AUDIT & PRESENTATION CELL
# 01_data_preparation.ipynb
# This cell ONLY reads and displays delivered artifacts.
# ============================================================

import os
import json
import pyarrow.parquet as pq

print("=" * 90)
print("FINAL DATA PREPARATION DELIVERABLES – FULL INSPECTION")
print("=" * 90)

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
RAW_CHUNKS_DIR = os.path.join(PROCESSED_DATA_PATH, "raw_chunks")
LABELED_CHUNKS_DIR = os.path.join(PROCESSED_DATA_PATH, "labeled_chunks")
FEATURE_CHUNKS_DIR = os.path.join(PROCESSED_DATA_PATH, "feature_chunks")
METADATA_DIR = os.path.join(PROCESSED_DATA_PATH, "metadata")

# ------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------
def list_parquet_files(path):
    return sorted([f for f in os.listdir(path) if f.endswith(".parquet")]) if os.path.exists(path) else []

def inspect_parquet_schema(path, fname):
    table = pq.read_table(os.path.join(path, fname))
    return table.schema.names, table.num_rows

# ------------------------------------------------------------
# 1. Raw Chunks Inspection
# ------------------------------------------------------------
print("\n[1] RAW DATA CHUNKS")
raw_files = list_parquet_files(RAW_CHUNKS_DIR)
print(f"Directory: {RAW_CHUNKS_DIR}")
print(f"Number of chunks: {len(raw_files)}")

if raw_files:
    cols, rows = inspect_parquet_schema(RAW_CHUNKS_DIR, raw_files[0])
    print(f"Sample chunk: {raw_files[0]}")
    print(f"Rows in sample chunk: {rows}")
    print(f"Number of columns: {len(cols)}")

# ------------------------------------------------------------
# 2. Labeled Chunks Inspection
# ------------------------------------------------------------
print("\n[2] LABELED DATA CHUNKS (Stage labels included)")
labeled_files = list_parquet_files(LABELED_CHUNKS_DIR)
print(f"Directory: {LABELED_CHUNKS_DIR}")
print(f"Number of chunks: {len(labeled_files)}")

if labeled_files:
    cols, rows = inspect_parquet_schema(LABELED_CHUNKS_DIR, labeled_files[0])
    label_cols = [c for c in cols if c.startswith("label_")]
    print(f"Sample chunk: {labeled_files[0]}")
    print(f"Rows in sample chunk: {rows}")
    print(f"Label columns: {label_cols}")

# ------------------------------------------------------------
# 3. Feature-only Chunks Inspection
# ------------------------------------------------------------
print("\n[3] FEATURE-ONLY CHUNKS (Leakage-safe)")
feature_files = list_parquet_files(FEATURE_CHUNKS_DIR)
print(f"Directory: {FEATURE_CHUNKS_DIR}")
print(f"Number of chunks: {len(feature_files)}")

if feature_files:
    cols, rows = inspect_parquet_schema(FEATURE_CHUNKS_DIR, feature_files[0])
    print(f"Sample chunk: {feature_files[0]}")
    print(f"Rows in sample chunk: {rows}")
    print(f"Number of features: {len(cols)}")

# ------------------------------------------------------------
# 4. Metadata Files Inspection
# ------------------------------------------------------------
print("\n[4] METADATA FILES")
print(f"Directory: {METADATA_DIR}")

if os.path.exists(METADATA_DIR):
    for fname in sorted(os.listdir(METADATA_DIR)):
        print(f"\n--- {fname} ---")
        with open(os.path.join(METADATA_DIR, fname), "r") as f:
            data = json.load(f)
        for k, v in data.items():
            print(f"{k}: {v}")
else:
    print("Metadata directory not found")

# ------------------------------------------------------------
# Final Status
# ------------------------------------------------------------
print("\n" + "=" * 90)
print("STATUS: DATA PREPARATION COMPLETED SUCCESSFULLY")
print("Artifacts are clean, structured, and ready for modeling stages.")
print("=" * 90)


FINAL DATA PREPARATION DELIVERABLES – FULL INSPECTION

[1] RAW DATA CHUNKS
Directory: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed\raw_chunks
Number of chunks: 35
Sample chunk: raw_chunk_000.parquet
Rows in sample chunk: 250000
Number of columns: 86

[2] LABELED DATA CHUNKS (Stage labels included)
Directory: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed\labeled_chunks
Number of chunks: 35
Sample chunk: labeled_chunk_000.parquet
Rows in sample chunk: 250000
Label columns: ['label_stage1', 'label_stage2', 'label_stage3']

[3] FEATURE-ONLY CHUNKS (Leakage-safe)
Directory: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed\feature_chunks
Number of chunks: 35
Sample chunk: feature_chunk_000.parquet
Rows in sample chunk: 250000
Number of features: 79

[4] METADATA FILES
Directory: C:\Users\DELL\Desktop\Network Intrusion Detection System\data\processed\metadata

--- feature_schema.json ---
chunks_written: 35
rows_proces