In [None]:
# --- GPT Prompt Chain for Cybersecurity EDA (OpenAI GPT-5 Reasoning) ---

import os, io, time, traceback
import pandas as pd
from dotenv import load_dotenv
from IPython.display import display, Markdown
from typing import Dict, List
from openai import OpenAI

# 1. Environment / API Setup --------------------------------------------------
# Expect an env file containing OPENAI_API_KEY
#ENV_PATH = r"/Users/sarahsetiawan/Desktop/VNFCDR-1/SarahCode/Generative_AI/OPENAI_API_KEY.env"  # <-- update if needed
ENV_PATH = r"C:\\Users\\macdo\\Github\\VNFCDR-1\\Chris Code\\Generative AI Data Cleaning\\Machine Learning\\Step 4_GPT Prompting\\OPENAI_API_KEY.env"
load_dotenv(ENV_PATH)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not found. Ensure the env file exists and key is set.")

# Instantiate OpenAI client
client = OpenAI(api_key=api_key)
print("OpenAI API client configured successfully.")

# 2. Data Load ----------------------------------------------------------------
#DATA_PATH = r"/Users/sarahsetiawan/Desktop/VNFCDR-1/SarahCode/Sample_df/Code/MachineLearning/SampleData_API/CSVs/Representative_APISample_20000_2.csv"
DATA_PATH = r"E:\\Datasets\\UNSW-NB15\\Training and Testing Sets\\UNSW_NB15_concatenated_dropped.csv"
try:
    df = pd.read_csv(DATA_PATH, low_memory=False)
except FileNotFoundError:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")
print(f"Dataset loaded successfully. Shape: {df.shape}")

# 3. Context Preparation (Token-Optimized) ------------------------------------

def build_schema_summary(dataframe: pd.DataFrame) -> str:
    info_buf = io.StringIO()
    dataframe.info(buf=info_buf)
    _ = info_buf.getvalue()  # not used directly; we generate concise version below
    rows = []
    total = len(dataframe)
    for col in dataframe.columns:
        non_null = dataframe[col].notna().sum()
        null_pct = 100 * (1 - non_null / total)
        uniq = dataframe[col].nunique(dropna=True)
        rows.append(f"{col} | {dataframe[col].dtype} | {non_null} | {null_pct:.2f}% | {uniq}")
    header = "Column | DType | Non-Null | Null% | Unique\n------ | ----- | -------- | ----- | ------"
    concise = header + "\n" + "\n".join(rows)
    return concise

RANDOM_SEED = 42
SAMPLE_ROWS = 10
schema_concise = build_schema_summary(df)
sample_md = df.sample(n=SAMPLE_ROWS, random_state=RANDOM_SEED).to_markdown(index=False)

# 4. System Instruction (kept same semantic intent) ---------------------------
system_instruction = """
You are an expert data scientist specializing in data cleaning and preparation for machine learning. 
Your task is to perform exploratory data analysis (EDA), data cleaning and preprocessing for machine learning application. 
You will do this by generating structured insights, plans, and code.

Goals:
1. Perform a sharp initial EDA on a given dataset.
2. Propose an ordered data cleaning and preprocessing plan based on the EDA and best practices.
3. Following the data cleaning plan, produce fully executable, well-commented Python code.

Constraints:
- The created code must be executable without errors using the full dataset loaded from the provided CSV file path.
- Use Markdown headings exactly as requested.
"""

# 5. Generation Config -------------------------------------------------------
MODEL_NAME = "gpt-5"  
MAX_COMPLETION_TOKENS = 32000  
TEMPERATURE = 1  

# 6. Retry + Wrapper (Chat Completions) --------------------------------------

def call_model(messages: List[Dict[str, str]], max_completion_tokens: int = MAX_COMPLETION_TOKENS, temperature: float = TEMPERATURE):
    """Wrapper for OpenAI chat completion.

    NOTE: Newer reasoning / frontier models use 'max_completion_tokens' instead of deprecated 'max_tokens'.
    Some models enforce a fixed temperature (1); earlier attempt with 0.15 caused 400 error.
    """
    return client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=temperature,
        max_completion_tokens=max_completion_tokens,
        stream=False
    )

def generate_with_retry(prompt_text: str, label: str, max_retries: int = 3, backoff: float = 2.0):
    errors: List[str] = []
    for attempt in range(1, max_retries + 1):
        try:
            start = time.time()
            # Compose messages with system + user
            messages = [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": prompt_text}
            ]
            response = call_model(messages)
            elapsed = time.time() - start
            text = response.choices[0].message.content.strip() if response.choices else ""
            if not text:
                raise ValueError("Empty response content.")
            print(f"[{label}] Success attempt={attempt} time={elapsed:.2f}s")
            return dict(text=text, raw=response, attempts=attempt, errors=errors)
        except Exception as e:
            err_msg = f"[{label}] Attempt {attempt} failed: {e}"
            print(err_msg)
            errors.append(err_msg)
            if attempt < max_retries:
                sleep_time = backoff ** (attempt - 1)
                time.sleep(sleep_time)
    print(f"[{label}] All attempts failed.")
    return dict(text="", raw=None, attempts=max_retries, errors=errors)

# 7. Prompts (EXACT TEXT PRESERVED from previous implementation) --------------
prompt_1_analysis = f"""
## Task 1: Initial Data Analysis

You are given:
### Concise Schema Summary
```
{schema_concise}
```

### Random Sample ({SAMPLE_ROWS} rows)
```
{sample_md}
```

### Directives
1. Use the schema and sample to perform a sharp exploratory data analysis (EDA).
2. Create a table of unique value counts for all columns in the dataset.
3. Create a table of high cardinality columns. Ensure to explain why the columns were selected.
4. Generate a data quality analysis containing accuracy, completeness, consistency, uniqueness, validity, and timeliness before data cleaning.

Output ONLY under heading:
## Analytical Insights
Use concise Markdown sections & tables. Avoid code.
"""

# 8. Execution Chain ---------------------------------------------------------
chain_artifacts: Dict[str, str] = {}

print("--- Chain Step 1: Initial Analysis ---")
analysis_result = generate_with_retry(prompt_1_analysis, label="analysis")

if not analysis_result["text"]:
    print("Aborting chain: analysis stage failed.")
else:
    chain_artifacts["analysis_md"] = analysis_result["text"]
    display(Markdown("### Analysis Received"))
    display(Markdown(chain_artifacts["analysis_md"]))

    # Step 2: Plan (prompt uses prior output verbatim)
    prompt_2_plan = f"""
## Task 2: Data Cleaning Plan

Using the prior analysis:

### Analytical Insights
{chain_artifacts['analysis_md']}

Produce a prioritized, ordered bullet list of data cleaning and machine learning preprocessing steps.
Generate a data quality analysis containing accuracy, completeness, consistency, uniqueness, validity, and timeliness before data cleaning.
Do not perform train-test splits or modeling.
No code. Include: objective, columns impacted, and rationale.
Heading required: ## Data Cleaning Steps
"""
    print("\n--- Chain Step 2: Cleaning Plan ---")
    plan_result = generate_with_retry(prompt_2_plan, label="plan")

    if not plan_result["text"]:
        print("Aborting chain: plan stage failed.")
    else:
        chain_artifacts["plan_md"] = plan_result["text"]
        display(Markdown("### Cleaning Plan Received"))
        display(Markdown(chain_artifacts["plan_md"]))

        # Step 3: Code
        prompt_3_code = f"""
## Task 3: Data Cleaning Code

### Data Cleaning Steps
{chain_artifacts['plan_md']}

## Directives
1. Create fully executable Python code that implements the data cleaning plan developed in Task 2.
2. Your code should prepare one fully preprocessed dataframe. Do NOT perform any splitting or modeling.
3. Do not use sample data, only the full dataset loaded from the CSV file at path: {DATA_PATH}
4. Provide extremely detailed comments explaining each step of the code.


Return as ONE fenced Python code block under heading:
## Data Cleaning Code
"""
        print("\n--- Chain Step 3: Code Generation ---")
        code_result = generate_with_retry(prompt_3_code, label="code")

        if not code_result["text"]:
            print("Code generation failed.")
        else:
            chain_artifacts["code_md"] = code_result["text"]
            display(Markdown("### Data Cleaning Code Received"))
            display(Markdown(chain_artifacts["code_md"]))

        # Step 4: Validation
        prompt_4_validation = f"""
## Task 4: Data Cleaning Validation

### Data Cleaning Steps
{chain_artifacts['code_md']}

## Directives
1. Provide reasoning for any changes made from the original code.
2. Review the provided data cleaning code for correctness and completeness.
3. Ensure all steps from the cleaning plan are implemented in order.
4. Ensure the code is executable without errors using the full dataset loaded from the provided CSV file.
5. Provide extremely detailed comments explaining each step of the code.
6. Your code should prepare one fully preprocessed dataframe. Do NOT perform any splitting or modeling.
7. If the original code is correct, return it unchanged and note that no changes were needed.

Return as ONE fenced Python code block under heading:
## Data Cleaning Code - Validated
"""
        print("\n--- Chain Step 4: Code Validation ---")
        code_result = generate_with_retry(prompt_4_validation, label="validation")

        if not code_result["text"]:
            print("Validation check failed.")
        else:
            chain_artifacts["code_md"] = code_result["text"]
            display(Markdown("### Data Validation Received"))
            display(Markdown(chain_artifacts["code_md"]))


print("\n--- Prompt Chain Complete ---")

# 9. Export Artifacts --------------------------------------------------------
EXPORT_DIR = os.path.dirname(DATA_PATH)
export_path = "C:\\Users\\macdo\\Github\\VNFCDR-1\\Chris Code\\Generative AI Data Cleaning\\Machine Learning\\Step 4_GPT Prompting\\UNSW_NB15_Train_Test_Concatenated\\GPT_API_Exploratory_Analysis_Export.md"
try:
    with open(export_path, "w", encoding="utf-8") as f:
        if chain_artifacts:
            f.write("# EDA Prompt Chain Output (GPT)\n\n")
            for k, v in chain_artifacts.items():
                pretty = k.replace("_md", "").capitalize()
                f.write(f"\n\n## {pretty}\n\n")
                f.write(v.strip() + "\n")
        else:
            f.write("No artifacts generated (chain failed).")
    print(f"Exported chain artifacts to: {export_path}")
except Exception as e:
    print(f"Export failed: {e}")

# 10. Diagnostic Recap -------------------------------------------------------

def print_diagnostics():
    print("\n=== Diagnostics Summary ===")
    print(f"Model used: {MODEL_NAME}")
    if analysis_result.get('errors'):
        print(f"Analysis errors: {len(analysis_result['errors'])}")
    if 'plan_result' in locals() and plan_result.get('errors'):
        print(f"Plan errors: {len(plan_result['errors'])}")
    if 'code_result' in locals() and code_result.get('errors'):
        print(f"Code errors: {len(code_result['errors'])}")
    print("Artifacts generated:", list(chain_artifacts.keys()))

print_diagnostics()

OpenAI API client configured successfully.
Dataset loaded successfully. Shape: (257673, 42)
--- Chain Step 1: Initial Analysis ---
[analysis] Success attempt=1 time=100.13s


### Analysis Received

## Analytical Insights

### Dataset overview
- Rows: 257,673; Columns: 42
- Types: 41 numeric (int/float), 1 categorical (attack_cat), 1 binary label
- Nulls: 0% across all columns
- Target: label (binary) and attack_cat (10 classes). Sample shows mapping: Normal → label 0; other categories → label 1.

### Initial EDA highlights (from schema and sample)
- Feature scale/dispersion:
  - Many “rate/load/time” features (dur, rate, sload, dload, sinpkt, dinpkt, sjit, djit, tcprtt, synack, ackdat) exhibit very high cardinality, implying continuous/heavy-tailed distributions. Extremely small dur values in sample create very large sload/dload/rate outliers.
  - Byte/packet features (sbytes, dbytes, spkts, dpkts) show moderate-to-high cardinality and likely strong positive skew.
- Low-cardinality protocol/state features:
  - sttl (13), dttl (9), swin (22), dwin (19), ct_state_ttl (7), ct_flw_http_mthd (11), is_sm_ips_ports (2), is_ftp_login (4), ct_ftp_cmd (4) — amenable to integer/bin encoding or small one-hot.
- Potentially derived/collinear groups:
  - sload/dload likely functions of bytes and time; tcprtt ~ synack + ackdat; sbytes/dbytes with spkts/dpkts; suggests multicollinearity to manage during modeling.
- Identifiers:
  - id has 175,341 unique values (≈68% of rows), indicating duplicates of id exist; id should not be used as a predictive feature.
- Class distribution:
  - Not provided; given 10 attack_cat classes, class imbalance is plausible and should be quantified later.

### Unique value counts (all columns)
| Column | Unique |
|---|---:|
| id | 175341 |
| dur | 109945 |
| spkts | 646 |
| dpkts | 627 |
| sbytes | 9382 |
| dbytes | 8653 |
| rate | 115763 |
| sttl | 13 |
| dttl | 9 |
| sload | 121356 |
| dload | 116380 |
| sloss | 490 |
| dloss | 476 |
| sinpkt | 114318 |
| dinpkt | 110270 |
| sjit | 117101 |
| djit | 114861 |
| swin | 22 |
| stcpb | 114473 |
| dtcpb | 114187 |
| dwin | 19 |
| tcprtt | 63878 |
| synack | 57366 |
| ackdat | 53248 |
| smean | 1377 |
| dmean | 1362 |
| trans_depth | 14 |
| response_body_len | 2819 |
| ct_srv_src | 57 |
| ct_state_ttl | 7 |
| ct_dst_ltm | 52 |
| ct_src_dport_ltm | 52 |
| ct_dst_sport_ltm | 35 |
| ct_dst_src_ltm | 58 |
| is_ftp_login | 4 |
| ct_ftp_cmd | 4 |
| ct_flw_http_mthd | 11 |
| ct_src_ltm | 52 |
| ct_srv_dst | 57 |
| is_sm_ips_ports | 2 |
| attack_cat | 10 |
| label | 2 |

### High-cardinality columns
Criterion: unique values ≥ 1% of rows (≥ 2,577 distinct), which signals near-continuous features that should not be one-hot encoded and may need scaling/outlier treatment.

| Column | Unique | % of rows | Rationale |
|---|---:|---:|---|
| id | 175341 | 68.1% | Identifier; high uniqueness; exclude from modeling to avoid leakage/noise. |
| dur | 109945 | 42.7% | Continuous time; heavy tail due to tiny durations. |
| rate | 115763 | 44.9% | Derived rate; large spread; sensitive to small dur. |
| sload | 121356 | 47.1% | Load metric; heavy-tailed; closely tied to sbytes/dur. |
| dload | 116380 | 45.1% | As above for destination. |
| sinpkt | 114318 | 44.4% | Inter-packet timing; continuous, skewed. |
| dinpkt | 110270 | 42.8% | As above for destination. |
| sjit | 117101 | 45.4% | Jitter; continuous, heavy-tailed. |
| djit | 114861 | 44.6% | As above for destination. |
| stcpb | 114473 | 44.4% | TCP base/seq-like; near-continuous; may encode order rather than magnitude. |
| dtcpb | 114187 | 44.3% | As above for destination. |
| tcprtt | 63878 | 24.8% | RTT; continuous; likely correlated with synack/ackdat. |
| synack | 57366 | 22.3% | TCP phase timing; continuous. |
| ackdat | 53248 | 20.7% | TCP phase timing; continuous. |
| sbytes | 9382 | 3.6% | Byte counts; skewed; many unique values. |
| dbytes | 8653 | 3.4% | As above for destination. |
| response_body_len | 2819 | 1.1% | HTTP payload length; moderate-high variety. |

Why selected: These columns are near-continuous with many unique values relative to dataset size. One-hot encoding would explode dimensionality and add noise; appropriate treatment includes numeric scaling, winsorization/clipping, or log transforms. id is included as high-cardinality but should be dropped as it is an identifier.

### Data quality analysis (pre-cleaning)

- Completeness
  - Strength: 0% missing across all columns (per schema).
  - Caveat: Structural zeros may represent “not applicable” rather than true zeros (e.g., many zero values in synack/ackdat/tcprtt for some flows). Flag for semantic-missing assessment.

- Accuracy
  - Unknown without external ground truth. Plausibility checks advised:
    - Non-negativity for counts/times (all samples consistent).
    - TCP timing consistency: tcprtt ≈ synack + ackdat should roughly hold.
    - sload/dload/rate consistency with bytes/duration; extreme ratios likely when dur ~ 0.

- Consistency
  - Categorical alignment: Sample shows attack_cat = Normal implies label = 0; non-Normal implies label = 1. Verify dataset-wide.
  - Domain coherence:
    - sttl/dttl small value sets are consistent with TTL/state buckets.
    - Window sizes (swin/dwin) and means (smean/dmean) within expected ranges in sample; verify upper bounds and MTU-related constraints.
  - Derived relationships likely produce multicollinearity (e.g., sload with sbytes/dur; tcprtt with synack/ackdat). Not an error, but relevant for modeling.

- Uniqueness
  - id is not unique (175,341 unique vs. 257,673 rows), indicating repeated ids or session reuse. Assess whether duplicates are legitimate multi-record flows or require deduplication.
  - No other explicit primary key present.

- Validity
  - Types: All numeric columns correctly typed; attack_cat is object (categorical).
  - Value ranges: Counts/times non-negative in sample; TTL-like features within small sets; binary flags limited to 0/1 (is_sm_ips_ports) and small sets for is_ftp_login/ct_ftp_cmd. Verify full-range adherence and out-of-domain values.
  - Class labels: Binary with 2 unique values; ensure only {0,1} present.

- Timeliness
  - No event timestamp present; dur is per-flow duration, not dataset recency. Timeliness/recency cannot be assessed from given schema. If modeling for detection in current environments, confirm data collection period and drift risk separately.

- Distributional risks for modeling
  - Heavy tails and extreme outliers in rate/load/time features due to very small durations.
  - Potential class-conditional sparsity patterns (e.g., many zeros in timing features for certain attack types) may yield strong separability; validate that no data leakage from post-hoc features exists.
  - Class imbalance likely across 10 attack categories; quantify before model training.


--- Chain Step 2: Cleaning Plan ---
[plan] Success attempt=1 time=101.32s


### Cleaning Plan Received

Data Quality Analysis (pre-cleaning)
- Completeness
  - 0% missing across all columns. Watch for structural zeros that mean “not applicable” (e.g., tcprtt/synack/ackdat/sinpkt/dinpkt can be 0 by design).
- Accuracy
  - Plausibility checks recommended: non-negativity for counts/times; tcprtt ≈ synack + ackdat; loads/rates consistent with bytes and duration. Extremely small dur can inflate rate/sload/dload; cap or transform.
- Consistency
  - Verify mapping: attack_cat == "Normal" implies label == 0; all others imply label == 1. Ensure no contradictions. Check domain ranges of TTL/window sizes. Confirm binary fields only take allowed values.
- Uniqueness
  - id is not unique (≈68% unique). Remove as a predictive feature and consider deduplicating exact duplicate rows to reduce bias.
- Validity
  - Types appear correct: 41 numeric, 1 categorical (attack_cat), 1 binary label. Ensure labels ∈ {0,1}, attack_cat in 10 known classes, and no out-of-domain values (e.g., negative durations/bytes).
- Timeliness
  - No event timestamps; cannot assess recency. If used for current detection, validate collection period and drift separately.

## Data Cleaning Steps
1) Schema lock and dtypes enforcement
- Objective: Establish stable, correct data types and expected value domains to prevent downstream errors.
- Columns impacted: All; special attention to attack_cat (category), label (integer/bool), low-cardinality flags, and numeric metrics.
- Rationale: Explicit dtypes avoid implicit conversions and ensure consistent handling (e.g., categorical encoding vs numeric scaling).

2) Identifier removal
- Objective: Eliminate non-predictive identifiers to prevent leakage/noise.
- Columns impacted: id (drop).
- Rationale: High-cardinality identifier with no causal relationship; retaining can harm generalization.

3) Exact duplicate handling
- Objective: Remove duplicated records that can bias learning and metrics.
- Columns impacted: All feature columns plus targets; define duplicates on all columns except id (already dropped).
- Rationale: Exact duplicates overweight certain patterns and risk leakage if later split by row.

4) Label–category consistency audit and repair
- Objective: Ensure binary label aligns with attack_cat semantics.
- Columns impacted: label, attack_cat.
- Rationale: Inconsistent supervision corrupts training. If rows violate: either correct label based on attack_cat or exclude conflicting rows (document count).

5) Domain validity checks and hard constraints
- Objective: Enforce physical/semantic constraints; quarantine impossible rows.
- Columns impacted: dur, bytes/packets (sbytes, dbytes, spkts, dpkts), timing fields (synack, ackdat, tcprtt, sinpkt, dinpkt, sjit, djit), loads/rates (sload, dload, rate), window/TTL (swin, dwin, sttl, dttl), binary flags.
- Rationale: Negative values are invalid; window/TTL ranges should be non-negative and within plausible bounds. Either filter invalid rows or set offending values to NaN and impute (prefer filtering if rare).

6) Continuous feature skew mitigation (log1p)
- Objective: Stabilize variance and reduce heavy right tails.
- Columns impacted: Positive/zero-valued skewed metrics: dur, rate, sload, dload, sbytes, dbytes, spkts, dpkts, sinpkt, dinpkt, sjit, djit, synack, ackdat, tcprtt, response_body_len, smean, dmean. Exclude zero/negative-including true centered measures; use log1p to safely handle zeros.
- Rationale: Heavy tails degrade many models; log1p is robust for non-negative data and reduces sensitivity to tiny dur producing huge ratios.

7) Extreme outlier capping (winsorization)
- Objective: Limit undue influence of extreme values while preserving order.
- Columns impacted: After log1p, apply per-feature caps on highly volatile features: rate, sload, dload, sinpkt, dinpkt, sjit, djit, synack, ackdat, tcprtt, dur, sbytes, dbytes.
- Rationale: Even after transformation, tails may remain. Cap at robust quantiles (e.g., 0.1th–99.9th or 0.5th–99.5th) determined on training data to control leverage.

8) Multicollinearity control among derived features
- Objective: Reduce redundant information and improve model stability/interpretability.
- Columns impacted: Derived groups: {rate, sload, dload} vs {sbytes, dbytes, dur}; {tcprtt} vs {synack, ackdat}; {smean, dmean} vs packet/byte counts; stcpb, dtcpb (sequence-like).
- Rationale: Strong linear dependencies inflate variance in linear models and can confuse feature importance. Default plan:
  - Drop tcprtt (keep synack, ackdat).
  - Prefer core primitives: keep dur, sbytes, dbytes, spkts, dpkts; drop rate, sload, dload. Document alternative if model family benefits from rates.
  - Drop stcpb, dtcpb (near-random magnitudes, not semantically meaningful).
  - Optionally assess VIF/feature correlation to finalize drops; target VIF < 10.

9) Categorical encoding for protocol/state flags
- Objective: Properly represent nominal features without imposing false ordinality.
- Columns impacted: One-hot encode ct_state_ttl (7), ct_flw_http_mthd (11), is_sm_ips_ports (2), is_ftp_login (4), ct_ftp_cmd (4). Keep sttl, dttl, swin, dwin as numeric (ordered, small-range).
- Rationale: These are codes/flags or small sets; one-hot preserves semantics and is compact. Numeric TTL/window values contain magnitude information; treat as continuous.

10) Zero-inflation indicators for structural zeros
- Objective: Preserve informative absence patterns without distorting distributions.
- Columns impacted: Add binary indicators for zero values in key timing/load fields: I(dur==0), I(sinpkt==0), I(dinpkt==0), I(sjit==0), I(djit==0), I(synack==0), I(ackdat==0), I(rate==0), I(sload==0), I(dload==0) before any dropping of derived features; if rate/sload/dload dropped, omit those indicators.
- Rationale: Structural zeros can be class-informative; separate indicators let models benefit while main features are transformed/scaled.

11) Scaling of continuous features
- Objective: Put features on comparable scales robust to outliers.
- Columns impacted: All continuous numeric features retained after steps 6–8 (including sttl, dttl, swin, dwin, counts, and transformed timings/bytes).
- Rationale: RobustScaler (IQR-based) or StandardScaler (if tails are well-tamed) improves optimization and model performance, especially for distance/gradient-based models.

12) Rare category handling (if any)
- Objective: Prevent sparse dummies from overfitting.
- Columns impacted: One-hot encoded categories with very low frequency (e.g., <0.1% of rows).
- Rationale: Combine into an “Other” bucket during encoding to stabilize estimates.

13) Class distribution profiling and artifact check
- Objective: Quantify imbalance and ensure no leakage-like near-deterministic features.
- Columns impacted: Targets (label, attack_cat) and all features for quick mutual information or separability scans.
- Rationale: Guides downstream choices (e.g., class weights) and validates that post-hoc features do not trivially reveal the label. No modeling performed here; just profiling and documentation.

14) Final integrity and export of clean feature matrix
- Objective: Validate no NaNs/Infs after transformations; persist clean dataset for modeling.
- Columns impacted: All; ensure finite values; verify dummy matrices align with categories seen in training data (to be fixed later during pipeline fit).
- Rationale: Guarantees downstream code stability and reproducibility.

Notes
- Targets: Support both binary (label) and multiclass (attack_cat). Keep both; modelers can choose task.
- Train/test considerations (deferred): When splitting later, stratify by target and consider grouping by id or duplicate keys to avoid leakage across folds.


--- Chain Step 3: Code Generation ---
[code] Success attempt=1 time=128.11s


### Data Cleaning Code Received

## Data Cleaning Code
```python
"""
Data Cleaning and Preprocessing for UNSW-NB15 (concatenated) dataset.

This script implements the cleaning plan described in Task 3:
1) Schema lock and dtypes enforcement
2) Identifier removal
3) Exact duplicate handling
4) Label–category consistency audit and repair
5) Domain validity checks and hard constraints
6) Continuous feature skew mitigation (log1p)
7) Extreme outlier capping (winsorization)
10) Zero-inflation indicators for structural zeros (applied BEFORE dropping derived features)
8) Multicollinearity control among derived features
9) Categorical encoding for selected protocol/state flags
11) Scaling of continuous features
12) Rare category handling (applied within step 9)
13) Class distribution profiling and artifact check (non-modeling diagnostics)
14) Final integrity checks and export

Notes:
- This code avoids splitting or modeling; it prepares a single preprocessed DataFrame.
- It is defensive to slight schema variations by checking column existence before operations.
- All steps print concise diagnostics to assist auditability without interrupting execution.
"""

import os
import sys
import math
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings("ignore", category=FutureWarning)

# -------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------

# Input CSV path (must exist)
CSV_PATH = r"E:\Datasets\UNSW-NB15\Training and Testing Sets\UNSW_NB15_concatenated_dropped.csv"

# Output (optional) - will attempt to write in the same directory
OUTPUT_FILENAME = "UNSW_NB15_preprocessed.csv"

# Robust winsorization quantiles (after log1p transformation)
LOW_Q = 0.005
HIGH_Q = 0.995

# Rare category threshold (proportion of dataset); categories under this will be pooled into 'Other'
RARE_CAT_THRESHOLD = 0.001  # 0.1%

# -------------------------------------------------------------------
# Utilities
# -------------------------------------------------------------------

def normalize_attack_cat(val):
    """
    Normalize attack category string to canonical form if recognized.
    Returns (canonical_string, recognized_boolean)
    """
    if pd.isna(val):
        return val, False
    s = str(val).strip().lower()
    mapping = {
        "normal": "Normal",
        "fuzzers": "Fuzzers",
        "analysis": "Analysis",
        "backdoor": "Backdoor",
        "backdoors": "Backdoor",  # sometimes plural
        "dos": "DoS",
        "exploits": "Exploits",
        "generic": "Generic",
        "reconnaissance": "Reconnaissance",
        "shellcode": "Shellcode",
        "worms": "Worms",
    }
    if s in mapping:
        return mapping[s], True
    return val, False  # return original if not recognized


def print_step_header(step_text):
    print("\n" + "-" * 80)
    print(step_text)
    print("-" * 80)


def safe_intersect(cols, df_columns):
    return [c for c in cols if c in df_columns]


def is_binary_series(s):
    """Check if a pandas Series contains only {0,1} (ignoring NaNs)."""
    vals = pd.unique(s.dropna())
    return set(vals).issubset({0, 1})


# -------------------------------------------------------------------
# Load data
# -------------------------------------------------------------------

print_step_header("Loading data")
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"Input CSV not found at path: {CSV_PATH}")

df = pd.read_csv(CSV_PATH, low_memory=False)
print(f"Loaded shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# -------------------------------------------------------------------
# 1) Schema lock and dtypes enforcement
# -------------------------------------------------------------------

print_step_header("1) Schema lock and dtypes enforcement")

# Expected targets (keep both)
target_label_col = "label"
target_multiclass_col = "attack_cat"

# Ensure 'label' is present; if not, raise error because it is a key target
if target_label_col not in df.columns:
    raise KeyError("Expected binary target column 'label' not found.")

# Enforce 'label' as numeric integer {0,1} if possible
df[target_label_col] = pd.to_numeric(df[target_label_col], errors="coerce").astype("Int64")

# Ensure 'attack_cat' exists; if not, we proceed but some checks will be skipped
if target_multiclass_col in df.columns:
    # Convert to string then category for consistent handling (allowing normalization later)
    df[target_multiclass_col] = df[target_multiclass_col].astype("string")
else:
    print("Warning: 'attack_cat' column not found; multiclass target-related checks will be skipped.")

# Convert all other numeric-like columns to numeric dtype when possible (coerce errors to NaN)
non_target_cols = [c for c in df.columns if c not in [target_label_col, target_multiclass_col]]
for col in non_target_cols:
    # Try to detect numeric; if it contains any non-numeric, coerce to numeric (NaN) where needed.
    if df[col].dtype == object:
        # Heuristic: attempt to convert if majority of values are representable as numbers
        converted = pd.to_numeric(df[col], errors="coerce")
        if converted.notna().mean() > 0.9:
            df[col] = converted

# -------------------------------------------------------------------
# 2) Identifier removal
# -------------------------------------------------------------------

print_step_header("2) Identifier removal")
drop_id_cols = [c for c in ["id"] if c in df.columns]
if drop_id_cols:
    df.drop(columns=drop_id_cols, inplace=True)
    print(f"Dropped identifier columns: {drop_id_cols}")
else:
    print("No identifier columns found to drop.")

# -------------------------------------------------------------------
# 3) Exact duplicate handling
# -------------------------------------------------------------------

print_step_header("3) Exact duplicate handling")
before = len(df)
df = df.drop_duplicates(ignore_index=True)
after = len(df)
print(f"Removed {before - after} exact duplicate rows. New shape: {df.shape}")

# -------------------------------------------------------------------
# 4) Label–category consistency audit and repair
# -------------------------------------------------------------------

print_step_header("4) Label–category consistency audit and repair")
if target_multiclass_col in df.columns:
    # Normalize attack_cat values to canonical names if recognized
    normalized_vals = df[target_multiclass_col].apply(normalize_attack_cat)
    df[target_multiclass_col] = normalized_vals.apply(lambda x: x[0])
    recognized_mask = normalized_vals.apply(lambda x: x[1])

    # Derive label from attack_cat when recognized
    recognized_cats = df.loc[recognized_mask, target_multiclass_col]
    derived_label = pd.Series(index=df.index, dtype="Int64")
    derived_label.loc[recognized_mask] = (recognized_cats != "Normal").astype("Int64")

    # Identify and repair contradictions (only where derived_label is known)
    mismatch_mask = recognized_mask & df[target_label_col].notna() & (df[target_label_col] != derived_label)
    mismatches = int(mismatch_mask.sum())

    # For rows where label is NaN but derived_label is known, fill from derived_label
    fill_from_cat_mask = recognized_mask & df[target_label_col].isna()
    fills = int(fill_from_cat_mask.sum())

    # Apply fixes
    df.loc[mismatch_mask, target_label_col] = derived_label[mismatch_mask]
    df.loc[fill_from_cat_mask, target_label_col] = derived_label[fill_from_cat_mask]

    print(f"Normalized recognized attack_cat values: {int(recognized_mask.sum())} rows.")
    print(f"Repaired label to match attack_cat in {mismatches} conflicting rows.")
    print(f"Filled missing labels from attack_cat in {fills} rows.")

# Ensure 'label' is strictly 0/1; if non-binary values remain, attempt to coerce or drop
if not is_binary_series(df[target_label_col].astype("float").fillna(-1)):
    # Try a secondary coercion: any positive -> 1, zero -> 0
    print("Warning: Non-binary values detected in 'label'. Coercing positive->1, zero/negative->0.")
    df[target_label_col] = (pd.to_numeric(df[target_label_col], errors="coerce").fillna(0) > 0).astype("Int64")

# Drop rows where label is still NaN or not binary after coercion
valid_label_mask = df[target_label_col].isin([0, 1])
dropped_invalid_label = int((~valid_label_mask).sum())
df = df.loc[valid_label_mask].reset_index(drop=True)
if dropped_invalid_label > 0:
    print(f"Dropped {dropped_invalid_label} rows with invalid 'label'. New shape: {df.shape}")

# -------------------------------------------------------------------
# 5) Domain validity checks and hard constraints
# -------------------------------------------------------------------

print_step_header("5) Domain validity checks and hard constraints")

# Define domain rules
non_negative_cols = [
    "dur","sbytes","dbytes","spkts","dpkts",
    "sload","dload","rate",
    "sinpkt","dinpkt","sjit","djit",
    "synack","ackdat","tcprtt",
    "response_body_len",
    "smean","dmean","smeansz","dmeansz",
    "stcpb","dtcpb",
    "trans_depth",
    "sttl","dttl","swin","dwin",
]

# TTL plausible bounds
ttl_cols = ["sttl", "dttl"]
ttl_min, ttl_max = 0, 255

# Known small-domain flag/category columns (will be one-hot encoded later)
small_cat_cols = ["ct_state_ttl", "ct_flw_http_mthd", "is_sm_ips_ports", "is_ftp_login", "ct_ftp_cmd"]

# Enforce numeric dtypes for numeric domain columns when present
present_nonneg_cols = safe_intersect(non_negative_cols, df.columns)
for col in present_nonneg_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Non-negativity hard filter
neg_mask_any = pd.Series(False, index=df.index)
for col in present_nonneg_cols:
    neg_mask = df[col] < 0
    neg_mask_any = neg_mask_any | (neg_mask.fillna(False))

neg_count = int(neg_mask_any.sum())
if neg_count > 0:
    print(f"Filtering out {neg_count} rows with negative values in non-negative constrained columns.")
    df = df.loc[~neg_mask_any].reset_index(drop=True)

# TTL hard range check [0, 255]
present_ttl_cols = safe_intersect(ttl_cols, df.columns)
if present_ttl_cols:
    out_of_range_mask_any = pd.Series(False, index=df.index)
    for col in present_ttl_cols:
        out_of_range_mask = ~df[col].between(ttl_min, ttl_max)
        out_of_range_mask_any = out_of_range_mask_any | (out_of_range_mask.fillna(False))
    out_range_count = int(out_of_range_mask_any.sum())
    if out_range_count > 0:
        print(f"Filtering out {out_range_count} rows with TTL out of [{ttl_min},{ttl_max}].")
        df = df.loc[~out_of_range_mask_any].reset_index(drop=True)

# Binary domain for is_sm_ips_ports if present (should be {0,1})
if "is_sm_ips_ports" in df.columns:
    df["is_sm_ips_ports"] = pd.to_numeric(df["is_sm_ips_ports"], errors="coerce").astype("Int64")
    valid_binary = df["is_sm_ips_ports"].isin([0, 1])
    invalid_binary_count = int((~valid_binary).sum())
    if invalid_binary_count > 0:
        print(f"Dropping {invalid_binary_count} rows with invalid 'is_sm_ips_ports' values (expect 0/1).")
        df = df.loc[valid_binary].reset_index(drop=True)

# Ensure we removed any NaNs introduced so far in strict columns:
strict_cols = present_nonneg_cols + present_ttl_cols + (["is_sm_ips_ports"] if "is_sm_ips_ports" in df.columns else [])
if strict_cols:
    nan_mask_any = df[strict_cols].isna().any(axis=1)
    nan_count = int(nan_mask_any.sum())
    if nan_count > 0:
        print(f"Dropping {nan_count} rows with NaN in strict domain columns after enforcement.")
        df = df.loc[~nan_mask_any].reset_index(drop=True)

print(f"Post domain checks shape: {df.shape}")

# -------------------------------------------------------------------
# 6) Continuous feature skew mitigation (log1p)
# -------------------------------------------------------------------

print_step_header("6) Continuous feature skew mitigation (log1p)")

# Columns suited for log1p (only if present and non-negative by design)
log1p_candidates = [
    "dur","rate","sload","dload",
    "sbytes","dbytes","spkts","dpkts",
    "sinpkt","dinpkt","sjit","djit",
    "synack","ackdat","tcprtt",
    "response_body_len",
    "smean","dmean","smeansz","dmeansz",
]

log1p_cols = safe_intersect(log1p_candidates, df.columns)

# Ensure these columns are non-negative (should be after Step 5)
# Apply log1p transformation in-place
for col in log1p_cols:
    # If any residual negatives (shouldn't happen), shift to 0 as a last resort
    min_val = df[col].min()
    if pd.notna(min_val) and min_val < 0:
        shift = abs(min_val)
        print(f"Warning: {col} has negative values after domain checks. Shifting by {shift} before log1p.")
        df[col] = df[col] + shift
    df[col] = np.log1p(df[col].astype(float))

print(f"Applied log1p to {len(log1p_cols)} columns: {log1p_cols}")

# -------------------------------------------------------------------
# 7) Extreme outlier capping (winsorization) on log-transformed features
# -------------------------------------------------------------------

print_step_header("7) Extreme outlier capping (winsorization)")
winsorize_candidates = [
    "rate","sload","dload","sinpkt","dinpkt","sjit","djit",
    "synack","ackdat","tcprtt","dur","sbytes","dbytes"
]
winsor_cols = safe_intersect(winsorize_candidates, df.columns)

for col in winsor_cols:
    lo = df[col].quantile(LOW_Q)
    hi = df[col].quantile(HIGH_Q)
    if pd.isna(lo) or pd.isna(hi):
        continue
    if lo > hi:
        lo, hi = hi, lo
    df[col] = df[col].clip(lower=lo, upper=hi)

print(f"Winsorized {len(winsor_cols)} columns at [{LOW_Q*100:.1f}%, {HIGH_Q*100:.1f}%] quantiles.")

# -------------------------------------------------------------------
# 10) Zero-inflation indicators for structural zeros (do BEFORE feature dropping)
# Note: We place step 10 before step 8 to preserve zero-pattern info for dropped features.
# -------------------------------------------------------------------

print_step_header("10) Zero-inflation indicators for structural zeros")

zero_indicator_candidates = [
    "dur","sinpkt","dinpkt","sjit","djit","synack","ackdat","tcprtt"
    # We omit rate/sload/dload indicators because they will be dropped for multicollinearity
]
present_zero_cols = safe_intersect(zero_indicator_candidates, df.columns)

zero_ind_cols = []
for col in present_zero_cols:
    ind_col = f"is_zero__{col}"
    # Use a tight equality check; post-log1p zero implies original was zero
    # If column was log-transformed, values equal to 0 imply original zero.
    zero_ind = (df[col] == 0).astype("int8")
    df[ind_col] = zero_ind
    zero_ind_cols.append(ind_col)

print(f"Added {len(zero_ind_cols)} zero-indicator columns: {zero_ind_cols}")

# -------------------------------------------------------------------
# 8) Multicollinearity control among derived features
# -------------------------------------------------------------------

print_step_header("8) Multicollinearity control among derived features")

drop_for_collinearity = ["tcprtt", "rate", "sload", "dload", "stcpb", "dtcpb"]
drop_existing = safe_intersect(drop_for_collinearity, df.columns)
df.drop(columns=drop_existing, inplace=True, errors="ignore")
print(f"Dropped for multicollinearity (present): {drop_existing}")

# -------------------------------------------------------------------
# 9) Categorical encoding for protocol/state flags (+ step 12 rare cat handling inside)
# -------------------------------------------------------------------

print_step_header("9) Categorical encoding for selected small-domain features (+ rare handling)")

ohe_base_cols = ["ct_state_ttl", "ct_flw_http_mthd", "is_sm_ips_ports", "is_ftp_login", "ct_ftp_cmd"]
ohe_cols_present = safe_intersect(ohe_base_cols, df.columns)

# We'll one-hot encode these columns if present (even if numeric codes), with rare categories pooled into "Other"
created_dummies = []
for col in ohe_cols_present:
    # Treat as strings for robust one-hot encoding; fill missing as "Unknown"
    col_as_str = df[col].astype("Int64") if pd.api.types.is_numeric_dtype(df[col]) else df[col].astype("string")
    col_as_str = col_as_str.astype("string").fillna("Unknown")

    # Rare category pooling
    vc = col_as_str.value_counts(dropna=False)
    threshold = max(2, int(math.floor(RARE_CAT_THRESHOLD * len(df))))  # at least 2 rows to keep as unique category
    rare_cats = set(vc[vc < threshold].index)
    pooled = col_as_str.where(~col_as_str.isin(rare_cats), other="Other")

    # Create dummies with consistent naming
    dummies = pd.get_dummies(pooled, prefix=col, prefix_sep="=", dtype=np.uint8)
    df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
    created_dummies.extend(list(dummies.columns))

print(f"One-hot encoded columns: {ohe_cols_present}")
print(f"Created {len(created_dummies)} dummy columns.")

# -------------------------------------------------------------------
# 11) Scaling of continuous features (RobustScaler)
# - Scale only continuous numeric features (exclude targets, OHE binaries, and zero-indicator binaries)
# -------------------------------------------------------------------

print_step_header("11) Scaling of continuous features (RobustScaler)")

# Identify numeric columns post-encoding
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Exclusions from scaling:
exclude_from_scaling = set()
# Targets
exclude_from_scaling.add(target_label_col)
# Zero indicators
exclude_from_scaling.update(zero_ind_cols)
# OHE dummy columns (uint8 dummies created in step 9)
exclude_from_scaling.update(created_dummies)

# Final list of continuous columns to scale
scale_cols = [c for c in numeric_cols if c not in exclude_from_scaling]

if scale_cols:
    scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0))
    df[scale_cols] = scaler.fit_transform(df[scale_cols].astype(float))
    print(f"Scaled {len(scale_cols)} continuous features with RobustScaler.")
else:
    print("No continuous columns identified for scaling.")

# -------------------------------------------------------------------
# 12) Rare category handling (handled in step 9 via pooling before encoding)
# -------------------------------------------------------------------
print_step_header("12) Rare category handling")
print("Rare category pooling performed during step 9 before one-hot encoding.")

# -------------------------------------------------------------------
# 13) Class distribution profiling and artifact check (no modeling)
# -------------------------------------------------------------------

print_step_header("13) Class distribution profiling and artifact check")

# Label distribution
label_counts = df[target_label_col].value_counts(dropna=False)
print("Label distribution:")
print(label_counts.to_string())

# Multiclass distribution if available
if target_multiclass_col in df.columns:
    print("\nattack_cat distribution (top 20):")
    print(df[target_multiclass_col].value_counts(dropna=False).head(20).to_string())

# Quick leakage-like check: high correlation with label for numeric features
print("\nTop 10 absolute correlations with label (numeric features):")
num_for_corr = [c for c in numeric_cols if c != target_label_col and c in df.columns]
if num_for_corr:
    corr = df[num_for_corr].corrwith(df[target_label_col].astype(float)).abs().sort_values(ascending=False)
    print(corr.head(10).to_string())
else:
    print("No numeric features available for correlation analysis.")

# -------------------------------------------------------------------
# 14) Final integrity and export of clean feature matrix
# -------------------------------------------------------------------

print_step_header("14) Final integrity and export of clean feature matrix")

# Replace inf values (should be none) and check for NaNs
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Sanity: ensure no NaNs in numeric features created by our pipeline (original dataset has no missing)
nan_summary = df.isna().sum()
total_nans = int(nan_summary.sum())
if total_nans > 0:
    # In this pipeline, we prefer to drop rows with NaNs to keep a clean matrix
    print(f"Warning: Found {total_nans} missing values after processing; dropping affected rows.")
    df = df.dropna(axis=0).reset_index(drop=True)

print(f"Final preprocessed shape: {df.shape}")

# Expose the final preprocessed DataFrame for downstream usage
df_preprocessed = df.copy()

# Attempt to save to disk (optional)
try:
    out_dir = str(Path(CSV_PATH).parent)
    out_path = os.path.join(out_dir, OUTPUT_FILENAME)
    df_preprocessed.to_csv(out_path, index=False)
    print(f"Saved preprocessed dataset to: {out_path}")
except Exception as e:
    print(f"Could not save preprocessed CSV due to: {e}")

# For interactive sessions, you can inspect df_preprocessed.head()
# print(df_preprocessed.head())
```


--- Chain Step 4: Code Validation ---
[validation] Success attempt=1 time=111.53s


### Data Validation Received

## Data Cleaning Code - Validated
```python
"""
Data Cleaning and Preprocessing for UNSW-NB15 (concatenated) dataset.

VALIDATION SUMMARY (Task 4):
- The original pipeline is largely correct and complete. It implements all plan steps, with an intentional
  reordering where step 10 (zero-inflation indicators) is applied before step 8 (multicollinearity drops)
  to preserve useful zero-pattern information for features that are subsequently removed.
- One targeted change was made for robustness and to avoid unnecessary row loss:
  CHANGE: In Final Integrity (Step 14), instead of dropping rows with NaN in any column (including raw
          non-modeled string columns), we now only drop rows that have NaNs in numeric columns
          (i.e., model-ready features and targets). This aligns with the step’s intent and avoids removing rows
          due to missing values in non-numeric, uninterpreted identifiers or text columns.
  RATIONALE: The previous "drop if any NaN in df" could remove a large number of rows if any unused string
             column had NaNs. Our change keeps the feature matrix clean where it matters (numeric inputs
             and targets) while preserving rows that are otherwise valid for modeling.

All other logic is retained; additional comments were added for clarity.

This script implements the cleaning plan described in Task 3:
1) Schema lock and dtypes enforcement
2) Identifier removal
3) Exact duplicate handling
4) Label–category consistency audit and repair
5) Domain validity checks and hard constraints
6) Continuous feature skew mitigation (log1p)
7) Extreme outlier capping (winsorization)
10) Zero-inflation indicators for structural zeros (applied BEFORE dropping derived features)
8) Multicollinearity control among derived features
9) Categorical encoding for selected protocol/state flags
11) Scaling of continuous features
12) Rare category handling (applied within step 9)
13) Class distribution profiling and artifact check (non-modeling diagnostics)
14) Final integrity checks and export

Notes:
- This code avoids splitting or modeling; it prepares a single preprocessed DataFrame.
- It is defensive to slight schema variations by checking column existence before operations.
- All steps print concise diagnostics to assist auditability without interrupting execution.
"""

import os
import sys
import math
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings("ignore", category=FutureWarning)

# -------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------

# Input CSV path (must exist)
CSV_PATH = r"E:\Datasets\UNSW-NB15\Training and Testing Sets\UNSW_NB15_concatenated_dropped.csv"

# Output (optional) - will attempt to write in the same directory
OUTPUT_FILENAME = "UNSW_NB15_preprocessed.csv"

# Robust winsorization quantiles (after log1p transformation)
LOW_Q = 0.005
HIGH_Q = 0.995

# Rare category threshold (proportion of dataset); categories under this will be pooled into 'Other'
RARE_CAT_THRESHOLD = 0.001  # 0.1%

# -------------------------------------------------------------------
# Utilities
# -------------------------------------------------------------------

def normalize_attack_cat(val):
    """
    Normalize attack category string to canonical form if recognized.
    Returns (canonical_string, recognized_boolean)

    - Handles common UNSW-NB15 attack categories and "Normal"
    - Uses a lowercase map but returns canonical title-case labels.
    """
    if pd.isna(val):
        return val, False
    s = str(val).strip().lower()
    mapping = {
        "normal": "Normal",
        "fuzzers": "Fuzzers",
        "analysis": "Analysis",
        "backdoor": "Backdoor",
        "backdoors": "Backdoor",  # sometimes plural
        "dos": "DoS",
        "exploits": "Exploits",
        "generic": "Generic",
        "reconnaissance": "Reconnaissance",
        "shellcode": "Shellcode",
        "worms": "Worms",
    }
    if s in mapping:
        return mapping[s], True
    return val, False  # return original if not recognized


def print_step_header(step_text):
    """Helper to print readable step headers in the console for audit trail."""
    print("\n" + "-" * 80)
    print(step_text)
    print("-" * 80)


def safe_intersect(cols, df_columns):
    """Return the intersection of a candidate list with an existing DataFrame columns."""
    return [c for c in cols if c in df_columns]


def is_binary_series(s):
    """Check if a pandas Series contains only {0,1} (ignoring NaNs)."""
    vals = pd.unique(s.dropna())
    return set(vals).issubset({0, 1})


# -------------------------------------------------------------------
# Load data
# -------------------------------------------------------------------

print_step_header("Loading data")
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"Input CSV not found at path: {CSV_PATH}")

# low_memory=False to preserve column consistency and avoid mixed dtypes
df = pd.read_csv(CSV_PATH, low_memory=False)
print(f"Loaded shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# -------------------------------------------------------------------
# 1) Schema lock and dtypes enforcement
# -------------------------------------------------------------------

print_step_header("1) Schema lock and dtypes enforcement")

# Targets: binary 'label' is required; multiclass 'attack_cat' is optional.
target_label_col = "label"
target_multiclass_col = "attack_cat"

# Ensure 'label' is present; if not, raise error because it is a key target.
if target_label_col not in df.columns:
    raise KeyError("Expected binary target column 'label' not found.")

# Enforce 'label' as pandas nullable integer Int64 (allows NA during coercion) then validate later.
df[target_label_col] = pd.to_numeric(df[target_label_col], errors="coerce").astype("Int64")

# Ensure 'attack_cat' exists; if so, cast to string for normalization.
if target_multiclass_col in df.columns:
    df[target_multiclass_col] = df[target_multiclass_col].astype("string")
else:
    print("Warning: 'attack_cat' column not found; multiclass target-related checks will be skipped.")

# Convert object-typed non-target columns to numeric if the vast majority are numeric-representable.
# This prevents string categorical columns from being damaged while ensuring numeric strings become numerics.
non_target_cols = [c for c in df.columns if c not in [target_label_col, target_multiclass_col]]
for col in non_target_cols:
    if df[col].dtype == object:
        converted = pd.to_numeric(df[col], errors="coerce")
        # Convert only if >90% of values can be parsed as numbers (heuristic to preserve true categoricals).
        if converted.notna().mean() > 0.9:
            df[col] = converted

# -------------------------------------------------------------------
# 2) Identifier removal
# -------------------------------------------------------------------

print_step_header("2) Identifier removal")
# Drop only clear row-unique identifiers; keep potential features like IPs unless explicitly excluded.
drop_id_cols = [c for c in ["id"] if c in df.columns]
if drop_id_cols:
    df.drop(columns=drop_id_cols, inplace=True)
    print(f"Dropped identifier columns: {drop_id_cols}")
else:
    print("No identifier columns found to drop.")

# -------------------------------------------------------------------
# 3) Exact duplicate handling
# -------------------------------------------------------------------

print_step_header("3) Exact duplicate handling")
before = len(df)
df = df.drop_duplicates(ignore_index=True)
after = len(df)
print(f"Removed {before - after} exact duplicate rows. New shape: {df.shape}")

# -------------------------------------------------------------------
# 4) Label–category consistency audit and repair
# -------------------------------------------------------------------

print_step_header("4) Label–category consistency audit and repair")
if target_multiclass_col in df.columns:
    # Normalize attack_cat values to canonical names if recognized by our map
    normalized_vals = df[target_multiclass_col].apply(normalize_attack_cat)
    df[target_multiclass_col] = normalized_vals.apply(lambda x: x[0])
    recognized_mask = normalized_vals.apply(lambda x: x[1])

    # Derive the expected binary label from normalized attack_cat where recognized (Normal -> 0; else -> 1).
    derived_label = pd.Series(index=df.index, dtype="Int64")
    recognized_cats = df.loc[recognized_mask, target_multiclass_col]
    derived_label.loc[recognized_mask] = (recognized_cats != "Normal").astype("Int64")

    # Identify contradictions only on rows where both a label and a recognized cat exist.
    mismatch_mask = recognized_mask & df[target_label_col].notna() & (df[target_label_col] != derived_label)
    mismatches = int(mismatch_mask.sum())

    # Fill missing labels directly from recognized attack_cat-derived labels.
    fill_from_cat_mask = recognized_mask & df[target_label_col].isna()
    fills = int(fill_from_cat_mask.sum())

    # Apply fixes (mend contradictions and fill missing labels).
    df.loc[mismatch_mask, target_label_col] = derived_label[mismatch_mask]
    df.loc[fill_from_cat_mask, target_label_col] = derived_label[fill_from_cat_mask]

    print(f"Normalized recognized attack_cat values: {int(recognized_mask.sum())} rows.")
    print(f"Repaired label to match attack_cat in {mismatches} conflicting rows.")
    print(f"Filled missing labels from attack_cat in {fills} rows.")

# Ensure 'label' is strictly binary {0,1}; if non-binary values remain, coerce and enforce.
if not is_binary_series(df[target_label_col].astype("float").fillna(-1)):
    print("Warning: Non-binary values detected in 'label'. Coercing positive->1, zero/negative->0.")
    df[target_label_col] = (pd.to_numeric(df[target_label_col], errors="coerce").fillna(0) > 0).astype("Int64")

# Drop rows where label is still NaN or not in {0,1} after coercion to guarantee clean target.
valid_label_mask = df[target_label_col].isin([0, 1])
dropped_invalid_label = int((~valid_label_mask).sum())
df = df.loc[valid_label_mask].reset_index(drop=True)
if dropped_invalid_label > 0:
    print(f"Dropped {dropped_invalid_label} rows with invalid 'label'. New shape: {df.shape}")

# -------------------------------------------------------------------
# 5) Domain validity checks and hard constraints
# -------------------------------------------------------------------

print_step_header("5) Domain validity checks and hard constraints")

# Define domain rules for numeric columns with known physical constraints (non-negative, bounded)
non_negative_cols = [
    "dur","sbytes","dbytes","spkts","dpkts",
    "sload","dload","rate",
    "sinpkt","dinpkt","sjit","djit",
    "synack","ackdat","tcprtt",
    "response_body_len",
    "smean","dmean","smeansz","dmeansz",
    "stcpb","dtcpb",
    "trans_depth",
    "sttl","dttl","swin","dwin",
]

# TTL plausible bounds
ttl_cols = ["sttl", "dttl"]
ttl_min, ttl_max = 0, 255

# Known small-domain flag/category columns (will be one-hot encoded later)
small_cat_cols = ["ct_state_ttl", "ct_flw_http_mthd", "is_sm_ips_ports", "is_ftp_login", "ct_ftp_cmd"]

# Enforce numeric dtypes for numeric domain columns when present
present_nonneg_cols = safe_intersect(non_negative_cols, df.columns)
for col in present_nonneg_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Non-negativity hard filter: drop any row with a negative value across constrained columns
neg_mask_any = pd.Series(False, index=df.index)
for col in present_nonneg_cols:
    neg_mask = df[col] < 0
    neg_mask_any = neg_mask_any | (neg_mask.fillna(False))

neg_count = int(neg_mask_any.sum())
if neg_count > 0:
    print(f"Filtering out {neg_count} rows with negative values in non-negative constrained columns.")
    df = df.loc[~neg_mask_any].reset_index(drop=True)

# TTL hard range check [0, 255]; drop rows outside bounds
present_ttl_cols = safe_intersect(ttl_cols, df.columns)
if present_ttl_cols:
    out_of_range_mask_any = pd.Series(False, index=df.index)
    for col in present_ttl_cols:
        out_of_range_mask = ~df[col].between(ttl_min, ttl_max)
        out_of_range_mask_any = out_of_range_mask_any | (out_of_range_mask.fillna(False))
    out_range_count = int(out_of_range_mask_any.sum())
    if out_range_count > 0:
        print(f"Filtering out {out_range_count} rows with TTL out of [{ttl_min},{ttl_max}].")
        df = df.loc[~out_of_range_mask_any].reset_index(drop=True)

# Binary domain enforcement for is_sm_ips_ports if present (must be {0,1}); drop invalid.
if "is_sm_ips_ports" in df.columns:
    df["is_sm_ips_ports"] = pd.to_numeric(df["is_sm_ips_ports"], errors="coerce").astype("Int64")
    valid_binary = df["is_sm_ips_ports"].isin([0, 1])
    invalid_binary_count = int((~valid_binary).sum())
    if invalid_binary_count > 0:
        print(f"Dropping {invalid_binary_count} rows with invalid 'is_sm_ips_ports' values (expect 0/1).")
        df = df.loc[valid_binary].reset_index(drop=True)

# Ensure we removed any NaNs introduced so far in strict columns (domain-constrained numeric fields)
strict_cols = present_nonneg_cols + present_ttl_cols + (["is_sm_ips_ports"] if "is_sm_ips_ports" in df.columns else [])
if strict_cols:
    nan_mask_any = df[strict_cols].isna().any(axis=1)
    nan_count = int(nan_mask_any.sum())
    if nan_count > 0:
        print(f"Dropping {nan_count} rows with NaN in strict domain columns after enforcement.")
        df = df.loc[~nan_mask_any].reset_index(drop=True)

print(f"Post domain checks shape: {df.shape}")

# -------------------------------------------------------------------
# 6) Continuous feature skew mitigation (log1p)
# -------------------------------------------------------------------

print_step_header("6) Continuous feature skew mitigation (log1p)")

# Columns suited for log1p (only if present and non-negative by design)
# These are often long-tailed in network traffic; log1p compresses scale and preserves zeros.
log1p_candidates = [
    "dur","rate","sload","dload",
    "sbytes","dbytes","spkts","dpkts",
    "sinpkt","dinpkt","sjit","djit",
    "synack","ackdat","tcprtt",
    "response_body_len",
    "smean","dmean","smeansz","dmeansz",
]

log1p_cols = safe_intersect(log1p_candidates, df.columns)

# Apply safe log1p transformation in-place (shifts up if any residual negatives appear unexpectedly)
for col in log1p_cols:
    min_val = df[col].min()
    if pd.notna(min_val) and min_val < 0:
        shift = abs(min_val)
        print(f"Warning: {col} has negative values after domain checks. Shifting by {shift} before log1p.")
        df[col] = df[col] + shift
    df[col] = np.log1p(df[col].astype(float))

print(f"Applied log1p to {len(log1p_cols)} columns: {log1p_cols}")

# -------------------------------------------------------------------
# 7) Extreme outlier capping (winsorization) on log-transformed features
# -------------------------------------------------------------------

print_step_header("7) Extreme outlier capping (winsorization)")
# Winsorization post-log stabilizes extremes while preserving relative ranks.
winsorize_candidates = [
    "rate","sload","dload","sinpkt","dinpkt","sjit","djit",
    "synack","ackdat","tcprtt","dur","sbytes","dbytes"
]
winsor_cols = safe_intersect(winsorize_candidates, df.columns)

for col in winsor_cols:
    lo = df[col].quantile(LOW_Q)
    hi = df[col].quantile(HIGH_Q)
    if pd.isna(lo) or pd.isna(hi):
        continue
    if lo > hi:
        lo, hi = hi, lo
    df[col] = df[col].clip(lower=lo, upper=hi)

print(f"Winsorized {len(winsor_cols)} columns at [{LOW_Q*100:.1f}%, {HIGH_Q*100:.1f}%] quantiles.")

# -------------------------------------------------------------------
# 10) Zero-inflation indicators for structural zeros (do BEFORE feature dropping)
# NOTE (intentional reordering): We place step 10 before step 8 to preserve zero-pattern information for features
# that are subsequently dropped for multicollinearity (e.g., tcprtt, rate, loads).
# -------------------------------------------------------------------

print_step_header("10) Zero-inflation indicators for structural zeros")

zero_indicator_candidates = [
    "dur","sinpkt","dinpkt","sjit","djit","synack","ackdat","tcprtt"
    # We omit rate/sload/dload indicators because they will be dropped for multicollinearity
]
present_zero_cols = safe_intersect(zero_indicator_candidates, df.columns)

zero_ind_cols = []
for col in present_zero_cols:
    ind_col = f"is_zero__{col}"
    # After log1p, a value equal to 0 implies the original value was exactly 0.
    zero_ind = (df[col] == 0).astype("int8")
    df[ind_col] = zero_ind
    zero_ind_cols.append(ind_col)

print(f"Added {len(zero_ind_cols)} zero-indicator columns: {zero_ind_cols}")

# -------------------------------------------------------------------
# 8) Multicollinearity control among derived features
# -------------------------------------------------------------------

print_step_header("8) Multicollinearity control among derived features")

# Drop highly collinear or redundant features (e.g., TCP RTT variants and rates) to reduce leakage risk
# and simplify models. We already captured zero-patterns for some of these above.
drop_for_collinearity = ["tcprtt", "rate", "sload", "dload", "stcpb", "dtcpb"]
drop_existing = safe_intersect(drop_for_collinearity, df.columns)
df.drop(columns=drop_existing, inplace=True, errors="ignore")
print(f"Dropped for multicollinearity (present): {drop_existing}")

# -------------------------------------------------------------------
# 9) Categorical encoding for protocol/state flags (+ step 12 rare cat handling inside)
# -------------------------------------------------------------------

print_step_header("9) Categorical encoding for selected small-domain features (+ rare handling)")

# Selected known small-domain categoricals; will be one-hot encoded with rare category pooling.
ohe_base_cols = ["ct_state_ttl", "ct_flw_http_mthd", "is_sm_ips_ports", "is_ftp_login", "ct_ftp_cmd"]
ohe_cols_present = safe_intersect(ohe_base_cols, df.columns)

created_dummies = []
for col in ohe_cols_present:
    # Treat values as strings for robust one-hot encoding; ensure missing becomes explicit "Unknown"
    col_as_str = df[col].astype("Int64") if pd.api.types.is_numeric_dtype(df[col]) else df[col].astype("string")
    col_as_str = col_as_str.astype("string").fillna("Unknown")

    # Rare category pooling: any category appearing fewer than threshold rows becomes "Other"
    vc = col_as_str.value_counts(dropna=False)
    threshold = max(2, int(math.floor(RARE_CAT_THRESHOLD * len(df))))  # at least 2 rows to keep as unique
    rare_cats = set(vc[vc < threshold].index)
    pooled = col_as_str.where(~col_as_str.isin(rare_cats), other="Other")

    # Create one-hot columns with deterministic naming
    dummies = pd.get_dummies(pooled, prefix=col, prefix_sep="=", dtype=np.uint8)
    df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
    created_dummies.extend(list(dummies.columns))

print(f"One-hot encoded columns: {ohe_cols_present}")
print(f"Created {len(created_dummies)} dummy columns.")

# -------------------------------------------------------------------
# 11) Scaling of continuous features (RobustScaler)
# - Scale only continuous numeric features (exclude targets, OHE binaries, and zero-indicator binaries)
# -------------------------------------------------------------------

print_step_header("11) Scaling of continuous features (RobustScaler)")

# Identify numeric columns post-encoding (includes label, zero indicators, and OHE dummies)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Build the exclusion set:
exclude_from_scaling = set()
# Targets
exclude_from_scaling.add(target_label_col)
# Zero indicators (binary features we don't scale)
exclude_from_scaling.update(zero_ind_cols)
# OHE dummy columns (uint8 dummies created in step 9)
exclude_from_scaling.update(created_dummies)

# Final list of continuous columns to scale
scale_cols = [c for c in numeric_cols if c not in exclude_from_scaling]

if scale_cols:
    scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0))
    df[scale_cols] = scaler.fit_transform(df[scale_cols].astype(float))
    print(f"Scaled {len(scale_cols)} continuous features with RobustScaler.")
else:
    print("No continuous columns identified for scaling.")

# -------------------------------------------------------------------
# 12) Rare category handling (handled in step 9 via pooling before encoding)
# -------------------------------------------------------------------
print_step_header("12) Rare category handling")
print("Rare category pooling performed during step 9 before one-hot encoding.")

# -------------------------------------------------------------------
# 13) Class distribution profiling and artifact check (no modeling)
# -------------------------------------------------------------------

print_step_header("13) Class distribution profiling and artifact check")

# Label distribution
label_counts = df[target_label_col].value_counts(dropna=False)
print("Label distribution:")
print(label_counts.to_string())

# Multiclass distribution if available
if target_multiclass_col in df.columns:
    print("\nattack_cat distribution (top 20):")
    print(df[target_multiclass_col].value_counts(dropna=False).head(20).to_string())

# Quick leakage-like check: high correlation with label for numeric features
print("\nTop 10 absolute correlations with label (numeric features):")
num_for_corr = [c for c in numeric_cols if c != target_label_col and c in df.columns]
if num_for_corr:
    corr = df[num_for_corr].corrwith(df[target_label_col].astype(float)).abs().sort_values(ascending=False)
    print(corr.head(10).to_string())
else:
    print("No numeric features available for correlation analysis.")

# -------------------------------------------------------------------
# 14) Final integrity and export of clean feature matrix
# -------------------------------------------------------------------

print_step_header("14) Final integrity and export of clean feature matrix")

# Replace inf values (should be none) as a safeguard, then check for NaNs
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# VALIDATED CHANGE:
# Previously, code dropped rows if ANY column had NaN (including unused string columns).
# We now enforce NaN-free condition only across numeric columns (model features and targets),
# which aligns with the step's intent and avoids unnecessary row loss.
numeric_cols_final = df.select_dtypes(include=[np.number]).columns.tolist()
nan_mask_numeric = df[numeric_cols_final].isna().any(axis=1)
nan_rows_numeric = int(nan_mask_numeric.sum())
if nan_rows_numeric > 0:
    print(f"Warning: Found {nan_rows_numeric} rows with NaNs in numeric columns after processing; dropping those rows.")
    df = df.loc[~nan_mask_numeric].reset_index(drop=True)

print(f"Final preprocessed shape: {df.shape}")

# Expose the final preprocessed DataFrame for downstream usage (single fully-prepared matrix)
df_preprocessed = df.copy()

# Attempt to save to disk (optional)
try:
    out_dir = str(Path(CSV_PATH).parent)
    out_path = os.path.join(out_dir, OUTPUT_FILENAME)
    df_preprocessed.to_csv(out_path, index=False)
    print(f"Saved preprocessed dataset to: {out_path}")
except Exception as e:
    print(f"Could not save preprocessed CSV due to: {e}")

# For interactive sessions, you can inspect
# print(df_preprocessed.head())
```


--- Prompt Chain Complete ---
Exported chain artifacts to: C:\Users\macdo\Github\VNFCDR-1\Chris Code\Generative AI Data Cleaning\Machine Learning\Step 4_GPT Prompting\UNSW_NB15_Train_Test_Concatenated\GPT_API_Exploratory_Analysis_Export.md

=== Diagnostics Summary ===
Model used: gpt-5
Artifacts generated: ['analysis_md', 'plan_md', 'code_md']


In [1]:
# Post Cleaning Data Quality Analysis

import os, io, time, traceback
import pandas as pd
from dotenv import load_dotenv
from IPython.display import display, Markdown
from typing import Dict, List
from openai import OpenAI

# 1. Environment / API Setup --------------------------------------------------
# Expect an env file containing OPENAI_API_KEY
#ENV_PATH = r"/Users/sarahsetiawan/Desktop/VNFCDR-1/SarahCode/Generative_AI/OPENAI_API_KEY.env"  # <-- update if needed
ENV_PATH = r"C:\\Users\\macdo\\Github\\VNFCDR-1\\Chris Code\\Generative AI Data Cleaning\\Machine Learning\\Step 4_GPT Prompting\\OPENAI_API_KEY.env"
load_dotenv(ENV_PATH)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not found. Ensure the env file exists and key is set.")

# Instantiate OpenAI client (no custom base_url needed for official API)
client = OpenAI(api_key=api_key)
print("OpenAI API client configured successfully.")

# 2. Data Load ----------------------------------------------------------------
#DATA_PATH = r"/Users/sarahsetiawan/Desktop/VNFCDR-1/SarahCode/Sample_df/Code/MachineLearning/SampleData_API/CSVs/Representative_APISample_20000_2.csv"
DATA_PATH = r"C:\\Users\\macdo\\Github\\VNFCDR-1\\Chris Code\\Generative AI Data Cleaning\\Machine Learning\\Step 4_GPT Prompting\\UNSW_NB15_Train_Test_Concatenated\\cleaned_data.csv"
try:
    df = pd.read_csv(DATA_PATH, low_memory=False)
except FileNotFoundError:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")
print(f"Dataset loaded successfully. Shape: {df.shape}")

# 3. Context Preparation (Token-Optimized) ------------------------------------

def build_schema_summary(dataframe: pd.DataFrame) -> str:
    info_buf = io.StringIO()
    dataframe.info(buf=info_buf)
    _ = info_buf.getvalue()  # not used directly; we generate concise version below
    rows = []
    total = len(dataframe)
    for col in dataframe.columns:
        non_null = dataframe[col].notna().sum()
        null_pct = 100 * (1 - non_null / total)
        uniq = dataframe[col].nunique(dropna=True)
        rows.append(f"{col} | {dataframe[col].dtype} | {non_null} | {null_pct:.2f}% | {uniq}")
    header = "Column | DType | Non-Null | Null% | Unique\n------ | ----- | -------- | ----- | ------"
    concise = header + "\n" + "\n".join(rows)
    return concise

RANDOM_SEED = 42
SAMPLE_ROWS = 10
schema_concise = build_schema_summary(df)
sample_md = df.sample(n=SAMPLE_ROWS, random_state=RANDOM_SEED).to_markdown(index=False)

# 4. System Instruction (kept same semantic intent) ---------------------------
system_instruction = """
You are an expert data scientist specializing in data cleaning and preparation for machine learning. 
Your task is to Generate a data quality analysis containing accuracy, completeness, consistency, uniqueness, validity, and timeliness on an imported file. 

Goals:
1. Perform a data quality analysis on a given dataset.

Constraints:
- Use Markdown headings exactly as requested.
"""

# 5. Generation Config -------------------------------------------------------
MODEL_NAME = "gpt-5"  
MAX_COMPLETION_TOKENS = 32000  
TEMPERATURE = 1  

# 6. Retry + Wrapper (Chat Completions) --------------------------------------

def call_model(messages: List[Dict[str, str]], max_completion_tokens: int = MAX_COMPLETION_TOKENS, temperature: float = TEMPERATURE):
    """Wrapper for OpenAI chat completion.

    NOTE: Newer reasoning / frontier models use 'max_completion_tokens' instead of deprecated 'max_tokens'.
    Some models enforce a fixed temperature (1); earlier attempt with 0.15 caused 400 error.
    """
    return client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=temperature,
        max_completion_tokens=max_completion_tokens,
        stream=False
    )

def generate_with_retry(prompt_text: str, label: str, max_retries: int = 3, backoff: float = 2.0):
    errors: List[str] = []
    for attempt in range(1, max_retries + 1):
        try:
            start = time.time()
            # Compose messages with system + user
            messages = [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": prompt_text}
            ]
            response = call_model(messages)
            elapsed = time.time() - start
            text = response.choices[0].message.content.strip() if response.choices else ""
            if not text:
                raise ValueError("Empty response content.")
            print(f"[{label}] Success attempt={attempt} time={elapsed:.2f}s")
            return dict(text=text, raw=response, attempts=attempt, errors=errors)
        except Exception as e:
            err_msg = f"[{label}] Attempt {attempt} failed: {e}"
            print(err_msg)
            errors.append(err_msg)
            if attempt < max_retries:
                sleep_time = backoff ** (attempt - 1)
                time.sleep(sleep_time)
    print(f"[{label}] All attempts failed.")
    return dict(text="", raw=None, attempts=max_retries, errors=errors)

# 7. Prompts (EXACT TEXT PRESERVED from previous implementation) --------------
prompt_1_analysis = f"""
## Task 1: Data Quality Analysis

You are given:
### Concise Schema Summary
```
{schema_concise}
```

### Random Sample ({SAMPLE_ROWS} rows)
```
{sample_md}
```

### Directives
1. Generate a data quality analysis containing accuracy, completeness, consistency, uniqueness, validity, and timeliness on the impored file at {DATA_PATH}.

Output ONLY under heading:
## Analytical Insights
Use concise Markdown sections & tables. Avoid code.
"""

# 8. Execution Chain ---------------------------------------------------------
chain_artifacts: Dict[str, str] = {}

print("--- Chain Step 1: Initial Analysis ---")
analysis_result = generate_with_retry(prompt_1_analysis, label="analysis")

if not analysis_result["text"]:
    print("Aborting chain: analysis stage failed.")
else:
    chain_artifacts["analysis_md"] = analysis_result["text"]
    display(Markdown("### Analysis Received"))
    display(Markdown(chain_artifacts["analysis_md"]))

# 9. Export Artifacts --------------------------------------------------------
EXPORT_DIR = os.path.dirname(DATA_PATH)
export_path = "C:\\Users\\macdo\\Github\\VNFCDR-1\\Chris Code\\Generative AI Data Cleaning\\Machine Learning\\Step 4_GPT Prompting\\UNSW_NB15_Train_Test_Concatenated\\GPT_API_Exploratory_Analysis_Export.md"
try:
    with open(export_path, "w", encoding="utf-8") as f:
        if chain_artifacts:
            f.write("# EDA Prompt Chain Output (GPT)\n\n")
            for k, v in chain_artifacts.items():
                pretty = k.replace("_md", "").capitalize()
                f.write(f"\n\n## {pretty}\n\n")
                f.write(v.strip() + "\n")
        else:
            f.write("No artifacts generated (chain failed).")
    print(f"Exported chain artifacts to: {export_path}")
except Exception as e:
    print(f"Export failed: {e}")

# 10. Diagnostic Recap -------------------------------------------------------

def print_diagnostics():
    print("\n=== Diagnostics Summary ===")
    print(f"Model used: {MODEL_NAME}")
    if analysis_result.get('errors'):
        print(f"Analysis errors: {len(analysis_result['errors'])}")
    if 'plan_result' in locals() and plan_result.get('errors'):
        print(f"Plan errors: {len(plan_result['errors'])}")
    if 'code_result' in locals() and code_result.get('errors'):
        print(f"Code errors: {len(code_result['errors'])}")
    print("Artifacts generated:", list(chain_artifacts.keys()))

print_diagnostics()

OpenAI API client configured successfully.
Dataset loaded successfully. Shape: (160474, 56)
--- Chain Step 1: Initial Analysis ---
[analysis] Success attempt=1 time=93.47s


### Analysis Received

## Analytical Insights

### Dataset Snapshot
- Source: C:\Users\macdo\Github\VNFCDR-1\Chris Code\Generative AI Data Cleaning\Machine Learning\Step 4_GPT Prompting\UNSW_NB15_Train_Test_Concatenated\cleaned_data.csv
- Shape: 160,474 rows × 56 columns
- Types: 28 float64, 27 int64, 1 object
- Nulls: 0% across all columns (per schema)

### Completeness
- Overall completeness: 100% non-null across all columns.
- Zero-variance columns (uninformative): 
  - is_zero__dur (unique=1; always 0)
  - is_zero__sinpkt (unique=1; always 0)
- Note: All one-hot columns are present; each shows 2 unique values (0/1), indicating no missing categories at schema level.

Recommended actions:
- Drop zero-variance columns: is_zero__dur, is_zero__sinpkt.
- Confirm no hidden NaNs (e.g., inf, -inf) in floats.

### Consistency
Key cross-field rules and observations:
- attack_cat ↔ label mapping: In sample, Normal → 0; attacks → 1. Consistent in sample; verify globally.
- One-hot exclusivity (exactly one 1 per group):
  - ct_state_ttl=[0,1,2,3,6,Other]
  - ct_flw_http_mthd=[0,1,4,Other]
  - is_ftp_login=[0,1,Other]
  - ct_ftp_cmd=[0,1,Other]
  Observed consistent in sample (sums to 1); verify dataset-wide.
- Derived relationships:
  - is_zero__{synack, ackdat, tcprtt} should reflect synack, ackdat, and synack+ackdat respectively. Sample rows appear coherent; verify systematically.
- Train/test concatenation: ensure no data leakage in downstream modeling (e.g., re-split by time/session if applicable).

Recommended actions:
- Enforce one-hot group sum-to-one constraints; flag violations.
- Verify label consistency: label = 0 iff attack_cat == "Normal", else 1.
- Validate is_zero__tcprtt equals 1 iff synack+ackdat == 0 (within numeric tolerance).

### Validity
Schema- and domain-level checks:
- Data types:
  - attack_cat: object with 10 categories (expected for UNSW-NB15: Normal, Generic, Exploits, Fuzzers, DoS, Reconnaissance, Analysis, Backdoor, Shellcode, Worms).
  - Binary/one-hot columns are int64 with values in {0,1} (unique=2), as expected.
- Scaled features:
  - Many originally count/time fields are float64 with negative values (standardization/robust scaling). Negative scaled values are valid; not raw units.
  - sttl, dttl show low cardinality (13, 9 unique) consistent with limited TTL variants.
- Outliers/heavy tails:
  - Example: sloss shows an extreme value (46) in sample while many rows cluster near small scaled values, indicating heavy-tailed distributions. This is plausible but warrants monitoring.

Recommended actions:
- Enumerate and validate attack_cat against the expected 10-class set; correct any stray/typo categories.
- For binary fields, assert set membership {0,1}; coerce or flag any deviations.
- Inspect extreme values and consider robust methods (winsorization/capping) if modeling sensitivity is high.
- Document scaling pipeline (fitted stats, version) to ensure reproducibility.

### Uniqueness
- Column-level uniqueness:
  - No natural key column; most continuous features have high cardinality (e.g., sjit 116,392 unique), making exact row duplication unlikely.
- Row-level duplicates: Not measurable from provided summary.

Recommended actions:
- Compute exact duplicate row rate and near-duplicate rate (hash rows or distance-based checks); drop or consolidate if >0.
- If session/flow identifiers exist upstream, consider retaining them to support deduplication and lineage.

### Accuracy
- Ground-truth validation: Not directly assessable without external references or raw units.
- Internal plausibility:
  - attack_cat ↔ label mapping looks correct in the sample.
  - is_zero__ flags appear directionally consistent with their parent variables in the sample.
  - Some features that represent counts/times are scaled; negative values are therefore not accuracy errors.

Recommended actions:
- Back-check a sample against raw, unscaled data (if retained) to confirm scaling correctness and is_zero__ flags.
- Quantify rate of any rule violations (e.g., one-hot sums, tcprtt construction) as a proxy for internal accuracy.

### Timeliness
- No timestamp or period-of-collection fields in schema; dataset recency and freshness cannot be assessed.
- File represents concatenated train/test cleaned data; suitable for modeling but not for streaming freshness guarantees.

Recommended actions:
- If timeliness matters, include or join acquisition timestamps, and track file modification times and data versions.
- Define acceptable data latency SLAs for future refreshes.

Exported chain artifacts to: C:\Users\macdo\Github\VNFCDR-1\Chris Code\Generative AI Data Cleaning\Machine Learning\Step 4_GPT Prompting\UNSW_NB15_Train_Test_Concatenated\GPT_API_Exploratory_Analysis_Export.md

=== Diagnostics Summary ===
Model used: gpt-5
Artifacts generated: ['analysis_md']
