In [3]:
"""
Scikit-learn autoencoder for tabular data (no TensorFlow).
Trains an MLPRegressor to reconstruct the input and extracts the central hidden layer as encoded features.
"""

import os
import json
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neural_network import MLPRegressor
import joblib

# -----------------------
# USER CONFIG
# -----------------------
INPUT_CSV = "/home/mubasshir/Desktop/Research/Insurance/DataSet_Patient/Medicare_Physician_Data/CMSData_sampled.csv"  # <-- change to your CSV path
OUTPUT_DIR = "/"
RANDOM_SEED = 42
TEST_SIZE = 0.15
VAL_SIZE = 0.15
BATCH_SIZE = 256
MAX_ITER = 400
LATENT_DIM = 16
HIDDEN_UNITS_ENCODER = [128, 64]  # encoder side; will mirror for decoder
ACTIVATION = "relu"
EARLY_STOPPING = True
VERBOSE = True
SAVE_TRANSFORMER = True
SAVE_MODEL = True

os.makedirs(OUTPUT_DIR, exist_ok=True)

# -----------------------
# Helpers
# -----------------------
def read_csv(path: str) -> pd.DataFrame:
    print(f"Loading CSV from: {path}")
    df = pd.read_csv(path)
    print(f"Loaded shape: {df.shape}")
    return df

def detect_columns(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    return numeric_cols, categorical_cols

def build_preprocessor(df: pd.DataFrame):
    num_cols, cat_cols = detect_columns(df)
    print(f"Numeric columns ({len(num_cols)}): {num_cols}")
    print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="__MISSING__")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ], remainder="drop", sparse_threshold=0)

    return preprocessor, num_cols, cat_cols

def symmetric_hidden_layers(encoder_units: List[int], latent_dim: int) -> Tuple[Tuple[int,...], int]:
    left = list(encoder_units)
    right = list(reversed(encoder_units))
    hidden = tuple(left + [latent_dim] + right)
    latent_index = len(left)
    return hidden, latent_index

def forward_to_layer(X: np.ndarray, coefs: List[np.ndarray], intercepts: List[np.ndarray],
                     layer_idx: int, activation: str = "relu") -> np.ndarray:
    def act(x):
        if activation == "relu":
            return np.maximum(0, x)
        elif activation == "tanh":
            return np.tanh(x)
        elif activation in ("logistic", "sigmoid"):
            return 1.0 / (1.0 + np.exp(-x))
        else:
            return x
    a = X.copy()
    for i in range(layer_idx + 1):
        W = coefs[i]
        b = intercepts[i]
        a = a.dot(W) + b
        a = act(a)
    return a

# -----------------------
# Main pipeline
# -----------------------
def main():
    np.random.seed(RANDOM_SEED)

    df = read_csv(INPUT_CSV)
    if df.shape[0] < 5:
        raise SystemExit("Dataset too small to train an autoencoder.")

    # Detect and preserve probable ID columns (unique-per-row)
    possible_id_cols = [c for c in df.columns if c.lower() in ("id", "npi", "physician_id", "provider_id") or c.lower().endswith("_id")]
    id_cols = [c for c in possible_id_cols if df[c].nunique() == df.shape[0]]
    print(f"Preserving ID columns (not encoded): {id_cols}")

    df_features = df.drop(columns=id_cols, errors="ignore")

    # Split into train/test and a small validation set
    train_df, test_df = train_test_split(df_features, test_size=TEST_SIZE, random_state=RANDOM_SEED, shuffle=True)
    train_df, val_df = train_test_split(train_df, test_size=VAL_SIZE, random_state=RANDOM_SEED)

    # Build and fit preprocessor
    preprocessor, num_cols, cat_cols = build_preprocessor(train_df)
    print("Fitting preprocessor on training data...")
    preprocessor.fit(train_df)
    X_train = preprocessor.transform(train_df)
    X_val = preprocessor.transform(val_df)
    X_test = preprocessor.transform(test_df)

    X_train = np.asarray(X_train, dtype=np.float32)
    X_val = np.asarray(X_val, dtype=np.float32)
    X_test = np.asarray(X_test, dtype=np.float32)

    input_dim = X_train.shape[1]
    print(f"Transformed input dimension: {input_dim}")

    if SAVE_TRANSFORMER:
        transformer_path = os.path.join(OUTPUT_DIR, "preprocessor.joblib")
        joblib.dump(preprocessor, transformer_path)
        print(f"Saved preprocessor to {transformer_path}")

    hidden_layers, latent_layer_index = symmetric_hidden_layers(HIDDEN_UNITS_ENCODER, LATENT_DIM)
    print(f"Using hidden_layer_sizes (MLP): {hidden_layers} with latent at index {latent_layer_index}")

    mlp = MLPRegressor(hidden_layer_sizes=hidden_layers,
                       activation=ACTIVATION,
                       solver="adam",
                       alpha=1e-5,
                       batch_size=BATCH_SIZE,
                       learning_rate_init=1e-3,
                       max_iter=MAX_ITER,
                       early_stopping=EARLY_STOPPING,
                       validation_fraction=0.15,
                       n_iter_no_change=20,
                       random_state=RANDOM_SEED,
                       verbose=VERBOSE)

    print("Training MLP autoencoder (X -> X)...")
    mlp.fit(X_train, X_train)

    if SAVE_MODEL:
        model_path = os.path.join(OUTPUT_DIR, "mlp_autoencoder.joblib")
        joblib.dump(mlp, model_path)
        print(f"Saved MLP autoencoder to {model_path}")

    recon_test = mlp.predict(X_test)
    mse_per_sample = np.mean((recon_test - X_test) ** 2, axis=1)
    print(f"Test reconstruction MSE: mean={mse_per_sample.mean():.6f}, std={mse_per_sample.std():.6f}")

    test_with_ids = test_df.reset_index(drop=True).copy()
    if id_cols:
        original_train, original_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_SEED, shuffle=True)
        test_ids = original_test.reset_index(drop=True)[id_cols]
        test_with_ids = pd.concat([test_ids.reset_index(drop=True), test_with_ids.reset_index(drop=True)], axis=1)

    test_with_ids["reconstruction_mse"] = mse_per_sample
    results_path = os.path.join(OUTPUT_DIR, "test_reconstruction_errors.csv")
    test_with_ids.to_csv(results_path, index=False)
    print(f"Saved test reconstruction errors to: {results_path}")

    # Extract latent representations for entire dataset
    all_transformed = preprocessor.transform(df_features)
    all_transformed = np.asarray(all_transformed, dtype=np.float32)
    coefs = mlp.coefs_
    intercepts = mlp.intercepts_
    encoded_all = forward_to_layer(all_transformed, coefs, intercepts, layer_idx=latent_layer_index, activation=ACTIVATION)

    encoded_cols = [f"enc_{i}" for i in range(encoded_all.shape[1])]
    encoded_df = pd.DataFrame(encoded_all, columns=encoded_cols)
    if id_cols:
        encoded_df = pd.concat([df[id_cols].reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

    encoded_out_path = os.path.join(OUTPUT_DIR, "encoded_features.csv")
    encoded_df.to_csv(encoded_out_path, index=False)
    print(f"Saved encoded features to: {encoded_out_path}")

    meta = {
        "input_csv": INPUT_CSV,
        "n_rows": int(df.shape[0]),
        "n_columns_original": int(df.shape[1]),
        "numeric_cols": detect_columns(train_df)[0],
        "categorical_cols": detect_columns(train_df)[1],
        "id_cols": id_cols,
        "input_dim_after_transform": int(input_dim),
        "latent_dim": int(LATENT_DIM),
        "hidden_units_encoder": HIDDEN_UNITS_ENCODER,
        "mlp_hidden_layers": hidden_layers
    }
    with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w", encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    print("Done.")

if __name__ == "__main__":
    main()


Loading CSV from: /home/mubasshir/Desktop/Research/Insurance/DataSet_Patient/Medicare_Physician_Data/CMSData_sampled.csv


  df = pd.read_csv(path)


Loaded shape: (292663, 29)
Preserving ID columns (not encoded): []
Numeric columns (9): ['Rndrng_NPI', 'Rndrng_Prvdr_RUCA', 'Tot_Benes', 'Tot_Srvcs', 'Tot_Bene_Day_Srvcs', 'Avg_Sbmtd_Chrg', 'Avg_Mdcr_Alowd_Amt', 'Avg_Mdcr_Pymt_Amt', 'Avg_Mdcr_Stdzd_Amt']
Categorical columns (20): ['Rndrng_Prvdr_Last_Org_Name', 'Rndrng_Prvdr_First_Name', 'Rndrng_Prvdr_MI', 'Rndrng_Prvdr_Crdntls', 'Rndrng_Prvdr_Gndr', 'Rndrng_Prvdr_Ent_Cd', 'Rndrng_Prvdr_St1', 'Rndrng_Prvdr_St2', 'Rndrng_Prvdr_City', 'Rndrng_Prvdr_State_Abrvtn', 'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_RUCA_Desc', 'Rndrng_Prvdr_Cntry', 'Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Mdcr_Prtcptg_Ind', 'HCPCS_Cd', 'HCPCS_Desc', 'HCPCS_Drug_Ind', 'Place_Of_Srvc']


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'