In [None]:
# =========================================================
# FINAL MODEL + FASTAPI 
# =========================================================

from mlflow.tracking import MlflowClient
import mlflow
import pandas as pd
import joblib
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from custom_transformers import Log1pTransformer
from pydantic import BaseModel

# ---------------------------------------------------------
# 0. Load modeling dataset and create X_train / y_train
# ---------------------------------------------------------
df = pd.read_parquet("modeling_dataset_clean.parquet")
print("‚úÖ Loaded modeling_dataset_clean.parquet")
print("Columns:", df.columns.tolist())

# Target detection heuristic: a column counts as "binary" when every non-null
# value in it is 0 or 1. If exactly one such column exists it is taken as the
# target; in every other case the last column of the frame is used instead.
binary_cols = [
    name
    for name in df.columns
    if df[name].dropna().isin([0, 1]).all()
]
target_col = binary_cols[0] if len(binary_cols) == 1 else df.columns[-1]

print(f"‚úÖ Detected target column: {target_col}")

# Separate the label from the predictors.
y = df[target_col]
X = df.drop(columns=target_col)

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("‚úÖ X_train shape:", X_train.shape)
print("‚úÖ y_train shape:", y_train.shape)

# ---------------------------------------------------------
# 1. Connect to MLflow and find best run
# ---------------------------------------------------------
mlflow.set_tracking_uri("https://dagshub.com/menna1996/avoidable_ed_ml_project.mlflow")
client = MlflowClient()

experiment_name = "Avoidable_ED_Project_Experiments"
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    # An unknown experiment name yields None rather than an exception; fail
    # here with a clear message instead of an AttributeError on the next line.
    raise ValueError(f"MLflow experiment not found: {experiment_name!r}")

runs = client.search_runs(experiment_ids=[experiment.experiment_id])

# Only runs that logged the selection metric ("test_f1") are candidates.
# The run name falls back to the run id when the name tag is absent.
records = [
    {
        "run_id": run.info.run_id,
        "run_name": run.data.tags.get("mlflow.runName", run.info.run_id),
        "test_f1": run.data.metrics["test_f1"],
    }
    for run in runs
    if run.data.metrics.get("test_f1") is not None
]
if not records:
    # An empty frame has no "test_f1" column, so sorting it would raise an
    # unhelpful KeyError; report the actual problem instead.
    raise ValueError(
        f"No runs with a 'test_f1' metric in experiment {experiment_name!r}"
    )

# Highest test F1 first; the top row is the run whose settings are reused.
df_runs = pd.DataFrame(records).sort_values("test_f1", ascending=False)
best_run = df_runs.iloc[0]

best_run_id = best_run["run_id"]
best_run_name = best_run["run_name"]

print(" Best run:", best_run_name)

# ---------------------------------------------------------
# 2. Extract hyperparameters
# ---------------------------------------------------------
def _parse_logged_param(value):
    """Convert a parameter string read from MLflow back into a Python value.

    Handles "None", "True"/"False", integers and floats; anything else
    (e.g. "sqrt", "gini") is returned unchanged as a string.
    """
    if value == "None":
        return None
    if value in ("True", "False"):
        return value == "True"
    # Try int before float so that "100" becomes 100 rather than 100.0.
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            continue
    return value


run_data = client.get_run(best_run_id)
raw_params = run_data.data.params

# Only genuine RandomForestClassifier arguments may be forwarded to the
# constructor; any other logged parameter would make it raise TypeError.
rf_param_names = set(RandomForestClassifier().get_params())
# These are passed explicitly where the final model is built, so forwarding
# them a second time would raise "got multiple values for keyword argument".
fixed_in_final_model = {"class_weight", "n_jobs", "random_state"}

clean_params = {}
for key, value in raw_params.items():
    # Parameters logged from a pipeline carry the "clf__" step prefix.
    if key.startswith("clf__"):
        key = key[len("clf__"):]
    if key not in rf_param_names or key in fixed_in_final_model:
        continue
    clean_params[key] = _parse_logged_param(value)

print("‚úÖ Cleaned params:", clean_params)

# ---------------------------------------------------------
# 3. EXACT VARIABLES
# ---------------------------------------------------------
# Columns sent through the numeric branch of the preprocessor. The
# body-system indicator columns all share the "bodysystem_" prefix, so only
# their suffixes are spelled out; the resulting order is significant and must
# not be changed.
numeric_features = [
    "AGE_AT_END_REF_YR",
    "total_paid_amt",
    "primary_dx_chronic_flag",
    *(
        f"bodysystem_{suffix}"
        for suffix in (
            "respiratory",
            "circulatory",
            "infectious",
            "digestive",
            "mentalbehavioral",
            "musculoskeletal",
            "neoplasms",
            "nervoussystem",
            "injurypoisoning",
            "skin",
            "genitourinary",
            "endocrine",
            "bloodimmune",
            "symptoms",
            "externalcauses",
            "congenital",
            "perinatal",
            "pregnancy",
            "dental",
            "eye",
            "ear",
            "healthstatus",
            "unacceptable",
        )
    ),
]

# Columns sent through the one-hot branch of the preprocessor.
categorical_features = ["SEX_IDENT_CD", "BENE_RACE_CD", "YEAR"]

# Stop early if the dataset lacks any column the model is configured to use.
missing_in_data = [
    col
    for col in (*numeric_features, *categorical_features)
    if col not in X_train.columns
]
if missing_in_data:
    print("ERROR: Missing features:", missing_in_data)
    raise ValueError("Feature mismatch between experiment and modeling_dataset_clean.parquet")

# ---------------------------------------------------------
# 4. preprocessing pipeline
# ---------------------------------------------------------
# Numeric branch: the project's Log1pTransformer followed by standard scaling.
numeric_transformer = Pipeline([
    ("log1p", Log1pTransformer()),
    ("scaler", StandardScaler()),
])

# Categorical branch: one-hot encoding, configured not to raise on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each feature list to its branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# ---------------------------------------------------------
# 5. final model
# ---------------------------------------------------------
# Hyperparameters recovered from the best run, plus the fixed settings
# (balanced class weights, all cores, fixed seed).
rf_final = RandomForestClassifier(
    **clean_params,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced",
)

# Preprocessing and classifier are bundled so the saved artifact accepts the
# raw feature columns directly.
final_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("clf", rf_final),
])

# ---------------------------------------------------------
# 6. Fit final model
# ---------------------------------------------------------
final_pipeline.fit(X_train, y_train)
print("‚úÖ Final model fitted.")

# ---------------------------------------------------------
# 7. Save final model
# ---------------------------------------------------------
# The whole fitted pipeline is persisted; the generated API loads this file.
joblib.dump(final_pipeline, "final_avoidable_ed_model.joblib")
print("‚úÖ Saved final_avoidable_ed_model.joblib")

# ---------------------------------------------------------
# 8. Extract input features for FastAPI
# ---------------------------------------------------------
input_features = preprocessor.feature_names_in_
print("‚úÖ Input features:", list(input_features))

# ---------------------------------------------------------
# 9. Generate FastAPI schema
# ---------------------------------------------------------
def _training_dtype(column):
    """Return the dtype of *column* in the training frame."""
    dtype = X_train[column].dtype
    # NOTE(review): for a pandas categorical the encoder presumably sees the
    # underlying category values, so their dtype is used -- confirm if the
    # parquet file ever stores these columns as categoricals.
    if isinstance(dtype, pd.CategoricalDtype):
        dtype = dtype.categories.dtype
    return dtype


def _schema_type(dtype):
    """Map a pandas dtype to the Python type name used in the request schema."""
    if pd.api.types.is_bool_dtype(dtype):
        return "bool"
    if pd.api.types.is_integer_dtype(dtype):
        return "int"
    if pd.api.types.is_float_dtype(dtype):
        return "float"
    return "str"


# The one-hot encoder matches categories by value, so the categorical inputs
# must reach the model with the same dtype they had during fit. Previously
# every field was declared as float and the API then cast the categorical
# columns to str (e.g. "1.0"), which is never done at training time; with
# handle_unknown="ignore" such mismatches are dropped without any error.
categorical_dtypes = {
    col: str(_training_dtype(col)) for col in categorical_features
}


def _field_type(feature):
    """Return the schema type for one input feature.

    Categorical features follow their training dtype; every other feature
    stays a float, as before.
    """
    if feature in categorical_dtypes:
        return _schema_type(_training_dtype(feature))
    return "float"


fields = "\n".join(
    f"    {feat}: {_field_type(feat)}" for feat in input_features
)

schema_text = f"""
class EDVisitInput(BaseModel):
{fields}
"""

print(schema_text)

# ---------------------------------------------------------
# 10. Generate main.py 
# ---------------------------------------------------------
# Doubled braces below are literal braces in the generated file; single
# braces are substituted now. The generated app no longer stringifies the
# categorical columns or fills gaps with the string "missing" (which would
# have put text into numeric columns); it casts the categorical columns to
# their training dtypes instead.
main_py = f"""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import joblib
import pandas as pd

model = joblib.load("final_avoidable_ed_model.joblib")

# Training-time dtypes of the categorical inputs. Requests are cast to these
# so the encoder receives the same kind of values it was fitted on.
CATEGORICAL_DTYPES = {categorical_dtypes!r}

class EDVisitInput(BaseModel):
{fields}

app = FastAPI()

@app.get("/")
def root():
    return {{"message": "Avoidable ED Prediction API"}}

@app.post("/predict")
def predict(data: EDVisitInput):
    try:
        # model_dump() is the pydantic v2 spelling; dict() covers v1.
        payload = data.model_dump() if hasattr(data, "model_dump") else data.dict()
        df = pd.DataFrame([payload])
        print("Received input:", df.to_dict(orient="records")[0])

        df = df.astype(CATEGORICAL_DTYPES)

        pred = model.predict(df)[0]
        prob = model.predict_proba(df)[0][1]

        print("Prediction:", pred, "Probability:", prob)
        return {{"prediction": int(pred), "probability": float(prob)}}

    except Exception as e:
        print("Prediction error:", str(e))
        raise HTTPException(status_code=500, detail="Prediction failed")
"""

with open("main.py", "w", encoding="utf-8") as f:
    f.write(main_py)

print("‚úÖ main.py generated.")


‚úÖ Loaded modeling_dataset_clean.parquet
Columns: ['AGE_AT_END_REF_YR', 'SEX_IDENT_CD', 'BENE_RACE_CD', 'YEAR', 'total_paid_amt', 'primary_dx_chronic_flag', 'bodysystem_respiratory', 'bodysystem_circulatory', 'bodysystem_infectious', 'bodysystem_digestive', 'bodysystem_mentalbehavioral', 'bodysystem_musculoskeletal', 'bodysystem_neoplasms', 'bodysystem_nervoussystem', 'bodysystem_injurypoisoning', 'bodysystem_skin', 'bodysystem_genitourinary', 'bodysystem_endocrine', 'bodysystem_bloodimmune', 'bodysystem_symptoms', 'bodysystem_externalcauses', 'bodysystem_congenital', 'bodysystem_perinatal', 'bodysystem_pregnancy', 'bodysystem_dental', 'bodysystem_eye', 'bodysystem_ear', 'bodysystem_healthstatus', 'bodysystem_unacceptable', 'Avoidable_ED_Visit']
‚úÖ Detected target column: Avoidable_ED_Visit
‚úÖ X_train shape: (6047, 29)
‚úÖ y_train shape: (6047,)
 Best run: Exp2_RandomForest
‚úÖ Cleaned params: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
‚úÖ Final model fitted.
‚úÖ

In [None]:
# Validation 

!pip install uvicorn[standard]

from custom_transformers import Log1pTransformer
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# -------------------------------
# Load dataset
# -------------------------------
df = pd.read_parquet("modeling_dataset_clean.parquet")
print("‚úÖ Loaded dataset")

# Same target heuristic as the training cell: the single all-0/1 column if
# there is exactly one, otherwise the last column.
binary_cols = [
    name
    for name in df.columns
    if df[name].dropna().isin([0, 1]).all()
]
if len(binary_cols) == 1:
    target_col = binary_cols[0]
else:
    target_col = df.columns[-1]
print("‚úÖ Target column:", target_col)

y = df[target_col]
X = df.drop(columns=target_col)

# Same split settings (size, seed, stratification) as the training cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# -------------------------------
# Load model
# -------------------------------
model = joblib.load("final_avoidable_ed_model.joblib")
print("‚úÖ Model loaded")

# -------------------------------
# Check feature alignment
# -------------------------------
# Compare the columns recorded by the pipeline's "preprocess" step with the
# columns of the freshly split training frame, in both directions.
expected = list(model.named_steps["preprocess"].feature_names_in_)
missing = [name for name in expected if name not in X_train.columns]
extra = [name for name in X_train.columns if name not in expected]

print("‚úÖ Expected features:", expected)

if missing:
    print(" Missing:", missing)
else:
    print("‚úÖ No missing features")

if extra:
    print("‚ö†Ô∏è Extra:", extra)
else:
    print("‚úÖ No extra features")

# -------------------------------
# Prediction tests
# -------------------------------
# Smoke test on a single row (kept as a one-row frame, not a Series).
sample = X_train.iloc[[0]]
pred = model.predict(sample)[0]
prob = model.predict_proba(sample)[0][1]
print("‚úÖ Single prediction OK:", pred, prob)

# Smoke test on a small batch; column 1 of predict_proba is kept.
batch = X_train.head(50)
preds = model.predict(batch)
probs = model.predict_proba(batch)[:, 1]
print("‚úÖ Batch prediction OK")

# -------------------------------
# NaN check
# -------------------------------
if np.isnan(preds).any() or np.isnan(probs).any():
    print(" NaNs found")
    # Stop here: previously the success banner below was printed even when
    # this check failed, so a broken model could be reported as ready.
    raise ValueError("Validation failed: model produced NaN predictions")

print("‚úÖ No NaNs in predictions")

print("\nüéâ FINAL CHECK COMPLETE ‚Äî Model is deployment‚Äëready üéâ")

In [None]:
#!pip install -U jupyter-book
