In [None]:
import mlflow
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [None]:
# Load the raw loan-approval table from the relative temp directory and
# preview its first rows as the cell output.
csv_path = "../temp/loanApprovalPrediction.csv"
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Loan_ID is a row identifier, not a feature; remove it before modelling.
df = df.drop(columns=["Loan_ID"])

In [None]:
# Treat Dependents as a categorical (string) feature.
# A plain astype(str) would also turn missing values into the literal string
# "nan", which hides them from the categorical imputer and makes "nan" its own
# one-hot category. Cast first, then restore NaN on the originally-missing rows.
dependents_raw = df["Dependents"]
df["Dependents"] = dependents_raw.astype(str).mask(dependents_raw.isna())

In [None]:
# Integer-encode the target column in place, then separate features (X)
# from the encoded target (y). `le` is kept so labels can be decoded later.
le = LabelEncoder()
df["Loan_Status"] = le.fit_transform(df["Loan_Status"])
X = df.drop(columns=["Loan_Status"])
y = df["Loan_Status"]

In [None]:
# Partition the feature columns by dtype: int64/float64 columns are numeric,
# object columns are categorical.
numeric_dtypes = ["int64", "float64"]
numerical_features = list(X.select_dtypes(include=numeric_dtypes).columns)
categorical_features = list(X.select_dtypes(include="object").columns)

In [None]:
# Dependents must always be handled as categorical, whatever dtype it was
# detected as: take it out of the numeric list if present...
try:
    numerical_features.remove("Dependents")
except ValueError:
    pass  # it was never classified as numeric

# ...and make sure it is listed exactly once among the categoricals.
if categorical_features.count("Dependents") == 0:
    categorical_features.append("Dependents")

In [None]:
# Show the final feature grouping: numeric list first, then categorical.
for feature_group in (numerical_features, categorical_features):
    print(feature_group)

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen at fit time are ignored instead of raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Route each feature group through its own transformer. Columns that belong
# to neither group are passed through unchanged rather than dropped.
column_steps = [
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_steps, remainder="passthrough")

In [None]:
# Fit the preprocessor on the full feature table and keep the transformed
# matrix for inspection (fit() returns the fitted transformer itself).
X_processed = preprocessor.fit(X).transform(X)

In [None]:
# Column labels for the transformed matrix: the numeric feature names followed
# by the one-hot encoder's generated names for the categorical features.
onehot_step = preprocessor.named_transformers_["cat"].named_steps["onehot"]
encoded_names = onehot_step.get_feature_names_out(categorical_features)
new_column_names = [*numerical_features, *encoded_names]

In [None]:
# End-to-end estimator: preprocessing followed by a seeded random forest.
# The preprocessor is fitted again whenever this pipeline is fitted, so its
# earlier stand-alone fit does not carry over into model training.
classifier = RandomForestClassifier(random_state=42)
full_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

In [None]:
# Hyper-parameter candidates for the pipeline's "classifier" step
# (3 x 3 x 2 x 2 = 36 combinations).
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
    classifier__max_features=["sqrt", "log2"],
)

# Exhaustive search scored on accuracy with seeded, shuffled 3-fold CV,
# using all available cores.
cv_folds = KFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=cv_folds,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Hold out the evaluation rows BEFORE searching. Fitting the search on all of
# X and then scoring on a split of that same X leaks the test rows into
# training and inflates every reported metric.
# The split uses the same test_size and random_state as the evaluation split
# further down, so both calls produce identical train/test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search (and refit the best candidate) on the training portion only.
grid_search.fit(X_train, y_train)

In [None]:
# Pull the winning pipeline, its hyper-parameters, and its mean
# cross-validated score out of the finished search.
best_model, best_params, best_cv_score = (
    grid_search.best_estimator_,
    grid_search.best_params_,
    grid_search.best_score_,
)

In [None]:
# Seeded 80/20 hold-out split of the features and target.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Score the selected model on the hold-out rows.
# Hard predictions feed the threshold metrics; the probability of class 1
# (second predict_proba column) feeds ROC AUC.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Report every metric on its own line as "<label> <value>".
metric_report = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("ROC AUC Score:", roc_auc),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

In [None]:
# Re-run the hyper-parameter search inside a tracked MLflow parent run.
with mlflow.start_run(run_name="Loan_Default_Prediction_RandomForest") as run:
    run_id = run.info.run_id

    # Seeded, shuffled 3-fold CV scored on accuracy; this search is fitted on
    # the full X / y and rebinds `grid_search`.
    tracking_folds = KFold(n_splits=3, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=tracking_folds,
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X, y)

    # Parent run: the winning configuration and its mean CV accuracy.
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("best_accuracy", grid_search.best_score_)

    # One nested child run per candidate so trials can be compared side by side.
    results = grid_search.cv_results_
    trials = zip(
        results["params"],
        results["mean_test_score"],
        results["std_test_score"],
    )
    for i, (trial_params, trial_mean, trial_std) in enumerate(trials):
        with mlflow.start_run(run_name=f"GridSearchTrial_{i}", nested=True):
            mlflow.log_params(trial_params)
            mlflow.log_metric("mean_test_score", trial_mean)
            mlflow.log_metric("std_test_score", trial_std)