In [2]:
import pandas as pd

In [3]:
# Load the processed dataset; the relative path is resolved against the
# current working directory of the kernel.
output_df = pd.read_csv(filepath_or_buffer='../data/processed/output.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Build the predictor matrix by removing the columns listed below; 'credit_risk'
# is the target, so it must not remain among the features.
features = output_df.drop(columns=['CustomerId', 'last_transaction_date', 'cluster', 'credit_risk'])
X = features
y = output_df['credit_risk']

# Hold out 20% of the rows for testing with a fixed seed for reproducibility.
# stratify=y keeps the credit_risk class proportions the same in the train and
# test partitions, so the evaluation metrics are not skewed by an unlucky split.
# NOTE(review): stratification requires at least two rows per class in y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Candidate classifiers, keyed by the label shown in the printed report.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000),
    RandomForest=RandomForestClassifier(random_state=42),
)

for name, model in models.items():
    # Train on the training split, then score the held-out split.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_; it feeds ROC-AUC instead of the hard predictions.
    probs = model.predict_proba(X_test)[:, 1]

    print(f"Model: {name}")
    # Each row: printed label, metric function, and the estimate it consumes.
    # Metrics are evaluated and printed one at a time, in this order.
    for label, scorer, estimate in (
        ("Accuracy:", accuracy_score, preds),
        ("Precision:", precision_score, preds),
        ("Recall:", recall_score, preds),
        ("F1 Score:", f1_score, preds),
        ("ROC-AUC:", roc_auc_score, probs),
    ):
        print(label, scorer(y_test, estimate))


In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest hyperparameter candidates: two tree counts by three depth
# limits (None leaves the depth unrestricted).
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 5, 10],
)

# Try every combination in the grid with 3-fold cross-validation on the
# training split.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
)
grid.fit(X_train, y_train)

# Keep the best estimator found by the search for later evaluation/logging.
best_model = grid.best_estimator_


In [None]:
import mlflow
import mlflow.sklearn

mlflow.set_experiment("credit-risk-modeling")

# Score the tuned model once on the held-out split and reuse the results for
# every metric. Doing this before the run starts means a scoring failure does
# not leave a half-populated run behind.
best_preds = best_model.predict(X_test)
best_probs = best_model.predict_proba(X_test)[:, 1]

with mlflow.start_run() as run:
    mlflow.log_params(grid.best_params_)
    # Log the same metric suite the model-comparison cell prints, so the
    # tracked run is not judged on accuracy alone. The "accuracy" key is
    # unchanged for compatibility with earlier runs of this experiment.
    mlflow.log_metrics({
        "accuracy": accuracy_score(y_test, best_preds),
        "precision": precision_score(y_test, best_preds),
        "recall": recall_score(y_test, best_preds),
        "f1": f1_score(y_test, best_preds),
        "roc_auc": roc_auc_score(y_test, best_probs),
    })
    mlflow.sklearn.log_model(best_model, "model")
    # Register the artifact logged above under the "CreditRiskModel" name,
    # addressing it through this run's id.
    mlflow.register_model(f"runs:/{run.info.run_id}/model", "CreditRiskModel")
