# Desenvolvimento do modelo de score de crédito

Exploração inicial de modelos de base

In [19]:
import pandas as pd
from dagshub.data_engine import datasources
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.catboost
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import mlflow.models.signature
from mlflow.models import infer_signature
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

## Carregando Dataset

In [2]:
ds = datasources.get_datasource("pedromonnt/fiap-credit-score-classification-model", "processed")

In [3]:
ds.all().dataframe

Output()

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,train-processed.csv,103365851,https://dagshub.com/api/v1/repos/pedromonnt/fi...,text/plain,16635276


In [4]:
res = ds.head()

for dp in res:
    dataset_url = dp.download_url

Output()

In [5]:
dataset_url

'https://dagshub.com/api/v1/repos/pedromonnt/fiap-credit-score-classification-model/raw/main/data/processed/train-processed.csv'

In [6]:
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,1.0,23.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,...,1,809.98,26.82262,265.0,1,49.574949,80.415295,3,312.494089,2
1,2.0,23.0,13,19114.12,3093.745,3.0,4.0,3.0,4.0,-1.0,...,2,809.98,31.94496,219.0,1,49.574949,118.280222,4,284.629162,2
2,3.0,-3.0,13,19114.12,3093.745,3.0,4.0,3.0,4.0,3.0,...,2,809.98,28.609352,267.0,1,49.574949,81.699521,5,331.209863,2
3,4.0,23.0,13,19114.12,3093.745,3.0,4.0,3.0,4.0,5.0,...,2,809.98,31.377862,268.0,1,49.574949,199.458074,6,223.45131,2
4,5.0,23.0,13,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,...,2,809.98,24.797347,269.0,1,49.574949,41.420153,2,341.489231,2


## Desenvolvimento e experimentos de modelos

In [7]:
dagshub.init(repo_owner="pedromonnt", repo_name="fiap-credit-score-classification-model", mlflow=True)

In [8]:
mlflow.autolog()

2025/07/16 02:29:44 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/07/16 02:29:47 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/07/16 02:29:47 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [14]:
# Features e Target do conjunto de treino
y = df['Credit_Score']
X = df.drop(columns='Credit_Score')

# Dividir o conjunto de TREINO em treino e VALIDAÇÃO
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
def evaluate_and_log_model(kind, model_name, model, X_val, y_val):
    predictions = model.predict(X_val)
    pred_proba = model.predict_proba(X_val)

    # Métricas
    accuracy = accuracy_score(y_val, predictions)
    f1 = f1_score(y_val, predictions, average='weighted')
    precision = precision_score(y_val, predictions, average='weighted')
    recall = recall_score(y_val, predictions, average='weighted')
    roc_auc = roc_auc_score(y_val, pred_proba, multi_class='ovr', average='weighted')

    print(f"Resultados para {model_name}:")
    print(f"  Acurácia: {accuracy:.4f}")
    print(f"  F1-Score (Weighted): {f1:.4f}")
    print(f"  ROC AUC (Weighted): {roc_auc:.4f}")

    # Log de Métricas no MLflow
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_weighted", f1)
    mlflow.log_metric("precision_weighted", precision)
    mlflow.log_metric("recall_weighted", recall)
    mlflow.log_metric("roc_auc_weighted", roc_auc)

    signature = infer_signature(X_val, predictions)

    if kind == "catboost":
        mlflow.catboost.log_model(model, model_name, signature=signature, input_example=X_val[:5])
    elif kind == "xgboost":
        mlflow.xgboost.log_model(model, model_name, signature=signature, input_example=X_val[:5])
    elif kind == "lightgbm":
        mlflow.lightgbm.log_model(model, model_name, signature=signature, input_example=X_val[:5])
    else:
        mlflow.sklearn.log_model(model, model_name, signature=signature, input_example=X_val[:5])

    print(f"Model {model_name} logged with accuracy: {accuracy}, f1_weighted: {f1}, precision_weighted: {precision}, recall_weighted: {recall}, roc_auc_weighted: {roc_auc}")

    # Log do Relatório de Classificação como artefato de texto
    report = classification_report(y_val, predictions)
    mlflow.log_text(report, "classification_report.txt")

    # Log do modelo
    mlflow.sklearn.log_model(model, model_name)

### Modelo 1: Regressão Logística

In [29]:
with mlflow.start_run(run_name="Logistic Regression"):
   
    # Parâmetros para GridSearchCV
    param_grid_lr = {
        'C': [0.1, 1.0, 10.0],
        'solver': ['liblinear', 'saga']
    }
    
    lr = LogisticRegression(random_state=42, max_iter=1000)

    grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search_lr.fit(X_train, y_train)
    
    best_model_lr = grid_search_lr.best_estimator_
    mlflow.log_params(grid_search_lr.best_params_)
    
    evaluate_and_log_model("sklearn", "logistic_regression", best_model_lr, X_val, y_val)

2025/07/16 19:47:38 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


Resultados para logistic_regression:
  Acurácia: 0.6163
  F1-Score (Weighted): 0.5964
  ROC AUC (Weighted): 0.7557




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model logistic_regression logged with accuracy: 0.61635, f1_weighted: 0.5964499850319438, precision_weighted: 0.6099706270816464, recall_weighted: 0.61635, roc_auc_weighted: 0.7557387565705866




🏃 View run Logistic Regression at: https://dagshub.com/pedromonnt/fiap-credit-score-classification-model.mlflow/#/experiments/0/runs/2dc4cbafc11d4d56ac608f6a56c274db
🧪 View experiment at: https://dagshub.com/pedromonnt/fiap-credit-score-classification-model.mlflow/#/experiments/0


### Modelo 2: Árvore de Decisão

In [30]:
with mlflow.start_run(run_name="Decision Tree Classifier"):
    dt = DecisionTreeClassifier(random_state=42)
    
    param_grid_dt = {
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    
    grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search_dt.fit(X_train, y_train)

    best_model_dt = grid_search_dt.best_estimator_
    mlflow.log_params(grid_search_dt.best_params_)

    evaluate_and_log_model("sklearn", "decision_tree_classifier", best_model_dt, X_val, y_val)

2025/07/16 19:49:45 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.


Resultados para decision_tree_classifier:
  Acurácia: 0.7031
  F1-Score (Weighted): 0.7036
  ROC AUC (Weighted): 0.8390




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model decision_tree_classifier logged with accuracy: 0.7031, f1_weighted: 0.7035515413010885, precision_weighted: 0.7047698245011892, recall_weighted: 0.7031, roc_auc_weighted: 0.838961378943835




🏃 View run Decision Tree Classifier at: https://dagshub.com/pedromonnt/fiap-credit-score-classification-model.mlflow/#/experiments/0/runs/fbeef9d7206843f58ab0f876e643f05b
🧪 View experiment at: https://dagshub.com/pedromonnt/fiap-credit-score-classification-model.mlflow/#/experiments/0


### Modelo 3: XGBoost Classifier

In [31]:
with mlflow.start_run(run_name="XGBoost Classifier"):
    
    xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')
    
    param_grid_xgb = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0]
    }
    
    grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search_xgb.fit(X_train, y_train)

    best_model_xgb = grid_search_xgb.best_estimator_
    mlflow.log_params(grid_search_xgb.best_params_)
    
    evaluate_and_log_model("xgboost", "xgboost_classifier", best_model_xgb, X_val, y_val)

2025/07/16 19:54:23 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.


Resultados para xgboost_classifier:
  Acurácia: 0.7671
  F1-Score (Weighted): 0.7670
  ROC AUC (Weighted): 0.8893


  self.get_booster().save_model(fname)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Model xgboost_classifier logged with accuracy: 0.76705, f1_weighted: 0.7669911168240147, precision_weighted: 0.7672242310778753, recall_weighted: 0.76705, roc_auc_weighted: 0.8893220604433997




🏃 View run XGBoost Classifier at: https://dagshub.com/pedromonnt/fiap-credit-score-classification-model.mlflow/#/experiments/0/runs/cec574e6013d4c41883a495d502c60e8
🧪 View experiment at: https://dagshub.com/pedromonnt/fiap-credit-score-classification-model.mlflow/#/experiments/0


## Registro de Modelo em Produção

In [34]:
run_id = "cec574e6013d4c41883a495d502c60e8"

mlflow.register_model(model_uri=f"runs:/{run_id}/model", name="credit-score-classification-model")

Registered model 'credit-score-classification-model' already exists. Creating a new version of this model...
2025/07/16 19:56:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: credit-score-classification-model, version 6
Created version '6' of model 'credit-score-classification-model'.


<ModelVersion: aliases=[], creation_timestamp=1752706574963, current_stage='None', description='', last_updated_timestamp=1752706574963, name='credit-score-classification-model', run_id='cec574e6013d4c41883a495d502c60e8', run_link='', source='mlflow-artifacts:/5cb5e553364e438896ea46ed8538561f/cec574e6013d4c41883a495d502c60e8/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='6'>