# Desenvolvimento do modelo

Exploração inicial de modelos de base - CLASSIFICAÇÃO

In [None]:
import pandas as pd
from dagshub.data_engine import datasources
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.xgboost
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.svm import SVC
import mlflow.models.signature
from mlflow.models import infer_signature
from sklearn.preprocessing import LabelEncoder
import numpy as np

## Carregando Dataset

In [None]:
# Fetch the DagsHub Data Engine datasource addressed by the repository
# "luciancsilva/fiap-10dtsr-mlops-trabalho-final" and the name "processed"
# (presumably the pre-processed dataset — confirm against the repository).
ds = datasources.get_datasource("luciancsilva/fiap-10dtsr-mlops-trabalho-final", "processed")

In [None]:
# Query every datapoint of the datasource and expose the result as a DataFrame;
# the bare expression is rendered as the notebook cell output.
ds.all().dataframe

In [None]:
# Ask the datasource for its first datapoints and keep the download URL of the
# last one iterated (the same choice the plain loop made before).
res = ds.head()

# Reset first so a value left over from an earlier notebook run is never reused.
dataset_url = None
for dp in res:
    dataset_url = dp.download_url

# Fail here with a clear message rather than with a NameError (or a stale URL)
# in a later cell when the query produced nothing usable.
if dataset_url is None:
    raise RuntimeError("No datapoint with a download URL was returned by ds.head(); cannot determine dataset_url.")

In [None]:
# Show the URL that is read with pd.read_csv in the following cell.
dataset_url

In [None]:
# Load the CSV located at dataset_url into a DataFrame and preview its first rows.
df = pd.read_csv(dataset_url)
df.head()

## Verificação e ajuste do Credit_Score para classificação (-1, 0, 1)

In [None]:
# Inspect the current Credit_Score values before any conversion.
print("Valores únicos no Credit_Score antes do ajuste:")
print(df['Credit_Score'].unique())
print("Tipo atual:", df['Credit_Score'].dtype)

# Text labels are mapped onto the codes -1, 0, 1 (adjust to match your data);
# a column that is already numeric is only cast to int64 below.
credit_score_map = {
    'Poor': -1,
    'Standard': 0,
    'Good': 1,
}

raw_scores = df['Credit_Score']
# Testing for "not numeric" rather than dtype == 'object' also covers pandas'
# dedicated string dtypes, which would otherwise skip the mapping entirely.
if pd.api.types.is_numeric_dtype(raw_scores):
    scores = raw_scores
else:
    scores = raw_scores.map(credit_score_map)

# Series.map yields NaN for labels absent from the mapping, and missing values
# cannot be cast to int64; report the offending raw values explicitly instead
# of letting astype fail with a generic conversion error.
unresolved = scores.isna()
if unresolved.any():
    bad_values = list(raw_scores[unresolved].unique())
    raise ValueError(f"Credit_Score contains missing or unmapped values: {bad_values}")

# Guarantee an int64 target.
df['Credit_Score'] = scores.astype('int64')

print("\nValores únicos no Credit_Score após ajuste:")
print(df['Credit_Score'].unique())
print("Distribuição do Credit_Score:")
print(df['Credit_Score'].value_counts().sort_index())

## Desenvolvimento e experimentos de modelos

In [None]:
# Initialise the DagsHub client for this repository; mlflow=True presumably
# points MLflow tracking at the repository's tracking server — confirm in the
# dagshub client documentation.
dagshub.init(repo_owner="luciancsilva", repo_name="fiap-10dtsr-mlops-trabalho-final", mlflow=True)

In [None]:
# Enable MLflow autologging for the supported libraries used below.
# NOTE(review): the experiment cells also log parameters, metrics and models
# explicitly, so some information may be recorded twice — confirm this is intended.
mlflow.autolog()

In [None]:
# Usar o LabelEncoder para a coluna 'Payment_of_Min_Amount'
le = LabelEncoder()
df['Payment_of_Min_Amount'] = le.fit_transform(df['Payment_of_Min_Amount'].astype(str))

In [None]:
# Columns that are not used as model inputs (identifiers, the Credit_Score
# target and the attributes left out of this experiment), in removal order.
excluded_columns = (
    'ID',
    'Customer_ID',
    'Credit_Score',
    'Occupation',
    'Monthly_Inhand_Salary',
    'Interest_Rate',
    'Type_of_Loan',
    'Delay_from_due_date',
    'Changed_Credit_Limit',
    'Num_Credit_Inquiries',
    'Credit_Mix',
    'Amount_invested_monthly',
    'Monthly_Balance',
    'Num_of_Loan',
    'Outstanding_Debt',
    'Not_Specified',
    'Month',
)

features = df.columns.tolist()
for excluded_column in excluded_columns:
    # list.remove raises ValueError when the column is absent from the dataset.
    features.remove(excluded_column)

features

In [None]:
# Feature matrix: the selected feature columns of df.
X = df[features]

In [None]:
# Number of selected features (rendered as the cell output).
len(features)

In [None]:
# Target vector: the Credit_Score column, followed by a quick look at its
# distinct classes and dtype.
y = df["Credit_Score"]
print("Classes únicas:", y.unique())
print("Tipo do target:", y.dtype)

In [None]:
# Hold out 30% of the rows for testing. stratify=y keeps the class proportions
# of y in both splits and random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# Mapear classes para XGBoost (que espera classes sequenciais começando do 0)
class_mapping = {-1: 0, 0: 1, 1: 2}
reverse_mapping = {0: -1, 1: 0, 2: 1}

# Aplicar mapeamento nos conjuntos de treino e teste
y_train_xgb = y_train.map(class_mapping)
y_test_xgb = y_test.map(class_mapping)

print("Mapeamento de classes para XGBoost:")
print("Original -> XGBoost")
for orig, xgb in class_mapping.items():
    print(f"{orig} -> {xgb}")
    
print(f"\nDistribuição y_train original: {y_train.value_counts().sort_index()}")
print(f"Distribuição y_train_xgb: {y_train_xgb.value_counts().sort_index()}")

In [None]:
def evaluate_and_log_classification_model(kind, model_name, model, X_test, y_test, y_test_original=None, reverse_mapping=None):
    """Score a fitted classifier on the test split and log the results to MLflow.

    Accuracy and weighted precision/recall/F1 are logged together with every
    per-class and averaged entry of scikit-learn's ``classification_report``.
    The model itself is then logged with the MLflow flavor selected by ``kind``.
    Logging goes through the fluent ``mlflow`` API; the callers in this
    notebook invoke the function inside ``mlflow.start_run``.

    Args:
        kind: ``"xgboost"`` or ``"lightgbm"`` selects that MLflow flavor; any
            other value logs the model with ``mlflow.sklearn``.
        model_name: Forwarded to ``log_model`` as its second positional argument.
        model: Fitted estimator exposing ``predict``.
        X_test: Test features. The first five rows are logged as input example.
        y_test: True labels in the encoding the model predicts. Used for
            scoring only when ``reverse_mapping`` is ``None``.
        y_test_original: True labels in the original encoding. Required when
            ``reverse_mapping`` is given.
        reverse_mapping: Optional dict translating each predicted label back to
            its original label (used for the XGBoost 0/1/2 encoding).

    Raises:
        ValueError: If ``reverse_mapping`` is given without ``y_test_original``,
            or if a prediction has no entry in ``reverse_mapping``.
    """
    predictions = model.predict(X_test)

    # When the model was trained on re-encoded labels, translate its
    # predictions back so the metrics are computed on the original classes.
    if reverse_mapping is not None:
        if y_test_original is None:
            raise ValueError("y_test_original is required when reverse_mapping is provided")
        predictions_original = pd.Series(predictions).map(reverse_mapping)
        # Series.map yields NaN for labels missing from the mapping; scoring
        # such predictions would fail later with a far less helpful error.
        if predictions_original.isna().any():
            unknown = sorted(set(pd.Series(predictions).unique()) - set(reverse_mapping))
            raise ValueError(f"Predictions contain labels missing from reverse_mapping: {unknown}")
        y_test_eval = y_test_original
    else:
        predictions_original = predictions
        y_test_eval = y_test

    accuracy = accuracy_score(y_test_eval, predictions_original)
    precision = precision_score(y_test_eval, predictions_original, average='weighted')
    recall = recall_score(y_test_eval, predictions_original, average='weighted')
    f1 = f1_score(y_test_eval, predictions_original, average='weighted')

    metrics_to_log = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1_Score": f1,
    }

    # Flatten the classification report. Entries that are not dicts (the
    # overall "accuracy" value) are skipped, as it is already logged above.
    report = classification_report(y_test_eval, predictions_original, output_dict=True)
    for class_label, class_metrics in report.items():
        if isinstance(class_metrics, dict):
            for metric_name, value in class_metrics.items():
                metrics_to_log[f"{class_label}_{metric_name}"] = value

    # One batched call instead of one tracking request per metric.
    mlflow.log_metrics(metrics_to_log)

    # NOTE(review): with reverse_mapping the signature describes the original
    # labels, while the logged model itself still predicts the encoded ones.
    signature = infer_signature(X_test, predictions_original)

    # Only the selected flavor module is touched.
    if kind == "xgboost":
        log_model = mlflow.xgboost.log_model
    elif kind == "lightgbm":
        log_model = mlflow.lightgbm.log_model
    else:
        log_model = mlflow.sklearn.log_model
    log_model(model, model_name, signature=signature, input_example=X_test[:5])

    print(f"Model {model_name} logged with Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    # predictions_original is the raw prediction array when no mapping is used.
    print(f"Predições originais (sample): {predictions_original[:10]}")

### Experimento com Ridge Classifier

In [None]:
# Inspect the dtype of every training feature before fitting the models.
print(X_train.dtypes)

In [None]:
with mlflow.start_run(run_name="Ridge Classifier"):
    # Candidate values for the regularisation strength and the intercept toggle.
    param_grid = dict(
        alpha=[0.1, 1.0, 10.0, 100.0],
        fit_intercept=[True, False],
    )

    # Exhaustive 5-fold search over the grid, selecting by accuracy.
    ridge = RidgeClassifier()
    grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Record the winning value of each tuned hyperparameter as "best_<name>".
    for tuned_param in ('alpha', 'fit_intercept'):
        mlflow.log_param(f"best_{tuned_param}", grid_search.best_params_[tuned_param])

    # Score the refitted best estimator on the held-out split and log it.
    evaluate_and_log_classification_model(
        kind="sklearn",
        model_name="ridge_classifier",
        model=best_model,
        X_test=X_test,
        y_test=y_test,
    )

### Decision Tree Classifier

In [None]:
with mlflow.start_run(run_name="Decision Tree Classifier"):
    # Candidate tree depths (None = unlimited) and split/leaf size constraints.
    param_grid = dict(
        max_depth=[None, 3, 5, 10, 15],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )

    # Exhaustive 5-fold search over the grid, selecting by accuracy.
    dt = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Record the winning value of each tuned hyperparameter as "best_<name>".
    for tuned_param in ('max_depth', 'min_samples_split', 'min_samples_leaf'):
        mlflow.log_param(f"best_{tuned_param}", grid_search.best_params_[tuned_param])

    # Score the refitted best estimator on the held-out split and log it.
    evaluate_and_log_classification_model(
        kind="sklearn",
        model_name="decision_tree_classifier",
        model=best_model,
        X_test=X_test,
        y_test=y_test,
    )

### XGBoost Classifier

In [None]:
with mlflow.start_run(run_name="XGBoost_Classifier_Fast"):
    # Deliberately small grid so the search finishes quickly.
    param_grid = dict(
        n_estimators=[100, 200],
        max_depth=[3, 6],
        learning_rate=[0.1, 0.2],
    )

    # Train on the 0/1/2-encoded targets; 3 folds keep the search cheap.
    xgb = XGBClassifier(random_state=42, verbosity=0, eval_metric='mlogloss')
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train_xgb)
    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters as read from the refitted estimator.
    for hyperparameter in ("n_estimators", "max_depth", "learning_rate"):
        mlflow.log_param(f"best_{hyperparameter}", getattr(best_model, hyperparameter))

    # Evaluate against the original -1/0/1 labels by mapping predictions back.
    evaluate_and_log_classification_model(
        kind="xgboost",
        model_name="XGBoost Classifier",
        model=best_model,
        X_test=X_test,
        y_test=y_test_xgb,
        y_test_original=y_test,
        reverse_mapping=reverse_mapping,
    )

## Registro de Modelo em Produção

In [None]:
# Substitua pelo run_id do melhor modelo
run_id = "4c0a3d5e4f0948fba1e1d6735a44719f"

mlflow.register_model(model_uri=f"runs:/{run_id}/model", name="credit_score_model")