# Exerc√≠cios MLflow 

In [None]:
# Importa√ß√µes

In [1]:
import mlflow
from mlflow.models import infer_signature
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Base de dados & Data understanding b√°sico

In [4]:
# Leitura da base
dataset_path = '../data/raw/heart.csv'
dataset = pd.read_csv(dataset_path)
# Sanity test - 5 primeiros registros
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
dataset.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [5]:
dataset.shape

(1025, 14)

In [5]:
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values

In [7]:
X, y

(array([[52.,  1.,  0., ...,  2.,  2.,  3.],
        [53.,  1.,  0., ...,  0.,  0.,  3.],
        [70.,  1.,  0., ...,  0.,  0.,  3.],
        ...,
        [47.,  1.,  0., ...,  1.,  1.,  2.],
        [50.,  0.,  0., ...,  2.,  0.,  2.],
        [54.,  1.,  0., ...,  1.,  1.,  3.]], shape=(1025, 13)),
 array([0, 0, 0, ..., 0, 1, 0], shape=(1025,)))

In [6]:
X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=0) 

In [10]:
# Sanity test - Training Data
X_train.shape, y_train.shape

((820, 13), (820,))

In [11]:
# Sanity test - Test Data
X_test.shape, y_test.shape

((205, 13), (205,))

In [None]:
# Cross validation / Tunning de hiperpar√¢metros

In [6]:
def generic_grid_search_cv(estimator_classifier, param_dict, X_matrix, y_matrix):
    """
    Executa um GridSearchCV para um classificador do Scikit-Learn.

    Esta fun√ß√£o recebe um estimador do Scikit-Learn (classificador),
    realiza uma busca em grade com valida√ß√£o cruzada (GridSearchCV) 
    utilizando os hiperpar√¢metros fornecidos e retorna o nome do 
    estimador e o melhor conjunto de hiperpar√¢metros encontrado.

    Args:
        estimator_classifier: Estimador Scikit-Learn, como 
            `DecisionTreeClassifier()`, `RandomForestClassifier()`, etc.
        param_dict (dict): Dicion√°rio contendo hiperpar√¢metros como chaves
            e listas de valores como op√ß√µes a serem testadas.
        X_matrix (numpy.ndarray): Matriz de features usada para o treinamento.
        y_matrix (numpy.ndarray): Vetor ou matriz de r√≥tulos alvo.

    Returns:
        estimator_name (str): Nome da classe do estimador recebido.
        best_params (dict): Melhor combina√ß√£o de hiperpar√¢metros encontrada 
            pelo GridSearchCV.
    """
    
    grid_search = GridSearchCV(estimator=estimator_classifier, param_grid=param_dict)
    grid_search.fit(X = X_matrix, y = y_matrix)
    best_params = grid_search.best_params_
    estimator_name = estimator_classifier.__class__.__name__
    
    return estimator_name, best_params

In [7]:
# Dicion√°rios de par√¢metros e valores para efetuar Cross validation

# Decision tree
tree_params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Random florest
rf_params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 40, 100, 150],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Gradient boosting classifier
gbc_params = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'n_estimators': [25, 50, 100, 200, 300, 500],
    'criterion': ['friedman_mse', 'squared_error']
}

# K-nearest neighbors classifier
knn_params = {
    'n_neighbors': [3, 5, 10, 20],
    'p': [1, 2]
}

# Logistic regression
lr_params = {
    'tol': [0.0001, 0.00001, 0.000001],
    'C': [1.0, 1.5, 2.0],
    'solver': ['lbfgs', 'sag', 'saga']
}

# Support vector machine
svm_params = {
    'tol': [0.001, 0.0001, 0.00001],
    'C': [1.0, 1.5, 2.0],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

# Neural network classifier
neural_net_params = {
    'activation': ['relu', 'logistic', 'tahn'],
    'solver': ['adam', 'sgd'],
    'batch_size': [10, 56]
}

all_params = {
    'tree_params': [tree_params, DecisionTreeClassifier()],
    'rf_params': [rf_params, RandomForestClassifier()],
    'gbc_params': [gbc_params, GradientBoostingClassifier()],
    'knn_params': [knn_params, KNeighborsClassifier()],
    'lr_params': [lr_params, LogisticRegression()],
    'svm_params': [svm_params, SVC()],
    'neural_net_params': [neural_net_params, MLPClassifier()],
}


In [12]:
best_params_dict = {}
for name, param in all_params.items():
    name, best_params = generic_grid_search_cv(estimator_classifier=param[1], param_dict=param[0], X_matrix=X, y_matrix=y)
    # os values provavelmente tem de ser salvos numa lista onde 0 = best_params e 1 = objeto
    best_params_dict[f'{name}_best_params'] = best_params, param[1]

best_params_dict

{'DecisionTreeClassifier_best_params': ({'criterion': 'gini',
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'splitter': 'best'},
  DecisionTreeClassifier()),
 'RandomForestClassifier_best_params': ({'criterion': 'gini',
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 40},
  RandomForestClassifier()),
 'GradientBoostingClassifier_best_params': ({'criterion': 'friedman_mse',
   'learning_rate': 0.1,
   'loss': 'exponential',
   'n_estimators': 300},
  GradientBoostingClassifier()),
 'KNeighborsClassifier_best_params': ({'n_neighbors': 3, 'p': 1},
  KNeighborsClassifier()),
 'LogisticRegression_best_params': ({'C': 1.0,
   'solver': 'lbfgs',
   'tol': 0.0001},
  LogisticRegression()),
 'SVC_best_params': ({'C': 1.0, 'kernel': 'linear', 'tol': 0.001}, SVC()),
 'MLPClassifier_best_params': ({'activation': 'logistic',
   'batch_size': 10,
   'solver': 'adam'},
  MLPClassifier())}

In [9]:
# Melhores par√¢metros definidos pelo cross-validation/tunning de hiperpar√¢metros
tree_params = {
    'criterion': 'entropy', 
    'min_samples_leaf':1, 
    'min_samples_split': 5,
    'splitter':'best'
}

rf_params = {
    'criterion': 'entropy',
    'min_samples_leaf': 1,
    'n_estimators': 40
}

gb_params = {
    'criterion': 'friedman_mse', 
    'learning_rate': 0.1,
    'loss': 'exponential',
    'n_estimators': 300
}

knn_params = {
    'n_neighbors': 20,
    'p': 1
}

lr_params = {
    'C': 1.0,
    'solver': 'lbfgs',
    'tol': 0.0001
}

svm_params = {
    'C': 1.5,
    'kernel': 'rbf',
    'tol': 0.001
}

nn_params = {
    'activation': 'relu',
    'batch_size': 10,
    'solver': 'adam'
}


In [None]:
# MLflow

In [11]:
# Definir o tracking ui
mlflow.set_tracking_uri(uri='http://localhost:5000')
# Definindo o experimento do registro
response = mlflow.set_experiment('Heart disease experiment')

In [None]:
# Tracking de modelos

In [12]:
# Random forest
mlflow_dataset = mlflow.data.from_pandas(
    dataset, source=dataset_path, name='heart-disease-dataset'
)

mlflow_X_matrix = mlflow.data.from_numpy(
    X, name='Features Matrix'
)

with mlflow.start_run():
    rf_model = RandomForestClassifier(**rf_params)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    
    mlflow.log_params(rf_params)
    
    mlflow.log_metric('Accuracy', accuracy_score(y_test, y_pred_rf))
    mlflow.log_metric('Precision', precision_score(y_test, y_pred_rf))
    mlflow.log_metric('Recall', recall_score(y_test, y_pred_rf))
    mlflow.log_metric('f1 score', f1_score(y_test, y_pred_rf))
    
    mlflow.set_tag('Training info', f'Basic Random forest for heart disease data')
    mlflow.set_tag('Scientist info', 'Test Test Test')
    
    signature = infer_signature(mlflow_dataset, y_pred_rf)
    
    model_info = mlflow.sklearn.log_model(
        sk_model=rf_model,
        name='rf_heart_model',
        input_example=X_train[[0]],
        registered_model_name="Random-forest-classifier",
    )
    
    mlflow.log_input(mlflow_dataset, context='project data')
    mlflow.log_input(mlflow_X_matrix, context='training')
    
print('-------------------------------------------')
print("Model URI: ", model_info.model_uri)
print("Model ID: ", model_info.model_id)
print("Artifact Path: ", model_info.artifact_path)
print("Model version: ", model_info.registered_model_version)
print('-------------------------------------------')

Registered model 'Random-forest-classifier' already exists. Creating a new version of this model...
2025/11/27 13:19:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random-forest-classifier, version 19


üèÉ View run classy-mouse-876 at: http://localhost:5000/#/experiments/577837039507156569/runs/2402eceb4fe74b8caea94de01fc9103d
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
-------------------------------------------
Model URI:  models:/m-de7d245c1fa4452cb1bb530dd6a204ed
Model ID:  m-de7d245c1fa4452cb1bb530dd6a204ed
Artifact Path:  mlflow-artifacts:/577837039507156569/models/m-de7d245c1fa4452cb1bb530dd6a204ed/artifacts
Model version:  19
-------------------------------------------


Created version '19' of model 'Random-forest-classifier'.


In [None]:
# Fun√ß√£o gen√©rica para registro de modelos

In [15]:
def train_and_log_model(estimator, X_train, X_test, y_train, y_test, params, dataset, dataset_path):

    """
    Fun√ß√£o "gen√©rica" de registro de modelos no MLflow.
    
    Esta √© uma fun√ß√£o gen√©rica para registrar os modelos ao MLflow. Ela recebe o classificador, as matrizes X e Y, os par√¢metros
    especificos do modelo (estimador), o dataset usado em seu treinamento e o caminho do dataset arquivo do dataset no projeto. 
    Sua funcionalidade se d√° em treinar o modelo e fazer seu registro de sua run no MLflow, al√©m de registrar outras informa√ß√µes como:
    par√¢metros, m√©tricas (accuracy, precision, recall & f1-score), exemplo de input, assinatura do modelo e o dataset usado.
    
    Args:
        estimator: Classificador do Scikit-Learn.
        X_train (numpy.ndarray): Matriz de features usada para o treinamento.
        y_train (numpy.ndarray): Vetor ou matriz de r√≥tulos alvo para treinamento.
        X_test (numpy.ndarray): Matriz de features usada para o teste.
        y_test (numpy.ndarray): Vetor ou matriz de r√≥tulos alvo para teste.
        params (dict): Dicion√°rio de melhores par√¢metros escolhidos via GridSearchCV.
        dataset (pandas.core.frame.DataFrame): 
    
    Returns:
        
    
    """
    
    mlflow_dataset = mlflow.data.from_pandas(
        dataset, 
        source=dataset_path,
        name=dataset_path.split('/')[-1]
    )
    
    
    # Wrapper de run
    with mlflow.start_run():
        
        # Treinamento do modelo
        # Obter o atributo nome do estimador
        estimator = estimator.__class__
        estimator_name = estimator.__name__
        # Inst√¢nciar do estimador com os par√¢metros escolhidos
        estimator = estimator(**params)
        # Fitagem dos dados no algoritmo
        estimator.fit(X_train, y_train)
        # Infer√™ncia do modelo
        y_pred = estimator.predict(X_test)
        
        # Registro no MLflow
        # Registro dos par√¢metros
        mlflow.log_params(params)
        
        # Registro das m√©tricas principais
        mlflow.log_metric('Accuracy', accuracy_score(y_test, y_pred))
        mlflow.log_metric('Precision', precision_score(y_test, y_pred))
        mlflow.log_metric('Recall', recall_score(y_test, y_pred))
        mlflow.log_metric('f1 score', f1_score(y_test, y_pred))
        
        # 
        mlflow.log_input(mlflow_dataset, context='raw-data')
        
        # Registro de tags para fins de documenta√ß√£o
        mlflow.set_tag('Training info', f'Basic {estimator_name} for heart disease data')
        mlflow.set_tag('Model origin', 'train and log models function')
        
        signature = infer_signature(X_train, estimator.predict(X_test))
        
        # Registro do modelo em si
        # A vari√°vel model_info cont√©m informa√ß√µes retornadas ap√≥s o tracking do modelo
        model_info = mlflow.sklearn.log_model(
            name=estimator_name,
            sk_model=estimator,
            signature=signature, 
            input_example=X_test[[0]],
            registered_model_name=estimator_name
        )
        
        return model_info
        

In [22]:
for name, param in best_params_dict.items():
    print(f'model: {name}')
    model_tacking = train_and_log_model(param[1], X_train, X_test, y_train, y_test, param[0], dataset, dataset_path)

    print(f'artifact path: {model_tacking.artifact_path}')

model: DecisionTreeClassifier_best_params


Registered model 'DecisionTreeClassifier' already exists. Creating a new version of this model...
2025/11/22 17:07:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeClassifier, version 3
Created version '3' of model 'DecisionTreeClassifier'.


üèÉ View run agreeable-dove-765 at: http://localhost:5000/#/experiments/577837039507156569/runs/5d19625e7f0a4aa59f123b00b7d79943
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-4b5f2865f2f14968afd71e7554b848c1/artifacts
model: RandomForestClassifier_best_params


Registered model 'RandomForestClassifier' already exists. Creating a new version of this model...
2025/11/22 17:07:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 7
Created version '7' of model 'RandomForestClassifier'.


üèÉ View run thoughtful-fox-683 at: http://localhost:5000/#/experiments/577837039507156569/runs/47ef22c9fe7540e8bb626c02a15ba3f2
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-f7eaaa54ac0b43748f2397751be0add3/artifacts
model: GradientBoostingClassifier_best_params


Successfully registered model 'GradientBoostingClassifier'.
2025/11/22 17:07:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GradientBoostingClassifier, version 1
Created version '1' of model 'GradientBoostingClassifier'.


üèÉ View run stately-yak-41 at: http://localhost:5000/#/experiments/577837039507156569/runs/bc34caf7b3fb4d6e87ec7127ac4bd156
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-88707672570d4d28af2aad7ed472bb0f/artifacts
model: KNeighborsClassifier_best_params


Successfully registered model 'KNeighborsClassifier'.
2025/11/22 17:07:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNeighborsClassifier, version 1
Created version '1' of model 'KNeighborsClassifier'.


üèÉ View run sincere-shark-92 at: http://localhost:5000/#/experiments/577837039507156569/runs/d116b75394b749a083885d2a14e25462
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-dcc950baa90c4e5aa5b44cbc38724131/artifacts
model: LogisticRegression_best_params


Successfully registered model 'LogisticRegression'.
2025/11/22 17:07:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 1
Created version '1' of model 'LogisticRegression'.


üèÉ View run traveling-elk-457 at: http://localhost:5000/#/experiments/577837039507156569/runs/8243d11fbef542b0bb7a0fd021ec274f
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-9e554fb408704d24ba857a8e9ace820d/artifacts
model: SVC_best_params


Successfully registered model 'SVC'.
2025/11/22 17:08:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SVC, version 1
Created version '1' of model 'SVC'.


üèÉ View run angry-hen-446 at: http://localhost:5000/#/experiments/577837039507156569/runs/4d655e914c5c464e8778dd5753636d06
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-e705c3d72fa541bbbaa86e612fe1a232/artifacts
model: MLPClassifier_best_params


Successfully registered model 'MLPClassifier'.
2025/11/22 17:08:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MLPClassifier, version 1


üèÉ View run beautiful-panda-793 at: http://localhost:5000/#/experiments/577837039507156569/runs/6a4765df9f774de1a094628efbe5f7b0
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-47eb2b1814bc463798155a7b60cbbca2/artifacts


Created version '1' of model 'MLPClassifier'.
