# MLflow Pipeline

In [1]:
# Importa√ß√µes

In [2]:
import mlflow
from mlflow.models import infer_signature
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Base de dados & Data understanding b√°sico

In [5]:
# Leitura da base
dataset_path = '../data/raw/heart.csv'
dataset = pd.read_csv(dataset_path)
# Sanity test - 5 primeiros registros
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [6]:
dataset.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [7]:
dataset.shape

(1025, 14)

In [8]:
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values

In [9]:
X, y

(array([[52.,  1.,  0., ...,  2.,  2.,  3.],
        [53.,  1.,  0., ...,  0.,  0.,  3.],
        [70.,  1.,  0., ...,  0.,  0.,  3.],
        ...,
        [47.,  1.,  0., ...,  1.,  1.,  2.],
        [50.,  0.,  0., ...,  2.,  0.,  2.],
        [54.,  1.,  0., ...,  1.,  1.,  3.]], shape=(1025, 13)),
 array([0, 0, 0, ..., 0, 1, 0], shape=(1025,)))

In [10]:
X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=0) 

In [11]:
# Sanity test - Training Data
X_train.shape, y_train.shape

((820, 13), (820,))

In [12]:
# Sanity test - Test Data
X_test.shape, y_test.shape

((205, 13), (205,))

In [13]:
# Cross validation / Tunning de hiperpar√¢metros

In [14]:
def generic_grid_search_cv(estimator_classifier, param_dict, X_matrix, y_matrix):
    """
    Executa um GridSearchCV para um classificador do Scikit-Learn.

    Esta fun√ß√£o recebe um estimador do Scikit-Learn (classificador),
    realiza uma busca em grade com valida√ß√£o cruzada (GridSearchCV) 
    utilizando os hiperpar√¢metros fornecidos e retorna o nome do 
    estimador e o melhor conjunto de hiperpar√¢metros encontrado.

    Args:
        estimator_classifier: Estimador Scikit-Learn, como 
            `DecisionTreeClassifier()`, `RandomForestClassifier()`, etc.
        param_dict (dict): Dicion√°rio contendo hiperpar√¢metros como chaves
            e listas de valores como op√ß√µes a serem testadas.
        X_matrix (numpy.ndarray): Matriz de features usada para o treinamento.
        y_matrix (numpy.ndarray): Vetor ou matriz de r√≥tulos alvo.

    Returns:
        estimator_name (str): Nome da classe do estimador recebido.
        best_params (dict): Melhor combina√ß√£o de hiperpar√¢metros encontrada 
            pelo GridSearchCV.
    """
    
    grid_search = GridSearchCV(estimator=estimator_classifier, param_grid=param_dict)
    grid_search.fit(X = X_matrix, y = y_matrix)
    best_params = grid_search.best_params_
    estimator_name = estimator_classifier.__class__.__name__
    
    return estimator_name, best_params

In [15]:
# Dicion√°rios de par√¢metros e valores para efetuar Cross validation

# Decision tree
tree_params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Random florest
rf_params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 40, 100, 150],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Gradient boosting classifier
gbc_params = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'n_estimators': [25, 50, 100, 200, 300, 500],
    'criterion': ['friedman_mse', 'squared_error']
}

# K-nearest neighbors classifier
knn_params = {
    'n_neighbors': [3, 5, 10, 20],
    'p': [1, 2]
}

# Logistic regression
lr_params = {
    'tol': [0.0001, 0.00001, 0.000001],
    'C': [1.0, 1.5, 2.0],
    'solver': ['lbfgs', 'sag', 'saga']
}

# Support vector machine
svm_params = {
    'tol': [0.001, 0.0001, 0.00001],
    'C': [1.0, 1.5, 2.0],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

# Neural network classifier
neural_net_params = {
    'activation': ['relu', 'logistic', 'tahn'],
    'solver': ['adam', 'sgd'],
    'batch_size': [10, 56]
}

all_params = {
    'tree_params': [tree_params, DecisionTreeClassifier()],
    'rf_params': [rf_params, RandomForestClassifier()],
    'gbc_params': [gbc_params, GradientBoostingClassifier()],
    'knn_params': [knn_params, KNeighborsClassifier()],
    'lr_params': [lr_params, LogisticRegression()],
    'svm_params': [svm_params, SVC()],
    'neural_net_params': [neural_net_params, MLPClassifier()],
}


In [16]:
best_params_dict = {}
for name, param in all_params.items():
    name, best_params = generic_grid_search_cv(estimator_classifier=param[1], param_dict=param[0], X_matrix=X, y_matrix=y)
    # os values provavelmente tem de ser salvos numa lista onde 0 = best_params e 1 = objeto
    best_params_dict[f'{name}_best_params'] = best_params, param[1]

best_params_dict

{'DecisionTreeClassifier_best_params': ({'criterion': 'entropy',
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'splitter': 'best'},
  DecisionTreeClassifier()),
 'RandomForestClassifier_best_params': ({'criterion': 'gini',
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 150},
  RandomForestClassifier()),
 'GradientBoostingClassifier_best_params': ({'criterion': 'friedman_mse',
   'learning_rate': 0.1,
   'loss': 'exponential',
   'n_estimators': 300},
  GradientBoostingClassifier()),
 'KNeighborsClassifier_best_params': ({'n_neighbors': 3, 'p': 1},
  KNeighborsClassifier()),
 'LogisticRegression_best_params': ({'C': 1.0,
   'solver': 'lbfgs',
   'tol': 0.0001},
  LogisticRegression()),
 'SVC_best_params': ({'C': 1.0, 'kernel': 'linear', 'tol': 0.001}, SVC()),
 'MLPClassifier_best_params': ({'activation': 'logistic',
   'batch_size': 10,
   'solver': 'adam'},
  MLPClassifier())}

In [17]:
# Definir o tracking ui
mlflow.set_tracking_uri(uri='http://localhost:5000')
# Definindo o experimento do registro
response = mlflow.set_experiment('Heart disease experiment')

In [18]:
# Tracking de modelos

In [19]:
# Random forest
mlflow_dataset = mlflow.data.from_pandas(
    dataset, source=dataset_path, name='heart-disease-dataset'
)

mlflow_X_matrix = mlflow.data.from_numpy(
    X, name='Features Matrix'
)

with mlflow.start_run():
    rf_model = RandomForestClassifier(**rf_params)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    
    mlflow.log_params(rf_params)
    
    mlflow.log_metric('Accuracy', accuracy_score(y_test, y_pred_rf))
    mlflow.log_metric('Precision', precision_score(y_test, y_pred_rf))
    mlflow.log_metric('Recall', recall_score(y_test, y_pred_rf))
    mlflow.log_metric('f1 score', f1_score(y_test, y_pred_rf))
    
    mlflow.set_tag('Training info', f'Basic Random forest for heart disease data')
    mlflow.set_tag('Scientist info', 'Test Test Test')
    
    signature = infer_signature(mlflow_dataset, y_pred_rf)
    
    model_info = mlflow.sklearn.log_model(
        sk_model=rf_model,
        name='rf_heart_model',
        input_example=X_train[[0]],
        registered_model_name="Random-forest-classifier",
    )
    
    mlflow.log_input(mlflow_dataset, context='project data')
    mlflow.log_input(mlflow_X_matrix, context='training')
    
print('-------------------------------------------')
print("Model URI: ", model_info.model_uri)
print("Model ID: ", model_info.model_id)
print("Artifact Path: ", model_info.artifact_path)
print("Model version: ", model_info.registered_model_version)
print('-------------------------------------------')

üèÉ View run upbeat-trout-51 at: http://localhost:5000/#/experiments/577837039507156569/runs/fdf65e587217437694f2db2b9883d013
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569


InvalidParameterError: The 'criterion' parameter of RandomForestClassifier must be a str among {'gini', 'log_loss', 'entropy'}. Got ['gini', 'entropy'] instead.

In [None]:
# Fun√ß√£o gen√©rica para registro de modelos

In [20]:
def train_and_log_model(estimator, X_train, X_test, y_train, y_test, params, dataset, dataset_path):

    """
    Fun√ß√£o "gen√©rica" de registro de modelos no MLflow.
    
    Esta √© uma fun√ß√£o gen√©rica para registrar os modelos ao MLflow. Ela recebe o classificador, as matrizes X e Y, os par√¢metros
    especificos do modelo (estimador), o dataset usado em seu treinamento e o caminho do dataset arquivo do dataset no projeto. 
    Sua funcionalidade se d√° em treinar o modelo e fazer seu registro de sua run no MLflow, al√©m de registrar outras informa√ß√µes como:
    par√¢metros, m√©tricas (accuracy, precision, recall & f1-score), exemplo de input, assinatura do modelo e o dataset usado.
    
    Args:
        estimator: Classificador do Scikit-Learn.
        X_train (numpy.ndarray): Matriz de features usada para o treinamento.
        y_train (numpy.ndarray): Vetor ou matriz de r√≥tulos alvo para treinamento.
        X_test (numpy.ndarray): Matriz de features usada para o teste.
        y_test (numpy.ndarray): Vetor ou matriz de r√≥tulos alvo para teste.
        params (dict): Dicion√°rio de melhores par√¢metros escolhidos via GridSearchCV.
        dataset (pandas.core.frame.DataFrame): 
    
    Returns:
        
    
    """
    
    mlflow_dataset = mlflow.data.from_pandas(
        dataset, 
        source=dataset_path,
        name=dataset_path.split('/')[-1]
    )
    
    
    # Wrapper de run
    with mlflow.start_run():
        
        # Treinamento do modelo
        # Obter o atributo nome do estimador
        estimator = estimator.__class__
        estimator_name = estimator.__name__
        # Inst√¢nciar do estimador com os par√¢metros escolhidos
        estimator = estimator(**params)
        # Fitagem dos dados no algoritmo
        estimator.fit(X_train, y_train)
        # Infer√™ncia do modelo
        y_pred = estimator.predict(X_test)
        
        # Registro no MLflow
        # Registro dos par√¢metros
        mlflow.log_params(params)
        
        # Registro das m√©tricas principais
        mlflow.log_metric('Accuracy', accuracy_score(y_test, y_pred))
        mlflow.log_metric('Precision', precision_score(y_test, y_pred))
        mlflow.log_metric('Recall', recall_score(y_test, y_pred))
        mlflow.log_metric('f1 score', f1_score(y_test, y_pred))
        
        # 
        mlflow.log_input(mlflow_dataset, context='raw-data')
        
        # Registro de tags para fins de documenta√ß√£o
        mlflow.set_tag('Training info', f'Basic {estimator_name} for heart disease data')
        mlflow.set_tag('Model origin', 'train and log models function')
        
        signature = infer_signature(X_train, estimator.predict(X_test))
        
        # Registro do modelo em si
        # A vari√°vel model_info cont√©m informa√ß√µes retornadas ap√≥s o tracking do modelo
        model_info = mlflow.sklearn.log_model(
            name=estimator_name,
            sk_model=estimator,
            signature=signature, 
            input_example=X_test[[0]],
            registered_model_name=estimator_name
        )
        
        return model_info
        

In [21]:
for name, param in best_params_dict.items():
    print(f'model: {name}')
    model_tacking = train_and_log_model(param[1], X_train, X_test, y_train, y_test, param[0], dataset, dataset_path)

    print(f'artifact path: {model_tacking.artifact_path}')

model: DecisionTreeClassifier_best_params


Registered model 'DecisionTreeClassifier' already exists. Creating a new version of this model...
2025/12/17 19:02:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeClassifier, version 11
Created version '11' of model 'DecisionTreeClassifier'.


üèÉ View run calm-chimp-684 at: http://localhost:5000/#/experiments/577837039507156569/runs/9b88793a67c642c1a94d8fbea5a66902
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-d6f0b2e3814449b091dca8d33190bb2f/artifacts
model: RandomForestClassifier_best_params


Registered model 'RandomForestClassifier' already exists. Creating a new version of this model...
2025/12/17 19:02:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 9
Created version '9' of model 'RandomForestClassifier'.


üèÉ View run melodic-steed-475 at: http://localhost:5000/#/experiments/577837039507156569/runs/84336f1526da4824b860d3e5cbb3c495
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-6c3acb397eb24c089a464444b683da7f/artifacts
model: GradientBoostingClassifier_best_params


Registered model 'GradientBoostingClassifier' already exists. Creating a new version of this model...
2025/12/17 19:02:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GradientBoostingClassifier, version 3
Created version '3' of model 'GradientBoostingClassifier'.


üèÉ View run resilient-foal-19 at: http://localhost:5000/#/experiments/577837039507156569/runs/668f492340654efd94aad3e394556d9a
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-c7f71e89c4474ccf8230cc93eb51c8af/artifacts
model: KNeighborsClassifier_best_params


Registered model 'KNeighborsClassifier' already exists. Creating a new version of this model...
2025/12/17 19:02:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNeighborsClassifier, version 3
Created version '3' of model 'KNeighborsClassifier'.


üèÉ View run redolent-bug-413 at: http://localhost:5000/#/experiments/577837039507156569/runs/723359675f674edca8973ca0113e9634
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-d4b52a1f88804961b5b481866d591ccb/artifacts
model: LogisticRegression_best_params


Registered model 'LogisticRegression' already exists. Creating a new version of this model...
2025/12/17 19:03:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 3
Created version '3' of model 'LogisticRegression'.


üèÉ View run chill-dove-83 at: http://localhost:5000/#/experiments/577837039507156569/runs/1f3ef9fcf38e4d1684796c263b07ca0e
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-a368dd2bb3b14501b422fc7e1dcd0ead/artifacts
model: SVC_best_params


Registered model 'SVC' already exists. Creating a new version of this model...
2025/12/17 19:03:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SVC, version 3
Created version '3' of model 'SVC'.


üèÉ View run delightful-mink-822 at: http://localhost:5000/#/experiments/577837039507156569/runs/dc265257b9424ff5bedc7cf834c18920
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-002d4702529a4069acf0a2c6893a7f33/artifacts
model: MLPClassifier_best_params


Registered model 'MLPClassifier' already exists. Creating a new version of this model...
2025/12/17 19:03:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MLPClassifier, version 3


üèÉ View run efficient-lark-778 at: http://localhost:5000/#/experiments/577837039507156569/runs/c57c7a43c26c4093b628a8e6bc77020f
üß™ View experiment at: http://localhost:5000/#/experiments/577837039507156569
artifact path: mlflow-artifacts:/577837039507156569/models/m-e5160317f6a9462397c5c072c9fc71c3/artifacts


Created version '3' of model 'MLPClassifier'.
