In [1]:
from datetime import datetime
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import (
    GridSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    roc_curve, auc, 
    ConfusionMatrixDisplay, 
    classification_report
)
import mlflow
from mlflow.models import infer_signature


Configuramos el experimento con MLflow

In [2]:
# Connection settings for the MLflow tracking server used by this notebook.
MLFLOW_TRACKING_URI = "http://localhost:5000"
MLFLOW_EXPERIMENT_NAME = "failure_prediction_svm"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Reuse the experiment when it already exists on the server; otherwise
# create it and keep the id that `create_experiment` returns.
experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
experiment_id = (
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
    if experiment is None
    else experiment.experiment_id
)
print(f"Using experiment name {MLFLOW_EXPERIMENT_NAME} ID: {experiment_id}")

Using experiment name failure_prediction_svm ID: 1


Cargamos el dataset balanceado y dividido en train y test.

In [3]:
# Locations of the pre-split CSV files (the training split is the balanced
# one, going by its file name).
TRAIN_CSV_PATH = './data_train_balanced.csv'
TEST_CSV_PATH = './data_test.csv'

dataset_train = pd.read_csv(TRAIN_CSV_PATH)
dataset_test = pd.read_csv(TEST_CSV_PATH)

# Quick look at the first rows of the test split.
dataset_test.head()

Unnamed: 0,airtemperature_k,process_temperature_k,rotational_speed_rpm,torque_nm,tool_wear_min,type_l,type_m,target
0,300.5,309.8,1345,62.7,153,True,False,0
1,303.7,312.4,1513,40.1,135,True,False,0
2,302.5,311.4,1559,37.6,209,True,False,0
3,295.6,306.3,1509,35.8,60,False,False,0
4,300.5,310.0,1358,60.4,102,False,False,0


In [4]:
def _split_features_target(frame):
    """Return ``(features, target)`` where target is the 'target' column."""
    return frame.drop(columns=['target']), frame['target']


# Separate the predictors from the label for both splits.
X_train, y_train = _split_features_target(dataset_train)
X_test, y_test = _split_features_target(dataset_test)

Creamos el pipeline (preprocesamiento + clasificador SVM). Los hiperparámetros que queremos probar se definen más abajo, en la grilla de GridSearchCV.

In [5]:
# Mark the one-hot "type" indicator columns as categorical in both splits so
# they can be told apart from the numeric columns by dtype below.
for features in (X_train, X_test):
    for indicator in ('type_l', 'type_m'):
        features[indicator] = features[indicator].astype('category')

# Column groups are derived from the training frame's dtypes.
numeric_features = X_train.select_dtypes(exclude=['category']).columns
categorical_features = X_train.select_dtypes(include=['category']).columns

# Standardise the numeric columns; pass the categorical ones through as-is.
svm_transformers = [
    ("num", StandardScaler(), numeric_features),
    ("cat", "passthrough", categorical_features),
]
preprocessor_svm = ColumnTransformer(transformers=svm_transformers)

# Full pipeline: preprocessing followed by a default-configured SVC.
svm_steps = [
    ("preprocessor", preprocessor_svm),
    ("classifier", SVC()),
]
model_svm = Pipeline(steps=svm_steps)

Búsqueda de hiperparámetros con GridSearchCV

In [6]:
# Export the S3 credentials and endpoint into the kernel's environment
# (presumably consumed by MLflow's S3 artifact storage; the model artifact
# logged further down is stored under an s3:// URI).
# NOTE(review): the access key and secret are hardcoded here and are echoed
# into the cell output. Acceptable only for a throw-away local setup; load
# them from the environment / a secrets store before sharing this notebook.
%env AWS_ACCESS_KEY_ID=root  
%env AWS_SECRET_ACCESS_KEY=12345678
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9010

env: AWS_ACCESS_KEY_ID=root
env: AWS_SECRET_ACCESS_KEY=12345678
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9010


In [7]:
# Run name carries a timestamp so successive executions are distinguishable.
run_name_parent = \
    "best_hyperparams_" + datetime.today().strftime('%Y/%m/%d-%H:%M:%S')

# NOTE(review): nested=True is passed although no parent run is opened in
# this notebook -- confirm it is intended.
with mlflow.start_run(
    experiment_id=experiment_id, run_name=run_name_parent, nested=True):

    # Candidate hyper-parameters, addressed through the pipeline step name.
    param_grid = {
        'classifier__C': [0.01, 0.11, 10],
        'classifier__kernel': ['linear']
    }
    grid_search = GridSearchCV(model_svm, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    mlflow.log_params(grid_search.best_params_)
    # best_score_ is the mean cross-validated accuracy of the best candidate
    # (not accuracy on the data it was fitted on); the metric name is kept
    # unchanged so existing runs stay comparable.
    mlflow.log_metric("best_train_accuracy", grid_search.best_score_)

    mlflow.set_tags(
        tags={
            "project": "Predictive Maintenance",
            "optimizer_engine": "GridSearchCV",
            "model_family": "sklearn",
            "feature_set_version": 1,
        }
    )

    # GridSearchCV refits the best candidate on the whole training set by
    # default (refit=True), so the fitted pipeline is already available as
    # best_estimator_; rebuilding and refitting it by hand would only repeat
    # the same training pass.
    model_svm = grid_search.best_estimator_

    test_score = model_svm.score(X_test, y_test)
    mlflow.log_metric("best_test_accuracy", test_score)

    print("Best Accuracy train :", grid_search.best_score_)
    print("Best accuracy test:", test_score)
    print("Best params:", grid_search.best_params_)

    # Log the fitted pipeline as a model artifact and register it.
    artifact_path = "model"

    signature = infer_signature(X_train, model_svm.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=model_svm,
        artifact_path=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name="machine_failure_svm",
        metadata={"model_data_version": 1}
    )

    # Resolve where MLflow stored the model artifact; later cells reuse
    # model_uri (including its path layout), so its format is left as-is.
    model_uri = mlflow.get_artifact_uri(artifact_path)
    print(f"Model artifact saved in: {model_uri}")

Best Accuracy train : 0.8388538188112801
Best accuracy test: 0.849
Best params: {'classifier__C': 10, 'classifier__kernel': 'linear'}


Successfully registered model 'machine_failure_svm'.
2025/10/14 00:11:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: machine_failure_svm, version 1
Successfully registered model 'machine_failure_svm'.
2025/10/14 00:11:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: machine_failure_svm, version 1
Created version '1' of model 'machine_failure_svm'.
Created version '1' of model 'machine_failure_svm'.


Model artifact saved in: s3://mlflow/1/af7724d8bf0947c3951b1603427e878c/artifacts/model
🏃 View run best_hyperparams_2025/10/14-00:10:47 at: http://localhost:5000/#/experiments/1/runs/af7724d8bf0947c3951b1603427e878c
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [None]:
# Show the artifact URI produced by the training cell.
print(f"Model URI: {model_uri}")

# Try progressively more indirect ways of loading the model. Each fallback
# runs only when every previous attempt raised.
model_loaded = False

# Method 1: load straight from the artifact URI.
try:
    loaded = mlflow.sklearn.load_model(model_uri)
    print("✅ Successfully loaded model from URI")
    model_loaded = True
except Exception as uri_error:
    print(f"❌ Error loading from URI: {uri_error}")

# Method 2: load the latest version from the model registry.
if not model_loaded:
    try:
        loaded = mlflow.sklearn.load_model("models:/machine_failure_svm/latest")
        print("✅ Successfully loaded model from registry")
        model_loaded = True
    except Exception as registry_error:
        print(f"❌ Error loading from registry: {registry_error}")

# Method 3: load the model logged by the most recent run of the experiment.
# Failures here are not caught and propagate to the caller.
if not model_loaded:
    client = mlflow.MlflowClient()
    latest_run = client.search_runs(
        experiment_ids=[experiment_id],
        order_by=["start_time DESC"],
        max_results=1,
    )[0]
    latest_model_uri = f"runs:/{latest_run.info.run_id}/model"
    print(f"Trying latest run URI: {latest_model_uri}")
    loaded = mlflow.sklearn.load_model(latest_model_uri)
    print("✅ Successfully loaded model from latest run")

In [15]:
# Fetch the most recently started run of the experiment and load the model
# stored under its "model" artifact path.
client = mlflow.MlflowClient()
newest_runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["start_time DESC"],
    max_results=1,
)
latest_run = newest_runs[0]

latest_model_uri = f"runs:/{latest_run.info.run_id}/model"
print(f"Trying latest run URI: {latest_model_uri}")

loaded = mlflow.sklearn.load_model(latest_model_uri)
print("✅ Successfully loaded model from latest run")

Trying latest run URI: runs:/af7724d8bf0947c3951b1603427e878c/model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 1735.48it/s] 
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 1735.48it/s] 

✅ Successfully loaded model from latest run





Registramos el modelo para ponerlo en producción

In [None]:
client = mlflow.MlflowClient()
name = "machine_failure_svm_prod"
desc = "Classifier for machine failures detection"

# Create the production registered model. On a re-run the model already
# exists and the registry answers RESOURCE_ALREADY_EXISTS; only that case is
# tolerated so the notebook survives "Restart Kernel -> Run All". Any other
# registry error is re-raised untouched.
try:
    client.create_registered_model(
        name=name, description=desc)
except mlflow.exceptions.MlflowException as exc:
    if exc.error_code != "RESOURCE_ALREADY_EXISTS":
        raise
    print(f"Registered model '{name}' already exists; adding a new version")

# Store the pipeline's hyper-parameters as tags on the model version,
# together with the estimator class name and the test accuracy.
tags = model_svm.get_params()
tags["model"] = type(model_svm).__name__
tags["test-accuracy"] = test_score

# Create the model version. The run id is taken from the third-to-last path
# segment of model_uri, which relies on the ".../<run_id>/artifacts/model"
# layout printed by the training cell.
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Point the "champion" alias at this version so the online serving process
# can resolve the model by alias.
client.set_registered_model_alias(name, "champion", result.version)

Evaluación

In [None]:
# Human-readable names for the two classes, shared by the report and plot.
class_names = ['Not Faulty', 'Faulty']

# Classification report on the test data.
y_pred = model_svm.predict(X_test)
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# Confusion matrix, normalised over the true labels.
ConfusionMatrixDisplay.from_estimator(
    model_svm, X_test, y_test,
    display_labels=class_names,
    cmap=plt.cm.Blues,
    normalize='true',
)
plt.title('Matriz de Confusión Normalizada')
plt.show()

# ROC curve built from the SVM decision-function scores.
y_score = model_svm.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='blue', label='ROC curve (area = {:.2f})'.format(roc_auc))
# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], color='red', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc='lower right')
ax.grid()
plt.show()