## Versión 2 modelos

In [14]:
# ==============================================================
# Entrenamiento y empaquetado de modelos predictivos
# Proyecto DSA - Transporte Minero
# ==============================================================
# Autores: Luis Cortes, Víctor Bonilla, Frank Díaz
# Versión: Final (Entrega 3)
# ==============================================================

# ========================================
# 1. Importación de librerías necesarias
# ========================================
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, roc_auc_score, classification_report
import joblib
import warnings
warnings.filterwarnings("ignore")

In [15]:
import mlflow

mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Proyecto_DSA_Experimentos")


<Experiment: artifact_location='file:///Users/luiscortes/Desktop/Proyecto_DSA/notebooks/../mlruns/330871551908708068', creation_time=1763321936030, experiment_id='330871551908708068', last_update_time=1763321936030, lifecycle_stage='active', name='Proyecto_DSA_Experimentos', tags={}>

In [16]:
# ========================================
# 2. Carga del dataset procesado
# ========================================
df = pd.read_csv("../data/df_final.csv")
print(" Dataset cargado correctamente:", df.shape)
display(df.head())


 Dataset cargado correctamente: (3348, 42)


Unnamed: 0,Equipment,DateTime,Payload,FuelBurned,CycleTime,DistanceTravelled,EmptyTravelTime,EmptyTravelDistance,EmptyStopTime,LoadTime,...,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Shift_Night,CycleClass_Ineficiente
0,2784,2025-08-31 23:20:45,215.2,108.36,42.63,9.0,14.28,4.4,5.47,1.75,...,False,False,False,False,False,False,False,True,True,True
1,2781,2025-08-31 23:22:14,225.3,174.6,52.4,10.5,17.25,4.9,0.15,5.38,...,False,False,False,False,False,False,False,True,True,True
2,2780,2025-08-31 23:27:39,193.4,146.21,42.43,10.1,11.55,4.9,4.47,1.85,...,False,False,False,False,False,False,False,True,True,True
3,2782,2025-08-31 23:56:10,229.7,72.87,23.85,5.6,8.77,2.4,2.88,2.58,...,False,False,False,False,False,False,False,True,True,False
4,2784,2025-09-01 00:03:23,227.9,131.07,39.37,10.3,15.53,5.1,0.13,2.27,...,False,False,False,False,False,False,False,False,True,True


In [17]:
# ========================================
# 3. Definición de conjuntos de entrenamiento y prueba
# ========================================
df = df.sort_values("DateTime").reset_index(drop=True)
n = int(len(df) * 0.8)
train_df, test_df = df.iloc[:n], df.iloc[n:]

print(f"Train: {train_df.shape} | Test: {test_df.shape}")


Train: (2678, 42) | Test: (670, 42)


In [18]:
# ========================================
# 4. Selección de features comunes (para regresión)
# ========================================
exclude_common = [
    "Equipment", "DateTime", "StartLatLong", "DestinationLatLong",
    "CycleTime", "EfficientCycle"
]
exclude_cycleclass = [c for c in df.columns if c.startswith("CycleClass_")]
exclude_targets = ["EmptyStopTime", "LoadStopTime"]
exclude_colinear = ["DistanceTravelled", "EmptyTravelDistance", "LoadTravelDistance"]

exclude_cols = set(exclude_common + exclude_cycleclass + exclude_targets + exclude_colinear)
features = [c for c in df.columns if c not in exclude_cols]

print("Total de features utilizadas:", len(features))
print(features)


Total de features utilizadas: 30
['Payload', 'FuelBurned', 'EmptyTravelTime', 'LoadTime', 'LoadTravelTime', 'TotalStopTime', 'Hour_1', 'Hour_2', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6', 'Hour_7', 'Hour_8', 'Hour_9', 'Hour_10', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14', 'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20', 'Hour_21', 'Hour_22', 'Hour_23', 'Shift_Night']


In [19]:
# ========================================
# 5. Entrenamiento modelo 1: EmptyStopTime
# ========================================
X_tr = train_df[features].fillna(0)
X_te = test_df[features].fillna(0)
y_tr = train_df["EmptyStopTime"]
y_te = test_df["EmptyStopTime"]

with mlflow.start_run(run_name="reg_EmptyStopTime_V2_Tuned"):
    model_empty = RandomForestRegressor(
        n_estimators=214,
        max_depth=15,
        max_features=1.0,
        min_samples_leaf=1,
        min_samples_split=6,
        random_state=42,
        n_jobs=-1
    )
    model_empty.fit(X_tr, y_tr)
    preds = model_empty.predict(X_te)

    mae = mean_absolute_error(y_te, preds)
    rmse = np.sqrt(mean_squared_error(y_te, preds))

    mlflow.log_params(model_empty.get_params())
    mlflow.log_metrics({"MAE": mae, "RMSE": rmse})
    mlflow.sklearn.log_model(model_empty, artifact_path="model_EmptyStopTime_V2_Tuned")

print(f"Modelo EmptyStopTime entrenado | MAE: {mae:.2f} | RMSE: {rmse:.2f}")




Modelo EmptyStopTime entrenado | MAE: 0.92 | RMSE: 2.40


In [20]:
# ========================================
# 6. Entrenamiento modelo 2: LoadStopTime
# ========================================
y_tr = train_df["LoadStopTime"]
y_te = test_df["LoadStopTime"]

with mlflow.start_run(run_name="reg_LoadStopTime_V2_Tuned"):
    model_load = RandomForestRegressor(
        n_estimators=892,
        max_depth=8,
        max_features=0.8,
        min_samples_leaf=3,
        min_samples_split=4,
        random_state=42,
        n_jobs=-1
    )
    model_load.fit(X_tr, y_tr)
    preds = model_load.predict(X_te)

    mae = mean_absolute_error(y_te, preds)
    rmse = np.sqrt(mean_squared_error(y_te, preds))

    mlflow.log_params(model_load.get_params())
    mlflow.log_metrics({"MAE": mae, "RMSE": rmse})
    mlflow.sklearn.log_model(model_load, artifact_path="model_LoadStopTime_V2_Tuned")

print(f"Modelo LoadStopTime entrenado | MAE: {mae:.2f} | RMSE: {rmse:.2f}")




Modelo LoadStopTime entrenado | MAE: 0.88 | RMSE: 2.29


In [22]:
# ========================================
# 7. Entrenamiento modelo 3: Clasificación (Eficiencia del ciclo)
# ========================================
features_clf = [
    "Payload", "FuelBurned", "DistanceTravelled", "EmptyTravelTime", "EmptyTravelDistance",
    "LoadTime", "LoadTravelTime", "LoadTravelDistance", "TotalStopTime",
    "Hour_1", "Hour_2", "Hour_3", "Hour_4", "Hour_5", "Hour_6", "Hour_7",
    "Hour_8", "Hour_9", "Hour_10", "Hour_11", "Hour_12", "Hour_13", "Hour_14",
    "Hour_15", "Hour_16", "Hour_17", "Hour_18", "Hour_19", "Hour_20", "Hour_21",
    "Hour_22", "Hour_23", "Shift_Night"
]

X_tr = train_df[features_clf].fillna(0)
X_te = test_df[features_clf].fillna(0)
y_tr = train_df["EfficientCycle"]
y_te = test_df["EfficientCycle"]

with mlflow.start_run(run_name="clf_EfficientCycle_V2"):
    model_clf = GradientBoostingClassifier(random_state=42)
    model_clf.fit(X_tr, y_tr)
    preds = model_clf.predict(X_te)
    proba = model_clf.predict_proba(X_te)[:, 1]

    f1 = f1_score(y_te, preds)
    auc = roc_auc_score(y_te, proba)

    mlflow.log_metrics({"F1": f1, "ROC_AUC": auc})
    mlflow.sklearn.log_model(model_clf, artifact_path="model_EfficientCycle")

print(f"Modelo de clasificación entrenado | F1: {f1:.3f} | AUC: {auc:.3f}")




Modelo de clasificación entrenado | F1: 0.969 | AUC: 0.997


In [11]:
# ========================================
# 8. Empaquetado final de los modelos
# ========================================
joblib.dump(model_empty, "../models/model_EmptyStopTime_V2_Tuned.pkl")
joblib.dump(model_load, "../models/model_LoadStopTime_V2_Tuned.pkl")
joblib.dump(model_clf, "../models/model_EfficientCycle.pkl")

print(" Modelos guardados correctamente en carpeta /models")


 Modelos guardados correctamente en carpeta /models


In [23]:
# ========================================
# 9. Resumen final de métricas
# ========================================
print("====== MÉTRICAS FINALES ======")
print(f"EmptyStopTime → MAE: 1.33  | RMSE: 2.55")
print(f"LoadStopTime  → MAE: 0.93  | RMSE: 2.59")
print(f"EfficientCycle → F1: {f1:.3f} | ROC_AUC: {auc:.3f}")



EmptyStopTime → MAE: 1.33  | RMSE: 2.55
LoadStopTime  → MAE: 0.93  | RMSE: 2.59
EfficientCycle → F1: 0.969 | ROC_AUC: 0.997
