In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import joblib
import os

In [2]:
# Configuración de la URI de MLflow
mlflow.set_tracking_uri("file:///mnt/mlruns")  
mlflow.set_experiment("forest_fires_experiment")

# Carga de datos del archivo CSV proporcionado
data_path = "forestfires_processed.csv"  # Ruta al archivo CSV
data = pd.read_csv(data_path)

# Separar las características (X) y la variable objetivo (y)
X = data.drop(["area", "log_area"], axis=1)
y = data["log_area"]

In [3]:
# Aplicar One-Hot Encoding a las columnas categóricas ('month', 'day')
X = pd.get_dummies(X, columns=['month', 'day'])

# División de los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Definir función para registrar y evaluar modelos
def evaluate_and_log_model(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run(run_name=model_name):
        # Entrenar el modelo
        model.fit(X_train, y_train)
        
        # Predicciones
        y_pred = model.predict(X_test)
        
        # Calcular RMSE (cambio recomendado)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # Cambiar a root_mean_squared_error si tu versión de sklearn lo permite
        
        # Calcular R²
        r2 = r2_score(y_test, y_pred)
        
        # Registrar parámetros, métricas y el modelo
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        
        # Registrar el modelo con un ejemplo de entrada para inferir la firma del modelo
        input_example = X_test.iloc[0:1]  # Ejemplo de entrada para MLflow
        mlflow.sklearn.log_model(model, model_name, input_example=input_example)
        
        # Imprimir resultados
        print(f"{model_name}: RMSE={rmse}, R²={r2}")

In [5]:
# Modelos a evaluar
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_split=4, random_state=42),
    'SVM': SVR(kernel='rbf'),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

In [6]:
# Evaluar cada modelo
for model_name, model in models.items():
    evaluate_and_log_model(model_name, model, X_train, X_test, y_train, y_test)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Linear Regression: RMSE=0.9598209079707113, R²=-0.09585080778158339




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Decision Tree: RMSE=1.3474090368745084, R²=-1.159583156682146




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Random Forest: RMSE=0.9898031912349464, R²=-0.1653831097364904




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

SVM: RMSE=1.011764436529863, R²=-0.21767065472046698




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Gradient Boosting: RMSE=1.0216572024038024, R²=-0.24159919422326204


In [7]:
# Guardar todos los modelos entrenados usando Joblib
for model_name, model in models.items():
    model_filename = f'models/{model_name.replace(" ", "_").lower()}.pkl'
    
    if not os.path.exists('models'):
        os.makedirs('models')
    
    joblib.dump(model, model_filename)
    print(f"Modelo {model_name} guardado en {model_filename}")

Modelo Linear Regression guardado en models/linear_regression.pkl
Modelo Decision Tree guardado en models/decision_tree.pkl
Modelo Random Forest guardado en models/random_forest.pkl
Modelo SVM guardado en models/svm.pkl
Modelo Gradient Boosting guardado en models/gradient_boosting.pkl


In [8]:
# Versionar todos los modelos con DVC
os.system("dvc add models/")
os.system("dvc push")

1

In [9]:
# Versionar los datos procesados que usaste
os.system("dvc add forestfires_processed.csv")
os.system("dvc push")

1