In [6]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, r2_score
import numpy as np
import joblib

# Cargar el dataset
file_path = './forestfires.csv'
df = pd.read_csv(file_path)

# Seleccionar características y la etiqueta (DC)
X = df.drop(columns=['DC', 'area'])
y = df['DC']

# Identificar características numéricas y categóricas
numeric_features = ['X', 'Y', 'FFMC', 'DMC', 'ISI', 'temp', 'RH', 'wind', 'rain']
categorical_features = ['month', 'day']

# Preprocesamiento: Estandarizar características numéricas y codificar categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Configuración del modelo (Random Forest)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Implementar K-Fold Cross-Validation
k = 5  # Número de folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Realizar Cross-Validation y calcular R² para cada fold
cv_scores = cross_val_score(model_pipeline, X, y, cv=kf, scoring=make_scorer(r2_score))

# Manejar posibles NaN y calcular la desviación estándar
cv_scores = np.nan_to_num(cv_scores)  # Reemplazar NaN con 0 o manejar según convenga
mean_r2 = np.mean(cv_scores)
std_r2 = np.std(cv_scores)

# Mostrar resultados
print(f"R² en cada fold: {cv_scores}")
print(f"R² promedio: {mean_r2}")
print(f"Desviación estándar de R²: {std_r2}")

# Entrenar el modelo en todos los datos
model_pipeline.fit(X, y)

# Guardar el modelo entrenado
joblib.dump(model_pipeline, 'dc_prediction_random_forest_model.pkl')

# Cargar el modelo y hacer una predicción con nuevos datos
model_loaded = joblib.load('dc_prediction_random_forest_model.pkl')

# Ejemplo de nuevos datos
new_data = pd.DataFrame({
    'X': [7],
    'Y': [5],
    'FFMC': [86.2],
    'DMC': [26.2],
    'ISI': [5.1],
    'temp': [22.0],
    'RH': [45],
    'wind': [3.1],
    'rain': [0.0],
    'month': ['mar'],
    'day': ['fri']
})

# Realizar la predicción
prediction = model_loaded.predict(new_data)
print(f"Predicción del DC: {prediction[0]}")


Traceback (most recent call last):
  File "/home/mcabre/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
  File "/home/mcabre/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
  File "/home/mcabre/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
  File "/home/mcabre/.local/lib/python3.10/site-packages/sklearn/utils/_response.py", line 239, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
  File "/home/mcabre/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 603, in predict
    Xt = transform.transform(Xt)
  File "/home/mcabre/.local/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/mcabre/.local/lib/python3.10/site-packages/sklearn/compose/_column_transf

R² en cada fold: [0.96657687 0.90349636 0.97760181 0.97232099 0.        ]
R² promedio: 0.7639992079130905
Desviación estándar de R²: 0.38294020276127083
Predicción del DC: 164.8040000000001
