In [0]:
# Importar las bibliotecas necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import mlflow
import mlflow.sklearn

In [0]:
# Leer la tabla de la base de datos default
df = spark.sql("SELECT * FROM default.personas_limpio_4_csv").toPandas()

# Mostrar las primeras filas del DataFrame
display(df)

gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893236911794663,never smoked,1
Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
Female,59.0,0,0,Yes,Private,Rural,76.15,28.893236911794663,Unknown,1
Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [0]:
# Convertir columnas necesarias a tipo numérico
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['hypertension'] = pd.to_numeric(df['hypertension'], errors='coerce')
df['heart_disease'] = pd.to_numeric(df['heart_disease'], errors='coerce')
df['avg_glucose_level'] = pd.to_numeric(df['avg_glucose_level'], errors='coerce')
df['bmi'] = pd.to_numeric(df['bmi'], errors='coerce')

In [0]:
# Codificar variables categóricas
df = pd.get_dummies(df, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], drop_first=True)

In [0]:
from sklearn.model_selection import train_test_split

# Dividir los datos en características y variable objetivo
X = df.drop(['stroke'], axis=1)
y = pd.to_numeric(df['stroke'], errors='coerce')

# Dividir los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [0]:
#Importe MLFlow para registrar los experimentos, el regresor de bosques aleatorios y la métrica de error cuadrático medio
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [0]:
# defina el servidor para llevar el registro de modelos y artefactos
#mlflow.set_tracking_uri('http://localhost:5000')
# registre el experimento
experiment = mlflow.set_experiment("/Shared/RandomForest1")

2024/11/09 03:59:08 INFO mlflow.tracking.fluent: Experiment with name '/Shared/RandomForest1' does not exist. Creating a new experiment.


In [0]:
# Ejecución de MLflow
with mlflow.start_run():
    # Definir los parámetros del modelo
    n_estimators = 200 
    max_depth = 6
    max_features = 4

    # Crear y entrenar el modelo Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
    rf.fit(X_train, y_train)

    # Realizar predicciones en el conjunto de prueba
    predictions = rf.predict(X_test)

    # Calcular la precisión del modelo
    accuracy = accuracy_score(y_test, predictions)

    # Registrar los parámetros
    mlflow.log_param("num_trees", n_estimators)
    mlflow.log_param("maxdepth", max_depth)
    mlflow.log_param("max_feat", max_features)

    # Registrar el modelo
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Registrar la métrica de precisión
    mlflow.log_metric("accuracy", accuracy)

    # Imprimir la precisión
    print("Accuracy:", accuracy)

# Evaluar el modelo
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))



Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Accuracy: 0.9419439008480104
Confusion Matrix:
 [[1444    0]
 [  89    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      1444
           1       0.00      0.00      0.00        89

    accuracy                           0.94      1533
   macro avg       0.47      0.50      0.49      1533
weighted avg       0.89      0.94      0.91      1533



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
# registre el experimento
experiment = mlflow.set_experiment("/Shared/RandomForest2")

2024/11/09 04:07:58 INFO mlflow.tracking.fluent: Experiment with name '/Shared/RandomForest2' does not exist. Creating a new experiment.


In [0]:
# Ejecución de MLflow
with mlflow.start_run():
    # Definir los parámetros del modelo
    n_estimators = 300 
    max_depth = 5
    max_features = 3

    # Crear y entrenar el modelo Random Forest Classifier
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
    rf.fit(X_train, y_train)

    # Realizar predicciones en el conjunto de prueba
    predictions = rf.predict(X_test)

    # Calcular la precisión del modelo
    accuracy = accuracy_score(y_test, predictions)

    # Registrar los parámetros
    mlflow.log_param("num_trees", n_estimators)
    mlflow.log_param("maxdepth", max_depth)
    mlflow.log_param("max_feat", max_features)

    # Registrar el modelo
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Registrar la métrica de precisión
    mlflow.log_metric("accuracy", accuracy)

    # Imprimir la precisión
    print("Accuracy:", accuracy)

# Evaluar el modelo
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))



Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Accuracy: 0.9419439008480104
Confusion Matrix:
 [[1444    0]
 [  89    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      1444
           1       0.00      0.00      0.00        89

    accuracy                           0.94      1533
   macro avg       0.47      0.50      0.49      1533
weighted avg       0.89      0.94      0.91      1533



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


In [0]:
# Espacio de búsqueda de hiperparámetros
param_distributions = {
    'n_estimators': [int(x) for x in np.linspace(100, 1000, 10)],  # Número de árboles
    'max_depth': [None] + [int(x) for x in np.linspace(10, 100, 10)],  # Profundidad máxima
    'min_samples_split': [2, 5, 10],  # Mínimo número de muestras requeridas para dividir un nodo
    'min_samples_leaf': [1, 2, 4],  # Mínimo número de muestras requeridas en una hoja
    'max_features': ['auto', 'sqrt', 'log2'],  # Número de características a considerar al dividir
    'bootstrap': [True, False]  # Si se usa bootstrap al construir los árboles
}


In [0]:
mlflow.set_experiment("/Shared/RandomForestRandomSearch")

2024/11/09 04:15:33 INFO mlflow.tracking.fluent: Experiment with name '/Shared/RandomForestRandomSearch' does not exist. Creating a new experiment.


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3267515622416847', creation_time=1731125733933, experiment_id='3267515622416847', last_update_time=1731125733933, lifecycle_stage='active', name='/Shared/RandomForestRandomSearch', tags={'mlflow.experiment.sourceName': '/Shared/RandomForestRandomSearch',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'ns.posada@uniandes.edu.co',
 'mlflow.ownerId': '8344094637587395'}>

In [0]:
# Crear el modelo Random Forest
rf = RandomForestClassifier(class_weight='balanced')

# Configurar RandomizedSearchCV
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=10,  # Puedes ajustar este número según el tiempo que tengas
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Iniciar una corrida en MLflow
with mlflow.start_run():
    # Ajustar el modelo con los datos de entrenamiento
    rf_random.fit(X_train, y_train)

    # Obtener los mejores hiperparámetros
    best_params = rf_random.best_params_
    best_rf = rf_random.best_estimator_

    # Hacer predicciones en el conjunto de prueba
    predictions = best_rf.predict(X_test)

    # Calcular la precisión
    accuracy = accuracy_score(y_test, predictions)

    # Registrar los mejores hiperparámetros en MLflow
    for param, value in best_params.items():
        mlflow.log_param(param, value)

    # Registrar la precisión en MLflow
    mlflow.log_metric("accuracy", accuracy)

    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(best_rf, "best-random-forest-model")

    # Imprimir la precisión y el informe de clasificación
    print("Mejores hiperparámetros encontrados:", best_params)
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


9 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/databricks/python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/databricks/python/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/databricks/python/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/databricks/python/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    rais

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Mejores hiperparámetros encontrados: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Accuracy: 0.9393346379647749
Confusion Matrix:
 [[1440    4]
 [  89    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97      1444
           1       0.00      0.00      0.00        89

    accuracy                           0.94      1533
   macro avg       0.47      0.50      0.48      1533
weighted avg       0.89      0.94      0.91      1533



In [0]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import mlflow
import mlflow.sklearn
from collections import Counter

# Configurar el experimento en MLflow (asegúrate de usar un path absoluto)
mlflow.set_experiment("/Shared/RandomForestWithSMOTE2")

# Aplicar SMOTE para sobremuestrear la clase minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Verificar la distribución de clases después de aplicar SMOTE
print("Distribución de clases después de SMOTE:", Counter(y_resampled))

# Iniciar una corrida en MLflow
with mlflow.start_run():
    # Crear y entrenar el modelo con los mejores hiperparámetros y los datos sobremuestreados
    rf = RandomForestClassifier(n_estimators=200, max_depth=6, max_features=4 , class_weight='balanced')
    rf.fit(X_resampled, y_resampled)

    # Realizar predicciones en el conjunto de prueba
    predictions = rf.predict(X_test)

    # Hacer predicciones en el conjunto de prueba
    predictions = rf.predict(X_test)

    # Calcular las métricas
    accuracy = accuracy_score(y_test, predictions)
    auc_score = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

    # Registrar los hiperparámetros y las métricas en MLflow
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", 6)
    mlflow.log_param("bootstrap", False)
    mlflow.log_param("max_features", 4)
    

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc_roc", auc_score)

    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(rf, "random-forest-model")

    # Imprimir las métricas
    print("Accuracy:", accuracy)
    print("AUC-ROC Score:", auc_score)
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))

2024/11/09 04:26:35 INFO mlflow.tracking.fluent: Experiment with name '/Shared/RandomForestWithSMOTE2' does not exist. Creating a new experiment.


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe614ca3600>
Traceback (most recent call last):
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'split'


Distribución de clases después de SMOTE: Counter({0: 3417, 1: 3417})




Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Accuracy: 0.8401826484018264
AUC-ROC Score: 0.8017367487316754
Confusion Matrix:
 [[1252  192]
 [  53   36]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.87      0.91      1444
           1       0.16      0.40      0.23        89

    accuracy                           0.84      1533
   macro avg       0.56      0.64      0.57      1533
weighted avg       0.91      0.84      0.87      1533

