In [27]:
import pandas as pd
import re

# MLflow
import mlflow
import mlflow.sklearn

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# MLflow configuration: tracking server address and the experiment that groups all runs
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "SMS_Spam_Classification"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

2025/02/23 17:19:55 INFO mlflow.tracking.fluent: Experiment with name 'SMS_Spam_Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/775910358339671536', creation_time=1740327595375, experiment_id='775910358339671536', last_update_time=1740327595375, lifecycle_stage='active', name='SMS_Spam_Classification', tags={}>

In [28]:
# 1. Data loading: read the SMS CSV, decoding the file as Latin-1
df = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin-1")

In [29]:
# Give the columns meaningful names (a no-op if 'v1'/'v2' are absent)
# and keep only the label and the message text.
df = (
    df
    .rename(columns={"v1": "label", "v2": "text"})
    .loc[:, ["label", "text"]]
)

In [30]:
# Quick look at the first rows of the dataset
primeras_filas = df.head()
print("=== Primeras filas del dataset ===")
print(primeras_filas, "\n")

=== Primeras filas del dataset ===
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro... 



In [31]:
# Number of messages per class
conteo_por_clase = df["label"].value_counts()
print("=== Distribución de clases ===")
print(conteo_por_clase, "\n")

=== Distribución de clases ===
label
ham     4825
spam     747
Name: count, dtype: int64 



In [32]:
def limpiar_texto(texto):
    """
    Normalise a single message before vectorisation.

    The text is lower-cased and every character that is not an ASCII letter
    or whitespace is removed (digits and punctuation are dropped outright,
    not replaced by a space).

    Parameters
    ----------
    texto : str or missing value
        Raw message. ``None`` and float ``NaN`` are treated as an empty
        message; any other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned message (possibly empty).
    """
    # A missing value has no .lower(); map it to an empty message instead of
    # raising AttributeError in the middle of a Series.apply.
    if texto is None or (isinstance(texto, float) and texto != texto):
        return ""

    texto = str(texto).lower()                  # lower-case
    texto = re.sub(r"[^a-zA-Z\s]", "", texto)   # strip everything but letters/whitespace
    return texto

# Cleaned copy of every message, stored next to the original text
df["text_clean"] = df["text"].map(limpiar_texto)

In [33]:
# 2. Features / target, then a stratified 80/20 train/test split
X, y = df["text_clean"], df["label"]  # y holds 'ham' or 'spam'

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Train/test split (features = cleaned text, target = label).
# NOTE(review): this cell repeats the split performed in the preceding cell.
# With the same inputs, test_size and random_state it reproduces the same
# partition, so one of the two cells can be removed.
X = df["text_clean"]
y = df["label"]  # 'ham' or 'spam'

# Stratified 80/20 split so both partitions keep the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity checks: every row lands in exactly one partition and the features
# stay aligned with their labels.
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index) and X_test.index.equals(y_test.index)
assert X_train.index.intersection(X_test.index).empty

In [35]:
def entrenar_y_registrar_modelo(n_estimators, random_state=42):
    """
    Train a TF-IDF + random-forest classifier and record the run in MLflow.

    Builds a ``Pipeline`` of ``TfidfVectorizer(stop_words="english")`` followed
    by ``RandomForestClassifier``, fits it on the notebook-level ``X_train`` /
    ``y_train``, evaluates it on ``X_test`` / ``y_test``, prints the
    classification report and confusion matrix, and logs parameters, metrics
    and the fitted pipeline inside a new MLflow run named
    ``SMS_Spam_RF_n<n_estimators>``.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest; also embedded in the run name and in
        the logged model's artifact path.
    random_state : int, default 42
        Seed passed to the classifier and logged as a run parameter.

    Returns
    -------
    None

    Notes
    -----
    Logged metrics are the weighted averages (``precision``, ``recall``,
    ``f1_score``) plus, for every class present in the report, the per-class
    values ``precision_<label>``, ``recall_<label>`` and ``f1_score_<label>``.
    """

    # One MLflow run per configuration; the name embeds n_estimators
    run_name = f"SMS_Spam_RF_n{n_estimators}"
    with mlflow.start_run(run_name=run_name):

        # 3.1 Pipeline: TF-IDF features feeding a random forest
        pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(stop_words="english")),
            ("clf", RandomForestClassifier(
                n_estimators=n_estimators,
                random_state=random_state
            ))
        ])

        # 3.2 Training
        pipeline.fit(X_train, y_train)

        # 3.3 Predictions and metrics on the held-out set
        y_pred = pipeline.predict(X_test)

        report = classification_report(y_test, y_pred, output_dict=True)
        promedio = report["weighted avg"]
        metricas = {
            "precision": promedio["precision"],
            "recall": promedio["recall"],
            "f1_score": promedio["f1-score"],
        }

        # The weighted average is dominated by the class with the most
        # support, so each class's own scores are recorded as well.
        for etiqueta in pipeline.classes_:
            por_clase = report.get(str(etiqueta))
            if por_clase is None:
                # Class seen in training but absent from y_test and y_pred
                continue
            metricas[f"precision_{etiqueta}"] = por_clase["precision"]
            metricas[f"recall_{etiqueta}"] = por_clase["recall"]
            metricas[f"f1_score_{etiqueta}"] = por_clase["f1-score"]

        # Console summary
        print(f"\n=== Resultados para n_estimators={n_estimators} ===")
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        print("Matriz de Confusión:")
        print(confusion_matrix(y_test, y_pred))

        # 3.4 Log parameters and metrics to MLflow
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("random_state", random_state)
        for nombre, valor in metricas.items():
            mlflow.log_metric(nombre, valor)

        # 3.5 Store the fitted pipeline (vectoriser + classifier) as a run artifact
        mlflow.sklearn.log_model(pipeline, f"modelo_sms_spam_rf_n{n_estimators}")

    print(f"Entrenamiento y registro completados para n_estimators={n_estimators}.")


In [None]:
# 4. Train and log one model per forest size so the runs can be compared in MLflow
valores_n_estimators = [50, 100, 200]
for n_arboles in valores_n_estimators:
    entrenar_y_registrar_modelo(n_arboles)

print("\n¡Todas las ejecuciones se han completado!")




=== Resultados para n_estimators=50 ===
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Matriz de Confusión:
[[966   0]
 [ 26 123]]




🏃 View run SMS_Spam_RF_n50 at: http://127.0.0.1:5000/#/experiments/775910358339671536/runs/52a14585c83248dfa5301977024043d4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/775910358339671536
Entrenamiento y registro completados para n_estimators=50.

=== Resultados para n_estimators=100 ===
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.80      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Matriz de Confusión:
[[966   0]
 [ 30 119]]




🏃 View run SMS_Spam_RF_n100 at: http://127.0.0.1:5000/#/experiments/775910358339671536/runs/9ac34bf146ef48d194a10fb434c02f07
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/775910358339671536
Entrenamiento y registro completados para n_estimators=100.

=== Resultados para n_estimators=200 ===
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.81      0.89       149

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Matriz de Confusión:
[[966   0]
 [ 29 120]]




🏃 View run SMS_Spam_RF_n200 at: http://127.0.0.1:5000/#/experiments/775910358339671536/runs/55ddd91c22404aff92f2c8f16726b5ae
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/775910358339671536
Entrenamiento y registro completados para n_estimators=200.

¡Todas las ejecuciones se han completado!
