Build Pipeline

In [8]:
# Separar características y etiquetas
X_train = pd.read_csv('./files/X_train.csv')
y_train = np.where(pd.read_csv('./files/y_train.csv') == 'Yes', 1, 0)  # Convertir etiquetas a 1/0
y_train = pd.DataFrame(y_train, columns=['RainTomorrow_Yes'])
# Separar características y etiquetas
X_test = pd.read_csv('./files/X_test.csv')
y_test = np.where(pd.read_csv('./files/y_test.csv') == 'Yes', 1, 0)  # Convertir etiquetas a 1/0
y_test = pd.DataFrame(y_test, columns=['RainTomorrow_Yes'])

In [2]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class DateMonthExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, date_column="Date", new_column="Month"):
        self.date_column = date_column
        self.new_column = new_column
    
    def fit(self, X, y=None):
        return self  # No necesita ajuste
    
    def transform(self, X):
        X = X.copy()  # Crear una copia para no modificar los datos originales
        X[self.date_column] = pd.to_datetime(X[self.date_column])  # Convertir a datetime
        X[self.new_column] = X[self.date_column].dt.month  # Extraer el mes
        X.drop(columns=[self.date_column], inplace=True)  # Eliminar la columna original
        return X

In [7]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ProbImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns  # Columnas a imputar
        self.value_probs = {}  # Diccionario para almacenar valores únicos y probabilidades
    
    def fit(self, X, y=None):
        for col in self.columns:
            values = X[col].dropna().value_counts(normalize=True)
            self.value_probs[col] = (values.index.values, values.values)
        return self
    
    def transform(self, X):
        X = X.copy()  # Crear una copia para no modificar el original
        for col in self.columns:
            if col in self.value_probs:
                values, probs = self.value_probs[col]
                X[col] = X[col].apply(
                    lambda x: np.random.choice(values, p=probs) if pd.isnull(x) else x
                )
        return X


In [9]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler

# Imputación para columnas numéricas con la mediana
def apply_median_imputation(X, columns):
    imputer_median = SimpleImputer(strategy="median")
    X[columns] = imputer_median.fit_transform(X[columns])
    return X

# Imputación usando KNN para columnas numéricas
def apply_knn_imputation(X, columns):
    knn_imputer = KNNImputer()
    X[columns] = knn_imputer.fit_transform(X[columns])
    return X

# Escalado de características numéricas
def apply_scaling(X):
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled


In [11]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class DummiesEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_cols):
        self.categorical_cols = categorical_cols
        
    def fit(self, X, y=None):
        return self  # No necesita ajuste
    
    def transform(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)  # Convertir a DataFrame si es un ndarray
        X = X.copy()  # Crear una copia para no modificar los datos originales
        
        # Verificar si las columnas están presentes en X
        missing_cols = [col for col in self.categorical_cols if col not in X.columns]
        if missing_cols:
            raise ValueError(f"Las siguientes columnas no están en el DataFrame: {missing_cols}")
        
        # Convertir las columnas categóricas a tipo string
        X.loc[:, self.categorical_cols] = X.loc[:, self.categorical_cols].astype(str)
        
        # Aplicar pd.get_dummies
        X = pd.get_dummies(X, columns=self.categorical_cols, drop_first=True)
        return X


In [12]:
import tensorflow as tf

def create_nn_model(input_shape, learning_rate=0.001025, dropout_rate=0.2454, l2_lambda=0.002, units_per_layer=116):
    model = tf.keras.models.Sequential()
    
    # Capa de entrada con el tamaño correcto
    model.add(tf.keras.layers.InputLayer(input_shape=input_shape))

    # Añadir capas ocultas con parámetros óptimos
    model.add(tf.keras.layers.Dense(
            units_per_layer, 
            activation='relu', 
            kernel_regularizer=tf.keras.regularizers.L2(l2_lambda)
        ))
    model.add(tf.keras.layers.Dropout(dropout_rate))

    # Capa de salida
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # Configurar optimizador
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model


In [20]:
from sklearn.metrics import f1_score
def train_and_evaluate_nn(X_train, y_train, X_test, y_test, categorical_cols, numerical_median_cols, numerical_knn_cols):
    # Transformaciones: imputación, codificación y escalado
    X_train = apply_median_imputation(X_train, numerical_median_cols)
    X_test = apply_median_imputation(X_test, numerical_median_cols)
    X_train = apply_knn_imputation(X_train, numerical_knn_cols)
    X_test = apply_knn_imputation(X_test, numerical_knn_cols)

    # Crear transformadores adicionales (imputación probabilística y extracción de fecha)
    prob_imputer = ProbImputer(columns=categorical_cols)
    date_transformer = DateMonthExtractor(date_column="Date", new_column="Month")
    
    # Transformar los datos de entrenamiento y prueba
    X_train = prob_imputer.fit_transform(X_train)
    X_test = prob_imputer.transform(X_test)
    X_train = date_transformer.fit_transform(X_train)
    X_test = date_transformer.transform(X_test)
    
    # Codificar variables categóricas
    encoder = DummiesEncoder(categorical_cols=categorical_cols)
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.transform(X_test)
    
    joblib.dump(X_train.columns.tolist(), './docker/train_columns.pkl')
    # Escalar las características
    X_train = apply_scaling(X_train)
    X_test = apply_scaling(X_test)
    
    # Guardar las columnas después del preprocesamiento durante el entrenamiento

    # Crear el modelo con el input_shape correcto
    input_shape = (X_train.shape[1],)  # El número de características
    
    model = create_nn_model(input_shape=input_shape)

    # Entrenamiento del modelo directamente con TensorFlow
    model.fit(X_train, y_train, epochs=81, batch_size=64, verbose=0)
    
    # Predicciones
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)  # Convertir a clases binarias
    
    # Evaluación del modelo
    
    f1 = f1_score(y_test, y_pred,average='weighted')
    print(f"F1-Score: {f1}")
    
    return model, f1


In [21]:
import joblib
# Variables de entrada (X_train, y_train, X_test, y_test ya deberían estar definidos previamente)
categorical_cols = ["WindDir3pm", "WindDir9am", "WindGustDir", "RainToday", "Location"]
numerical_median_cols = ["Pressure3pm", "Pressure9am", "Temp3pm", "Temp9am", "MinTemp", "MaxTemp"]
numerical_knn_cols = ["Evaporation", "Rainfall", "Humidity3pm", "WindSpeed3pm", "WindSpeed9am", "Cloud3pm", "Cloud9am", "Humidity9am", "Sunshine", "WindGustSpeed"]

# Entrenar y evaluar el modelo
nn_model, f1 = train_and_evaluate_nn(X_train, y_train, X_test, y_test, categorical_cols, numerical_median_cols, numerical_knn_cols)


# Guardar el modelo entrenado
joblib.dump(nn_model, './docker/nn_model.pkl')




[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
F1-Score: 0.83918068571204


['./docker/nn_model.pkl']