# 🎓 **Inteligencia Artificial Aplicada**

## 🤖 **Operaciones de aprendizaje automático (Gpo 10)**

### 🏛️ Tecnológico de Monterrey

#### 👨‍🏫 **Profesor titular :** Dr. Gerardo Rodríguez Hernández
#### 👩‍🏫 **Profesor titular :** Maestro Ricardo Valdez Hernández
#### 👩‍🏫 **Profesor tutor :** Jorge Gonzales Zapata

### 📊 **Fase 1 Proyecto MLOps**

#### 📅 **Octubre de 2025**

### 👥 Equipo 43

* 🧑‍💻 **A01795645 :** Alberto Campos Hernández
* 🧑‍💻 **A01016093 :** Oscar Enrique García García
* 🧑‍💻 **A01795922 :** Jessica Giovana García Gómez
* 🧑‍💻 **A01795897 :** Esteban Sebastián Guerra Espinoza
* 🧑‍💻 **A00820345 :** Rafael Sánchez Marmolejo

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew, kurtosis
from __future__ import annotations
from dataclasses import dataclass
from typing import Tuple, Optional, List
from pathlib import Path
# Global parallelism setting intended for all models.
# NOTE(review): N_JOBS is rebound to -1 in the training cell further down,
# so this value only applies to code executed before that cell.
N_JOBS = 2

In [83]:
# ==========================
# CARGA DE ARCHIVOS
# ==========================
@dataclass
class CargaArchivos:
    """Read a raw CSV dataset from a folder.

    Parameters
    ----------
    carpeta_raw: str | Path
        Folder that holds the CSV files. It is converted to ``Path`` and
        created (including parents) on construction if it does not exist.
    nombre_modificado: str
        File name of the "modified" CSV inside ``carpeta_raw``,
        e.g. power_tetouan_city_modified.csv
    """

    carpeta_raw: Path
    nombre_modificado: str

    def __post_init__(self) -> None:
        """Normalise the folder to ``Path`` and make sure it exists."""
        carpeta = Path(self.carpeta_raw)
        carpeta.mkdir(parents=True, exist_ok=True)
        self.carpeta_raw = carpeta

    def leer(self) -> pd.DataFrame:
        """Return the CSV as a DataFrame.

        On top of pandas' default NA markers, the strings "nan", "NAN",
        "NaT" and the empty string are read as missing values.
        """
        ruta_csv = self.carpeta_raw / self.nombre_modificado
        marcadores_na = ["nan", "NAN", "NaT", ""]
        return pd.read_csv(
            ruta_csv,
            keep_default_na=True,
            na_values=marcadores_na,
        )


# ==========================
# PREPROCESAMIENTO
# ==========================
@dataclass
class Preprocesamiento:
    """Turn the "modified" dataset into a model-ready dataset.

    Steps applied by :meth:`ejecutar`, in order:
    - Convert the columns at positions 1..8 to numeric (decimal commas allowed).
    - Drop the "mixed_type_col" column if it exists.
    - Clean and parse the date column, retrying with an explicit format.
    - Impute a missing timestamp from its two neighbours (previous + 10 min
      when they are 20 min apart, otherwise their midpoint).
    - Keep a single row per timestamp (the one with the most non-null cells).
    - Impute numeric NaN with the column median.
    - Replace IQR outliers with a centered rolling median (configurable window).
    - Add calendar features and optionally drop the date column.
    """

    @staticmethod
    def _tranformar_numerica(df: pd.DataFrame) -> pd.DataFrame:
        """Coerce the columns at positions 1..8 to numeric, in place.

        Values are handled as text first so that decimal commas ("1,5") and
        surrounding blanks are accepted; anything that still fails to parse
        becomes NaN. Returns the same (mutated) frame.
        """
        # The measurement columns are selected by position, not by name.
        cols = df.columns[1:9]
        if len(cols) == 0:
            return df
        df[cols] = (
            df[cols]
            .astype(str)
            .apply(lambda s: s.str.replace(",", ".", regex=False).str.strip())
            .apply(pd.to_numeric, errors="coerce")
        )
        return df

    @staticmethod
    def _drop_col_si_existe(df: pd.DataFrame, col: str) -> pd.DataFrame:
        """Return ``df`` without column ``col``; a missing column is ignored."""
        return df.drop(columns=[col], errors="ignore")

    @staticmethod
    def _limpiar_parsear_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
        """Parse column ``col`` as datetime, impute gaps and drop duplicates.

        The column is written back into ``df`` as datetimes. Rows whose
        timestamp could not be parsed or imputed are kept (with NaT); among
        rows sharing a timestamp only the one with the most non-null cells
        survives. The returned frame has a fresh 0..n-1 index.
        """
        s = (
            df[col].astype(str)
            .str.replace(r"[\r\n\t]+", " ", regex=True)
            .str.strip()
        )
        s = s.mask(s.eq(""))
        s = s.mask(s.str.lower().eq("nan"))

        dt = pd.to_datetime(s, errors="coerce")
        miss = dt.isna()
        # Second attempt with the explicit mm/dd/YYYY HH:MM format
        dt.loc[miss] = pd.to_datetime(
            s[miss], format="%m/%d/%Y %H:%M", errors="coerce"
        )

        # Neighbour-based imputation: only when both adjacent rows are valid
        prev = dt.shift(1)
        nxt = dt.shift(-1)
        fillable = dt.isna() & prev.notna() & nxt.notna()

        # Neighbours exactly 20 minutes apart -> previous + 10 minutes
        m10 = fillable & ((nxt - prev) == pd.Timedelta(minutes=20))
        dt.loc[m10] = prev.loc[m10] + pd.Timedelta(minutes=10)

        # Anything still missing between two valid neighbours -> midpoint.
        # Timedelta arithmetic avoids reinterpreting timestamps as raw integers.
        m_mid = fillable & dt.isna()
        if m_mid.any():
            dt.loc[m_mid] = prev[m_mid] + (nxt[m_mid] - prev[m_mid]) / 2

        df[col] = dt

        # De-duplicate on the requested column (not a hard-coded name),
        # preferring the most complete row for each timestamp.
        tmp = "__score__"
        valid = dt.notna()
        completeness = df.drop(columns=[col]).notna().sum(axis=1)
        candidates = df.loc[valid].copy()
        candidates[tmp] = completeness.loc[valid].to_numpy()
        keep = (
            candidates
            .sort_values([col, tmp], ascending=[True, False])
            .drop_duplicates(subset=[col], keep="first")
            .drop(columns=tmp)
        )

        parts = [keep]
        if not valid.all():
            parts.append(df.loc[~valid])
        return pd.concat(parts).sort_index().reset_index(drop=True)

    @staticmethod
    def _imputar_numericos_mediana(df: pd.DataFrame) -> pd.DataFrame:
        """Fill NaN in every numeric column with that column's median (in place)."""
        num_cols = df.select_dtypes(include="number").columns
        medianas = df[num_cols].median()
        df[num_cols] = df[num_cols].fillna(medianas)
        return df

    @staticmethod
    def _outliers_mediana_rodante(df: pd.DataFrame, col_fecha: str, ventana_mediana: int) -> pd.DataFrame:
        """Replace IQR outliers with a centered rolling median.

        Works on a copy sorted by ``col_fecha``. A value is an outlier when it
        lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of its column. Rows that
        still contain any null afterwards (including an unparsed date) are
        dropped.
        """
        df = df.sort_values(col_fecha).copy()
        num = df.select_dtypes("number").columns

        q1, q3 = df[num].quantile(0.25), df[num].quantile(0.75)
        iqr = q3 - q1
        lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        mask = (df[num] < lo) | (df[num] > hi)

        for c in num:
            rmed = df[c].rolling(window=ventana_mediana, center=True, min_periods=1).median()
            # Fall back to the global median if the rolling value is missing
            df.loc[mask[c], c] = rmed[mask[c]].fillna(df[c].median())

        return df.dropna()

    @staticmethod
    def _features_tiempo(df: pd.DataFrame, col_fecha: str) -> pd.DataFrame:
        """Add calendar features derived from the datetime column ``col_fecha``.

        Adds "Day", "Month", "Hour", "Minute", "Day of Week" (1 = Monday),
        "Quarter of Year" (1..4) and "Day of Year" to ``df`` and returns it.
        """
        dt = df[col_fecha]
        df["Day"] = dt.dt.day
        df["Month"] = dt.dt.month
        df["Hour"] = dt.dt.hour
        df["Minute"] = dt.dt.minute
        df["Day of Week"] = dt.dt.dayofweek + 1
        df["Quarter of Year"] = dt.dt.quarter
        df["Day of Year"] = dt.dt.dayofyear
        return df

    @staticmethod
    def _finalizar(df: pd.DataFrame, col_fecha: str, eliminar_datetime: bool) -> pd.DataFrame:
        """Drop rows with nulls and, if requested, the ``col_fecha`` column."""
        df = df.dropna().copy()
        if eliminar_datetime and col_fecha in df.columns:
            df = df.drop(columns=[col_fecha])
        return df

    @staticmethod
    def ejecutar(df_modificado: pd.DataFrame, *, ventana_mediana: int, eliminar_datetime: bool) -> pd.DataFrame:
        """Run every preprocessing step on a copy of ``df_modificado``.

        Parameters
        ----------
        df_modificado:
            Raw frame whose date column is named "DateTime" and whose columns
            at positions 1..8 hold the numeric measurements.
        ventana_mediana:
            Window size of the rolling median used to replace outliers.
        eliminar_datetime:
            Whether to drop "DateTime" once the calendar features exist.
        """
        df = df_modificado.copy()
        df = Preprocesamiento._tranformar_numerica(df)
        df = Preprocesamiento._drop_col_si_existe(df, "mixed_type_col")
        df = Preprocesamiento._limpiar_parsear_datetime(df, "DateTime")
        df = Preprocesamiento._imputar_numericos_mediana(df)
        df = Preprocesamiento._outliers_mediana_rodante(df, "DateTime", ventana_mediana)
        df = Preprocesamiento._features_tiempo(df, "DateTime")
        df = Preprocesamiento._finalizar(df, "DateTime", eliminar_datetime)
        return df

    @staticmethod
    def correr_pipeline(
        carpeta_raw: str | Path = "../data/raw",
        carpeta_processed: str | Path = "../data/processed",
        nombre_salida: str = "power_tetouan_city_processed1.csv",
        nombre_modificado: str = "power_tetouan_city_modified.csv",
        ventana_mediana: int = 25,
        eliminar_datetime: bool = True,
    ) -> Path:
        """Load the raw CSV, preprocess it and write the result to disk.

        Creates ``carpeta_processed`` if needed and returns the path of the
        CSV that was written (without the index column).
        """
        carpeta_processed = Path(carpeta_processed)
        carpeta_processed.mkdir(parents=True, exist_ok=True)

        loader = CargaArchivos(carpeta_raw, nombre_modificado)
        df_modificado = loader.leer()

        df_final = Preprocesamiento.ejecutar(
            df_modificado,
            ventana_mediana=ventana_mediana,
            eliminar_datetime=eliminar_datetime,
        )

        ruta_out = carpeta_processed / nombre_salida
        df_final.to_csv(ruta_out, index=False)
        return ruta_out

Prueba de las clases CargaArchivos y Preprocesamiento

In [86]:
# 1) Folder that holds the raw CSV
carpeta = Path("../data/raw")

# 2) Build the loader and read the modified dataset
carga = CargaArchivos(carpeta, "power_tetouan_city_modified.csv")
df = carga.leer()

In [87]:
# Run the full preprocessing pipeline and write the processed CSV
Preprocesamiento.correr_pipeline(
    carpeta_raw="../data/raw",
    carpeta_processed="../data/processed",
    nombre_salida="power_tetouan_city_processed.csv",
    nombre_modificado="power_tetouan_city_modified.csv",
    ventana_mediana=25,
    eliminar_datetime=True,
)

WindowsPath('../data/processed/power_tetouan_city_processed.csv')

Entrenamiento

In [92]:
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Default number of parallel workers; passed as n_jobs to the models and to
# cross_val_score in the Train class below. Rebinds the N_JOBS = 2 set in the
# first cell.
N_JOBS = -1

# Optional XGBoost: the model is only added to the candidates when the
# import succeeds (any exception raised while importing disables it).
try:
    from xgboost import XGBRegressor
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False

class Train:
    """Compare several regressors for one power-consumption zone.

    The frame is split chronologically by row position (first ``train_ratio``
    of the rows for training, the rest for testing). Candidate models are
    scored with repeated K-fold cross-validation on the training part; the
    best one is then refit on the whole training part and evaluated on the
    hold-out rows.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        target: str = "PowerConsumption_Zone2",
        num_cols = ('Temperature','Humidity','WindSpeed','GeneralDiffuseFlows','DiffuseFlows'),
        feature_range=(1,2),
        train_ratio: float = 0.80,
        random_state: int = 42
    ):
        """Store settings, build the split, the preprocessing and the models.

        ``df`` is copied and its columns are renamed by position, so it must
        have exactly the 15 columns listed below, in that order.
        """
        self.df = df.copy()
        self.target = target
        self.num_cols = list(num_cols)
        self.feature_range = feature_range
        self.train_ratio = train_ratio
        self.random_state = random_state

        # Positional rename to the names used by the rest of the class
        self.df.columns = [
            'Temperature', 'Humidity', 'WindSpeed', 'GeneralDiffuseFlows',
            'DiffuseFlows', 'PowerConsumption_Zone1',
            'PowerConsumption_Zone2', 'PowerConsumption_Zone3', 'Day',
            'Month', 'Hour', 'Minute', 'DayWeek', 'QuarterYear',
            'DayYear',
        ]

        # Chronological split: everything before the cut index is training data
        corte = int(len(self.df) * self.train_ratio)

        # All three zone columns are excluded from the features
        zonas = ['PowerConsumption_Zone1', 'PowerConsumption_Zone2', 'PowerConsumption_Zone3']
        self.X = self.df.drop(columns=zonas)
        self.y = self.df[[self.target]]

        self.x_train = self.X.iloc[:corte]
        self.y_train = self.y.iloc[:corte].values.ravel()
        self.x_test = self.X.iloc[corte:]
        self.y_test = self.y.iloc[corte:].values.ravel()

        # Preprocessing: median imputation + min-max scaling on num_cols,
        # every other column is passed through untouched
        pasos_numericos = [
            ('impMediana', SimpleImputer(strategy='median')),
            ('escalaNum', MinMaxScaler(feature_range=self.feature_range)),
        ]
        self.num_pipeline = Pipeline(steps=pasos_numericos)
        self.ct = ColumnTransformer(
            transformers=[('numpipe', self.num_pipeline, self.num_cols)],
            remainder='passthrough'
        )

        # Candidate models and their display names (parallel lists)
        self.modelos, self.nombres = self._mis_modelos()

        # Outputs, filled in by cross_validate() / fit_best()
        self.cv_results_ = None
        self.best_name_ = None
        self.best_estimator_ = None          # bare estimator
        self.best_pipeline_ = None           # pipeline(ct + model) fitted on train
        self.test_rmse_ = None

    def _mis_modelos(self):
        """Return ``(models, names)`` as two parallel lists.

        XGBoost is only included when XGBOOST_AVAILABLE is true.
        """
        catalogo = [
            ('RandomForest', RandomForestRegressor(
                n_estimators=700, min_samples_split=2, min_samples_leaf=1,
                max_features=3, random_state=self.random_state, n_jobs=N_JOBS
            )),
            ('ElasticNet', ElasticNet(
                alpha=0.1, l1_ratio=0.5, random_state=self.random_state, max_iter=5000
            )),
            ('GradientBoosting', GradientBoostingRegressor(
                n_estimators=500, learning_rate=0.05, max_depth=5,
                min_samples_split=5, min_samples_leaf=3, random_state=self.random_state
            )),
        ]

        if XGBOOST_AVAILABLE:
            catalogo.append(('XGBoost', XGBRegressor(
                n_estimators=500, learning_rate=0.05, max_depth=5,
                random_state=self.random_state, n_jobs=N_JOBS
            )))

        catalogo.append(('SVR', SVR(kernel='rbf', C=100, epsilon=0.1, gamma='scale')))

        modelos = [modelo for _, modelo in catalogo]
        nombres = [nombre for nombre, _ in catalogo]
        return modelos, nombres

    def cross_validate(self, n_splits=5, n_repeats=2, scoring='neg_mean_squared_error'):
        """Score every candidate with repeated K-fold CV on the training rows.

        Returns ``(table, details)``: ``table`` has one row per model with the
        mean and standard deviation of the per-fold RMSE, sorted by mean RMSE;
        ``details`` maps each model name to its array of per-fold RMSE values.
        The RMSE is computed as sqrt(-score), i.e. ``scoring`` is expected to
        be a negated MSE.
        """
        particiones = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=8)
        resumen = []
        detalles = {}

        for nombre, modelo in zip(self.nombres, self.modelos):
            flujo = Pipeline(steps=[('ct', self.ct), ('m', modelo)])
            puntajes = cross_val_score(
                flujo, self.x_train, self.y_train,
                scoring=scoring, cv=particiones, n_jobs=N_JOBS
            )
            rmse_folds = np.sqrt(-puntajes)
            detalles[nombre] = rmse_folds
            resumen.append({
                'model': nombre,
                'rmse_mean': rmse_folds.mean(),
                'rmse_std': rmse_folds.std(),
            })

        tabla = pd.DataFrame(resumen).sort_values('rmse_mean')
        self.cv_results_ = tabla.reset_index(drop=True)
        return self.cv_results_, detalles

    def fit_best(self):
        """Fit the best cross-validated model on train and score it on test.

        Runs cross_validate() with default arguments first if there are no CV
        results yet. Returns ``(fitted_pipeline, test_rmse)``.
        """
        sin_resultados = self.cv_results_ is None or self.cv_results_.empty
        if sin_resultados:
            self.cross_validate()

        # The table is sorted by mean RMSE, so row 0 is the winner
        self.best_name_ = self.cv_results_.iloc[0]['model']
        posicion = self.nombres.index(self.best_name_)
        self.best_estimator_ = self.modelos[posicion]

        self.best_pipeline_ = Pipeline(steps=[('ct', self.ct), ('m', self.best_estimator_)])
        self.best_pipeline_.fit(self.x_train, self.y_train)

        predicciones = self.best_pipeline_.predict(self.x_test)
        mse_test = mean_squared_error(self.y_test, predicciones)
        self.test_rmse_ = float(np.sqrt(mse_test))
        return self.best_pipeline_, self.test_rmse_

    def predict(self, X_new: pd.DataFrame):
        """Predict with the fitted best pipeline; requires fit_best() first."""
        if self.best_pipeline_ is None:
            raise RuntimeError("Primero ejecuta fit_best().")
        return self.best_pipeline_.predict(X_new)

    def get_best(self):
        """Return name, pipeline, test RMSE and a copy of the CV table."""
        if self.best_pipeline_ is None:
            raise RuntimeError("Aún no hay modelo entrenado. Llama a fit_best().")
        resumen = {
            'name': self.best_name_,
            'pipeline': self.best_pipeline_,
            'test_rmse': self.test_rmse_,
            'cv_table': self.cv_results_.copy(),
        }
        return resumen

In [93]:
# 1) Folder that holds the processed CSV
carpeta = Path("../data/processed")

# 2) Reuse the CSV loader to read the processed dataset
carga = CargaArchivos(carpeta, "power_tetouan_city_processed.csv")
df = carga.leer()

# 3) Cross-validate the candidates, refit the best one and report results
train = Train(df)
cv_table, _ = train.cross_validate()
best_pipe, test_rmse = train.fit_best()
print(cv_table)
print("Mejor:", train.best_name_, "RMSE test:", round(test_rmse, 3))

# 4) Predictions of the best pipeline on the hold-out rows
y_pred = train.predict(train.x_test)

              model    rmse_mean   rmse_std
0      RandomForest   863.924245  35.264463
1           XGBoost   951.048584  32.452242
2  GradientBoosting   961.329366  32.197826
3        ElasticNet  3307.388993  27.333673
4               SVR  3374.946358  28.530995
Mejor: RandomForest RMSE test: 3700.542
