In [88]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import GroupShuffleSplit
import matplotlib.pyplot as plt
import numpy as np

In [109]:
modelo = CatBoostClassifier()
modelo.load_model("modelo_tachadas.cbm")

<catboost.core.CatBoostClassifier at 0x1d50d459640>

In [110]:
ruta_csv = r"C:\Users\Usuario\Downloads\SENSOR40_sintetico_match_format.csv"

df_raw = pd.read_csv(ruta_csv)
print("Archivo crudo cargado:", df_raw.shape)

df_raw.head()

Archivo crudo cargado: (669, 13)


Unnamed: 0,planta,año,sensor_id,timestamp,VOLT_HUM,VOLT_TEM,TEMPERATURA,HUMEDAD,Variedad,ID_tachada,HumedadInicial,HumedadFinal,source_file
0,RB,2025,2,2025-03-29 11:00:00,0.970032,3.316077,34.792926,24.72476,MERÍN,90001,24.72476,18.945422,SENSOR_SYN_1.txt
1,RB,2025,2,2025-03-29 11:05:00,0.951873,3.308344,34.700124,24.860952,MERÍN,90001,24.72476,18.945422,SENSOR_SYN_1.txt
2,RB,2025,4,2025-03-29 11:10:00,1.011143,3.346921,35.163047,24.416427,MERÍN,90001,24.72476,18.945422,SENSOR_SYN_1.txt
3,RB,2025,1,2025-03-29 11:15:00,1.01551,3.289278,34.471335,24.383677,MERÍN,90001,24.72476,18.945422,SENSOR_SYN_1.txt
4,RB,2025,3,2025-03-29 11:20:00,1.023274,3.352674,35.23209,24.325447,MERÍN,90001,24.72476,18.945422,SENSOR_SYN_1.txt


In [111]:
df = df_raw.copy()

# Convertir timestamp
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

# Eliminar filas sin timestamp válido
df = df.dropna(subset=["timestamp"])

# Eliminar filas sin valores de sensores
df = df.dropna(subset=["TEMPERATURA", "HUMEDAD"])

# Arreglar variedades
df["Variedad"] = (
    df["Variedad"]
    .fillna("MERÍN")
    .replace("L5903", "MERÍN")
    .astype(str)
)

# Eliminar voltajes (no se usan y no deben aparecer en el CSV final)
df = df.drop(columns=["VOLT_HUM", "VOLT_TEM"], errors="ignore")

print("Luego de limpieza:", df.shape)

Luego de limpieza: (669, 11)


In [112]:
def resumir_tachadas_v3(df):
    df = df.sort_values(["ID_tachada", "timestamp"])

    # ===============================================================
    # 1) FEATURES ORIGINALES (V1)
    # ===============================================================
    resumen = df.groupby("ID_tachada").agg(
        humedad_mean=("HUMEDAD", "mean"),
        humedad_std=("HUMEDAD", "std"),
        humedad_min=("HUMEDAD", "min"),
        humedad_max=("HUMEDAD", "max"),
        temp_mean=("TEMPERATURA", "mean"),
        temp_std=("TEMPERATURA", "std"),
        temp_min=("TEMPERATURA", "min"),
        temp_max=("TEMPERATURA", "max"),
        timestamp_min=("timestamp", "min"),
        timestamp_max=("timestamp", "max"),
        variedad=("Variedad", "first"),
        sensor_id=("sensor_id", "first"),
        año=("año", "first"),
        planta=("planta", "first")
    ).reset_index()

    resumen["humedad_range"] = resumen["humedad_max"] - resumen["humedad_min"]
    resumen["temp_range"] = resumen["temp_max"] - resumen["temp_min"]

    resumen["humedad_p25"] = df.groupby("ID_tachada")["HUMEDAD"].quantile(0.25).values
    resumen["humedad_p75"] = df.groupby("ID_tachada")["HUMEDAD"].quantile(0.75).values
    resumen["temp_p25"]     = df.groupby("ID_tachada")["TEMPERATURA"].quantile(0.25).values
    resumen["temp_p75"]     = df.groupby("ID_tachada")["TEMPERATURA"].quantile(0.75).values

    resumen["duracion_horas"] = (
        resumen["timestamp_max"] - resumen["timestamp_min"]
    ).dt.total_seconds() / 3600

    resumen["hora_inicio"] = resumen["timestamp_min"].dt.hour

    condiciones = [
        resumen["hora_inicio"].between(6, 12),
        resumen["hora_inicio"].between(12, 18),
        resumen["hora_inicio"].between(18, 24),
        resumen["hora_inicio"].between(0, 6)
    ]
    categorias = ["mañana", "tarde", "noche", "madrugada"]
    resumen["momento_dia"] = np.select(condiciones, categorias, default="desconocido")

    def slope(grupo, col):
        if len(grupo) > 1:
            return (grupo[col].iloc[-1] - grupo[col].iloc[0]) / (
                (grupo["timestamp"].iloc[-1] - grupo["timestamp"].iloc[0]).total_seconds() + 1e-9
            )
        return 0

    resumen["slope_temp"] = df.groupby("ID_tachada").apply(slope, "TEMPERATURA").values
    resumen["slope_hum"]  = df.groupby("ID_tachada").apply(slope, "HUMEDAD").values

    # ===============================================================
    # 2) FEATURES NUEVAS (V3)
    # ===============================================================
    UMBRAL_TEMP = 38
    UMBRAL_HUM  = 11
    HUM_FINAL_BUENA = 13

    def count_crossings(series, thr):
        return np.sum((series.shift(1) < thr) & (series >= thr))

    def count_shocks(series, delta):
        return np.sum(series.diff().abs() > delta)

    def time_in_zone(df_t, col, thr):
        return np.sum(df_t[col] > thr)

    def segmented_slope(series):
        n = len(series)
        if n < 4:
            return (0, 0)
        q1 = series.iloc[:n//4]
        q4 = series.iloc[-n//4:]
        slope_q1 = (q1.iloc[-1] - q1.iloc[0]) / (len(q1) + 1e-6)
        slope_q4 = (q4.iloc[-1] - q4.iloc[0]) / (len(q4) + 1e-6)
        return slope_q1, slope_q4

    nuevas_cols = [
        "temp_cross_38", "hum_cross_11",
        "temp_shocks_3", "hum_shocks_5",
        "temp_time_above_38", "hum_time_above_11",
        "slope_temp_Q1", "slope_temp_Q4",
        "slope_hum_Q1", "slope_hum_Q4",
        "drying_rate", "drop_ratio_temp",
        "drop_ratio_hum", "hum_final_above_13"
    ]

    for c in nuevas_cols:
        resumen[c] = np.nan

    for idx, row in resumen.iterrows():
        id_tach = row["ID_tachada"]
        df_tach = df[df["ID_tachada"] == id_tach].sort_values("timestamp")

        resumen.at[idx, "temp_cross_38"] = count_crossings(df_tach["TEMPERATURA"], UMBRAL_TEMP)
        resumen.at[idx, "hum_cross_11"]  = count_crossings(df_tach["HUMEDAD"], UMBRAL_HUM)

        resumen.at[idx, "temp_shocks_3"] = count_shocks(df_tach["TEMPERATURA"], 3)
        resumen.at[idx, "hum_shocks_5"]  = count_shocks(df_tach["HUMEDAD"], 5)

        resumen.at[idx, "temp_time_above_38"] = time_in_zone(df_tach, "TEMPERATURA", UMBRAL_TEMP)
        resumen.at[idx, "hum_time_above_11"]  = time_in_zone(df_tach, "HUMEDAD", UMBRAL_HUM)

        st1, st4 = segmented_slope(df_tach["TEMPERATURA"])
        sh1, sh4 = segmented_slope(df_tach["HUMEDAD"])
        resumen.at[idx, "slope_temp_Q1"] = st1
        resumen.at[idx, "slope_temp_Q4"] = st4
        resumen.at[idx, "slope_hum_Q1"]  = sh1
        resumen.at[idx, "slope_hum_Q4"]  = sh4

        resumen.at[idx, "drying_rate"] = (
            row["humedad_mean"] - row["humedad_min"]
        ) / (row["duracion_horas"] + 1e-6)

        resumen.at[idx, "drop_ratio_temp"] = row["temp_min"] / (row["temp_max"] + 1e-6)
        resumen.at[idx, "drop_ratio_hum"]  = row["humedad_min"] / (row["humedad_max"] + 1e-6)

        resumen.at[idx, "hum_final_above_13"] = int(row["humedad_min"] > HUM_FINAL_BUENA)

    return resumen

In [113]:
# Generar el resumen con todas las features v3
resumen = resumir_tachadas_v3(df)

print("Resumen generado correctamente:", resumen.shape)
resumen.head()

Resumen generado correctamente: (5, 40)


  resumen["slope_temp"] = df.groupby("ID_tachada").apply(slope, "TEMPERATURA").values
  resumen["slope_hum"]  = df.groupby("ID_tachada").apply(slope, "HUMEDAD").values


Unnamed: 0,ID_tachada,humedad_mean,humedad_std,humedad_min,humedad_max,temp_mean,temp_std,temp_min,temp_max,timestamp_min,...,temp_time_above_38,hum_time_above_11,slope_temp_Q1,slope_temp_Q4,slope_hum_Q1,slope_hum_Q4,drying_rate,drop_ratio_temp,drop_ratio_hum,hum_final_above_13
0,90001,21.687243,1.731148,18.740249,24.860952,38.489486,2.153269,34.471335,42.786219,2025-03-29 11:00:00,...,73.0,128.0,0.043243,0.089853,-0.038867,-0.040073,0.278456,0.805664,0.753803,1.0
1,90002,18.372005,1.013441,16.228682,20.444251,37.272479,4.017705,29.658432,43.990912,2025-03-25 13:00:00,...,63.0,137.0,0.086372,0.100591,-0.029858,-0.03968,0.189117,0.674195,0.793802,1.0
2,90003,18.025884,1.34289,15.479552,20.654033,41.064309,3.961726,33.859771,47.992719,2025-03-28 09:00:00,...,103.0,142.0,0.101798,0.074218,-0.024875,-0.030952,0.216709,0.705519,0.749469,1.0
3,90004,17.787457,1.427028,15.286893,20.233507,33.421246,1.926318,29.639417,37.100703,2025-03-24 23:00:00,...,0.0,113.0,0.093788,0.051661,-0.043244,-0.050445,0.267918,0.798891,0.755524,1.0
4,90005,15.679813,1.60704,12.696641,18.635193,37.537956,3.205394,31.550763,43.337719,2025-03-22 04:00:00,...,71.0,149.0,0.075145,0.075436,-0.026122,-0.042616,0.241879,0.728021,0.681326,0.0


In [114]:
features_modelo = [
    "humedad_mean", "humedad_std", "humedad_min", "humedad_max",
    "temp_mean", "temp_std", "temp_min", "temp_max",
    "variedad", "sensor_id",
    "humedad_range", "temp_range",
    "humedad_p25", "humedad_p75",
    "temp_p25", "temp_p75",
    "duracion_horas", "hora_inicio", "momento_dia",
    "slope_temp", "slope_hum",
    "temp_cross_38", "hum_cross_11",
    "temp_shocks_3", "hum_shocks_5",
    "temp_time_above_38", "hum_time_above_11",
    "slope_temp_Q1", "slope_temp_Q4",
    "slope_hum_Q1", "slope_hum_Q4",
    "drying_rate", "drop_ratio_temp", "drop_ratio_hum",
    "hum_final_above_13"
] 

In [115]:
# Columnas que el modelo NO usa y deben eliminarse antes de predecir
cols_borrar = ["ID_tachada", "planta", "año", "timestamp_min", "timestamp_max"]

resumen_pred = resumen.drop(columns=cols_borrar, errors="ignore").copy()

# Reordenar EXACTAMENTE en el orden del entrenamiento
resumen_pred = resumen_pred[features_modelo]

# Aseguramos columnas categóricas correctas
cat_cols = ["variedad", "sensor_id", "momento_dia"]

for c in cat_cols:
    resumen_pred[c] = resumen_pred[c].astype(str).fillna("DESCONOCIDO")

print("Dataset final para predecir:", resumen_pred.shape)

Dataset final para predecir: (5, 35)


In [116]:
# Umbral final del modelo elegido en la tesis
UMBRAL = 0.20

# Probabilidad de ser defectuosa
y_prob = modelo.predict_proba(resumen_pred)[:, 1]
resumen["probabilidad"] = y_prob

# Predicción final
resumen["prediccion"] = (y_prob >= UMBRAL).astype(int)

print("Predicciones generadas.")
resumen[["ID_tachada", "probabilidad", "prediccion"]].head(11)

Predicciones generadas.


Unnamed: 0,ID_tachada,probabilidad,prediccion
0,90001,0.31322,1
1,90002,0.11383,0
2,90003,0.366156,1
3,90004,0.069236,0
4,90005,0.01883,0


In [117]:

# Calcular humedad promedio de los últimos 30 minutos por tachada


resumen["hum_30fin_prom"] = np.nan

for idx, row in resumen.iterrows():
    id_tach = row["ID_tachada"]

    # USAR df (limpiado), NO df_raw
    df_tach = df[df["ID_tachada"] == id_tach].sort_values("timestamp")

    # Último timestamp
    fin = df_tach["timestamp"].max()

    # Marca 30 minutos antes
    inicio_30 = fin - pd.Timedelta(minutes=30)

    # Filtrar últimas mediciones
    df_ultimos30 = df_tach[df_tach["timestamp"] >= inicio_30]

    # Asignar promedio
    if len(df_ultimos30) > 0:
        resumen.at[idx, "hum_30fin_prom"] = df_ultimos30["HUMEDAD"].mean()
    else:
        resumen.at[idx, "hum_30fin_prom"] = np.nan


In [118]:
extra_cols = ["HumedadInicial", "HumedadFinal"]

resumen = resumen.merge(
    df_raw[["ID_tachada"] + extra_cols].drop_duplicates("ID_tachada"),
    on="ID_tachada",
    how="left"
)

In [119]:
nombre_archivo = "predicciones_tachadas.csv"

resumen.to_csv(nombre_archivo, index=False)

In [120]:
from IPython.display import FileLink
FileLink("predicciones_tachadas.csv")