# Model training

## LightGBM

In [1]:
from pathlib import Path
import pandas as pd, numpy as np, gc, time, json, math, random
import lightgbm as lgb

In [2]:
FEATS_DIR = Path("features_ready")
files = sorted(FEATS_DIR.glob("*_features.parquet"))
assert files, "No hay archivos en features_ready/*.parquet"

In [3]:
SAMPLE_ROWS = 100_000

cat_cols = ["LINEA","DIR","proxima_est_teorica","DIR_init"]
num_cols = [
    "dist_a_prox_m","dist_estacion_m","vel_mps","Altitud (m)","s_m","dist_m",
    "time_diff","dwell_same_xy_s","is_no_progress","progress_event","hour","dow",
    "is_weekend","is_peak"
]
extra_cols = ["trip_id","Fecha"]
target_col = "ETA_proxima_est_s"
all_needed = list(dict.fromkeys(cat_cols + num_cols + extra_cols + [target_col]))

In [4]:
# -------------------------------
# PASO 1: Construir trip_meta por archivo (muy barato en RAM)
# -------------------------------
trip_meta_parts = []
for f in files:
    # Lee solo columnas mínimas
    cols_min = [c for c in ["trip_id","LINEA","DIR","Fecha"] if c in pd.read_parquet(f, columns=["trip_id"]).columns or True]
    dfm = pd.read_parquet(f, columns=list(set(cols_min)))
    # Asegura tipos básicos
    if not np.issubdtype(dfm["Fecha"].dtype, np.datetime64):
        dfm["Fecha"] = pd.to_datetime(dfm["Fecha"], errors="coerce")
    g = dfm.groupby("trip_id").agg(
        n=("trip_id","size"),
        LINEA=("LINEA","first"),
        DIR=("DIR","first"),
        t0=("Fecha","min")
    ).reset_index()
    g["src_file"] = f.name  # de dónde viene cada trip
    trip_meta_parts.append(g)
    del dfm, g; gc.collect()

trip_meta = pd.concat(trip_meta_parts, ignore_index=True)
del trip_meta_parts; gc.collect()

# Limpieza menor
trip_meta = trip_meta.dropna(subset=["t0"])
trip_meta["line_dir"] = trip_meta["LINEA"].astype("string").fillna("NA") + "|" + trip_meta["DIR"].astype("string").fillna("NA")

total_rows = int(trip_meta["n"].sum())
print(f"Trips totales: {len(trip_meta):,} | filas totales (estimadas): {total_rows:,}")

Trips totales: 54,360 | filas totales (estimadas): 35,612,037


In [5]:
# -------------------------------
# PASO 2: Muestreo estratificado por LINEA|DIR a nivel de trip
# -------------------------------
rng = random.Random(42)

# cuota por bucket ≈ proporcional al tamaño
bucket = trip_meta.groupby("line_dir")["n"].sum()
bucket_quota = (SAMPLE_ROWS * (bucket / bucket.sum())).round().astype(int).to_dict()

chosen_ids = []
rows_acc = 0
for ld, sub in trip_meta.groupby("line_dir"):
    quota = bucket_quota.get(ld, 0)
    if quota <= 0:
        continue
    # barajar
    idx = list(sub.index)
    rng.shuffle(idx)
    for i in idx:
        if rows_acc >= SAMPLE_ROWS: break
        chosen_ids.append(trip_meta.at[i, "trip_id"])
        rows_acc += int(trip_meta.at[i, "n"])
    if rows_acc >= SAMPLE_ROWS:
        break

# Si faltan filas, llena con el resto barajado
if rows_acc < SAMPLE_ROWS:
    print("Llenando con trips adicionales...")
    rest = trip_meta.loc[~trip_meta["trip_id"].isin(chosen_ids)].index.tolist()
    rng.shuffle(rest)
    for i in rest:
        if rows_acc >= SAMPLE_ROWS: break
        chosen_ids.append(trip_meta.at[i, "trip_id"])
        rows_acc += int(trip_meta.at[i, "n"])

chosen_ids = pd.Index(chosen_ids).unique()
chosen_meta = trip_meta.loc[trip_meta["trip_id"].isin(chosen_ids)]
print(f"Elegidos: {len(chosen_meta):,} trips | ~{int(chosen_meta['n'].sum()):,} filas estimadas")


Elegidos: 8,032 trips | ~5,569,928 filas estimadas


In [6]:
# -------------------------------
# PASO 3: Re-leer por archivo SOLO los trips elegidos y SOLO columnas necesarias
#         + decimar si nos pasamos del target
# -------------------------------
data_parts = []
rows_loaded = 0
for f in files:
    sel_trips = chosen_meta.loc[chosen_meta["src_file"] == f.name, "trip_id"]
    if sel_trips.empty:
        continue
    # Lee mínimo necesario del archivo
    avail_cols = pd.read_parquet(f, columns=None).columns
    use_cols = [c for c in all_needed if c in avail_cols]
    df = pd.read_parquet(f, columns=use_cols)
    df = df.loc[df["trip_id"].isin(sel_trips)]
    # Tipos
    if not np.issubdtype(df["Fecha"].dtype, np.datetime64):
        df["Fecha"] = pd.to_datetime(df["Fecha"], errors="coerce")
    for c in (set(cat_cols) & set(df.columns)):
        df[c] = df[c].astype("category")
    # Mantén orden temporal
    df.sort_values(["trip_id","Fecha"], inplace=True)
    data_parts.append(df)
    rows_loaded += len(df)
    del df; gc.collect()

data_s = pd.concat(data_parts, ignore_index=True) if data_parts else pd.DataFrame(columns=all_needed)
del data_parts; gc.collect()

print(f"Agregadas {len(data_s):,} filas antes de decimar")

# Si te pasaste mucho del target, decima por trip de forma temporal
""" if len(data_s) > int(SAMPLE_ROWS * 1.25):
    factor = len(data_s) / SAMPLE_ROWS
    # toma cada k-ésimo por trip (k≈factor, mínimo 1)
    k = max(1, int(np.ceil(factor)))
    data_s = (
        data_s
        .sort_values(["trip_id","Fecha"])
        .groupby("trip_id", group_keys=False)
        .apply(lambda g: g.iloc[::k])
        .reset_index(drop=True)
    )
    print(f"Después de decimar k={k}: {len(data_s):,} filas") """

# Limpieza final
data_s = data_s.dropna(subset=[target_col, "Fecha"])
data_s = data_s.loc[data_s[target_col] >= 0].reset_index(drop=True)
print(f"Muestra final: {len(data_s):,} filas; trips: {data_s['trip_id'].nunique():,}")

Agregadas 5,569,928 filas antes de decimar
Muestra final: 5,569,928 filas; trips: 107


In [22]:
# Encontrar en data un tiempo específico y mostrar filas
specific_time = pd.to_datetime("2024-08-27 19:20:50")
specific_rows = data_s[data_s["Fecha"] == specific_time]
specific_rows.columns

Index(['LINEA', 'DIR', 'proxima_est_teorica', 'DIR_init', 'dist_a_prox_m',
       'dist_estacion_m', 'vel_mps', 'Altitud (m)', 's_m', 'dist_m',
       'time_diff', 'dwell_same_xy_s', 'is_no_progress', 'progress_event',
       'hour', 'dow', 'is_weekend', 'is_peak', 'trip_id', 'Fecha',
       'ETA_proxima_est_s'],
      dtype='object')

In [7]:
# 1) Corrige dtypes del target
if target_col in data_s.columns:
    if pd.api.types.is_timedelta64_dtype(data_s[target_col]):
        data_s[target_col] = data_s[target_col].dt.total_seconds()
    data_s[target_col] = pd.to_numeric(data_s[target_col], errors="coerce")

# 2) Convierte columnas numéricas
for c in num_cols:
    if c not in data_s.columns:
        continue
    col = data_s[c]
    # timedelta -> segundos
    if pd.api.types.is_timedelta64_dtype(col):
        data_s[c] = col.dt.total_seconds()
        col = data_s[c]
    # object -> numérico (coerce NaN)
    if col.dtype == "object":
        data_s[c] = pd.to_numeric(col, errors="coerce")
        col = data_s[c]
    # bool -> uint8 (o deja bool, ambos sirven)
    if col.dtype == "bool":
        data_s[c] = col.astype("uint8")
        col = data_s[c]
    # compacta a float32/int32
    if pd.api.types.is_float_dtype(col):
        data_s[c] = col.astype("float32")
    elif pd.api.types.is_integer_dtype(col):
        # ojo con IDs; estas son features numéricas, no IDs
        data_s[c] = col.astype("int32")
        
# 3) Categóricas: asegura dtype 'category'
for c in cat_cols:
    if c in data_s.columns:
        # si viene object (strings), pásala a category
        data_s[c] = data_s[c].astype("category")
        
# 4) Limpieza mínima de filas inválidas
need_cols = [target_col] + num_cols + cat_cols
present = [c for c in need_cols if c in data_s.columns]
data_s = data_s.dropna(subset=[target_col]).reset_index(drop=True)

In [23]:
# Filtrar ETAs muy altos (outliers)
data_s = data_s.loc[data_s[target_col] <= 7200].reset_index(drop=True)

In [24]:
# -------------------------------
# SPLIT por trips (sin fuga)
# -------------------------------
trip_start = data_s.groupby("trip_id")["Fecha"].min().rename("t0")
ref_date = pd.Timestamp("2024-01-15")
train_trips = trip_start.index[trip_start < ref_date]
valid_trips = trip_start.index[trip_start >= ref_date]

train = data_s.loc[data_s["trip_id"].isin(train_trips)]
valid = data_s.loc[data_s["trip_id"].isin(valid_trips)]

X_tr = train[num_cols + cat_cols]
y_tr = train[target_col].values
X_va = valid[num_cols + cat_cols]
y_va = valid[target_col].values

train_data = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
valid_data = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

In [25]:
data_s.head()

Unnamed: 0,LINEA,DIR,proxima_est_teorica,DIR_init,dist_a_prox_m,dist_estacion_m,vel_mps,Altitud (m),s_m,dist_m,...,dwell_same_xy_s,is_no_progress,progress_event,hour,dow,is_weekend,is_peak,trip_id,Fecha,ETA_proxima_est_s
0,Linea_12,IDA,MONTE MARÍA,IDA,2507.478516,361.775177,0.0,1404.0,0.0,320.33194,...,0.0,0,0,9,4,0,1,1,2024-01-12 09:38:23,657.0
1,Linea_12,IDA,MONTE MARÍA,IDA,2507.478516,361.775177,0.0,1404.0,0.0,320.33194,...,60.0,1,0,9,4,0,1,1,2024-01-12 09:39:23,597.0
2,Linea_12,IDA,MONTE MARÍA,IDA,2507.478516,361.775177,0.0,1404.0,0.0,320.33194,...,120.0,1,0,9,4,0,1,1,2024-01-12 09:40:23,537.0
3,Linea_12,IDA,MONTE MARÍA,IDA,2507.478516,361.775177,0.0,1404.0,0.0,320.33194,...,180.0,1,0,9,4,0,1,1,2024-01-12 09:41:23,477.0
4,Linea_12,IDA,MONTE MARÍA,IDA,2507.478516,379.234497,1.666667,1407.0,0.0,335.106476,...,0.0,0,1,9,4,0,1,1,2024-01-12 09:44:23,297.0


In [None]:
# Monotonicidad (opcional):
mono = []
for c in num_cols:
    if   c == "dist_a_prox_m":   mono.append( 1)
    elif c == "dist_estacion_m": mono.append( 1)
    elif c == "vel_mps":         mono.append(-1)
    else:                        mono.append( 0)


In [26]:

params = dict(
    objective="mae", metric="mae",
    learning_rate=0.08, num_leaves=127,
    feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=1,
    min_data_in_leaf=50, max_depth=-1, reg_lambda=1.0,
    max_bin=255, bin_construct_sample_cnt=200_000, min_data_in_bin=1,
    force_row_wise=True,
)

eval_result = {}
model = lgb.train(
    params, train_data, num_boost_round=10_000,
    valid_sets=[train_data, valid_data],
    valid_names=["train","valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=300, verbose=True),
        lgb.log_evaluation(period=100),
        lgb.record_evaluation(eval_result),
    ],
)
print("Best iteration:", model.best_iteration)

[LightGBM] [Info] Total Bins 1991
[LightGBM] [Info] Number of data points in the train set: 1682466, number of used features: 18
[LightGBM] [Info] Start training from score 174.000000
Training until validation scores don't improve for 300 rounds
[100]	train's l1: 151.831	valid's l1: 148.212
[200]	train's l1: 147.742	valid's l1: 145.743
[300]	train's l1: 144.411	valid's l1: 143.931
[400]	train's l1: 141.692	valid's l1: 142.783
[500]	train's l1: 139.3	valid's l1: 142.136
[600]	train's l1: 137.702	valid's l1: 141.763
[700]	train's l1: 136.672	valid's l1: 141.551
[800]	train's l1: 135.64	valid's l1: 141.364
[900]	train's l1: 134.432	valid's l1: 141.2
[1000]	train's l1: 133.516	valid's l1: 141.06
[1100]	train's l1: 132.691	valid's l1: 140.943
[1200]	train's l1: 131.976	valid's l1: 140.904
[1300]	train's l1: 131.47	valid's l1: 140.88
[1400]	train's l1: 131.043	valid's l1: 140.849
[1500]	train's l1: 130.567	valid's l1: 140.823
[1600]	train's l1: 129.992	valid's l1: 140.787
[1700]	train's l1: 

In [99]:
# Guardar modelo
model.save_model("lightgbm_model.txt")

<lightgbm.basic.Booster at 0x1ce86e682d0>

## Métricas

In [27]:
# Calidad por baseline histórico

mae_model = np.mean(np.abs(model.predict(X_va, num_iteration=model.best_iteration) - y_va))
grp = train.groupby(["LINEA","DIR","proxima_est_teorica"])[target_col].median()
valid_hist = valid.merge(grp.rename("eta_med"), left_on=["LINEA","DIR","proxima_est_teorica"], right_index=True, how="left")["eta_med"]
valid_hist = valid_hist.fillna(train[target_col].median())
mae_hist = np.mean(np.abs(valid_hist.values - y_va))
print(f"Baseline histórico MAE: {mae_hist:,.1f} s")
print(f"Mejora vs histórico:    {(1 - mae_model/mae_hist)*100:.1f}%")

  grp = train.groupby(["LINEA","DIR","proxima_est_teorica"])[target_col].median()


Baseline histórico MAE: 224.5 s
Mejora vs histórico:    37.4%


In [28]:
# Calidad por bandas

pred = model.predict(X_va, num_iteration=model.best_iteration)
err  = np.abs(pred - y_va)
bands = pd.cut(valid["dist_a_prox_m"], bins=[-1,100,300,600,1200,999999], labels=["0-100","100-300","300-600","600-1200",">1200"])
err_s = pd.Series(err, index=valid.index, name="abs_err")
tabla = err_s.groupby(bands).agg(["count","mean","median"]).round(1)
print(tabla)

                count   mean  median
dist_a_prox_m                       
0-100          813380  136.6    21.3
100-300        662252  105.3    25.4
300-600        804853   78.7    32.8
600-1200       813189  119.3    40.2
>1200          609166  294.6    96.9


  tabla = err_s.groupby(bands).agg(["count","mean","median"]).round(1)


In [29]:
tabla_p = (
  err_s.groupby(bands)
       .quantile([0.5, 0.9, 0.95, 0.99])
       .unstack()
       .round(1)
)
print(tabla_p)

  err_s.groupby(bands)


               0.50   0.90    0.95    0.99
dist_a_prox_m                             
0-100          21.3  184.8   374.3  3869.8
100-300        25.4  167.7   377.8  1732.0
300-600        32.8  160.0   271.9   764.3
600-1200       40.2  219.3   407.5  1565.3
>1200          96.9  630.5  1138.8  4175.4


In [30]:
df_eval = valid[["trip_id","Fecha","LINEA","DIR","proxima_est_teorica","dist_a_prox_m","vel_mps"]].copy()
df_eval["abs_err"] = err_s
df_eval["band"] = bands
worst = df_eval.sort_values("abs_err", ascending=False).groupby("band").head(20)
worst.head(50)


  worst = df_eval.sort_values("abs_err", ascending=False).groupby("band").head(20)


Unnamed: 0,trip_id,Fecha,LINEA,DIR,proxima_est_teorica,dist_a_prox_m,vel_mps,abs_err,band
2633226,422,2024-11-20 16:51:05,Linea_12,VUELTA,CENMA,0.0,1.666667,7224.234406,0-100
3507774,435,2025-02-28 16:18:07,Linea_12,VUELTA,CENMA,0.0,3.333333,7201.008279,0-100
2468474,346,2024-09-22 16:20:49,Linea_12,VUELTA,CENMA,0.0,2.777778,7168.312298,0-100
3507773,435,2025-02-28 16:18:04,Linea_12,VUELTA,CENMA,0.0,4.166667,7151.726144,0-100
3507775,435,2025-02-28 16:18:15,Linea_12,VUELTA,CENMA,0.0,2.222222,7135.996516,0-100
2606568,134,2024-05-31 16:17:00,Linea_12,VUELTA,CENMA,0.0,5.0,7114.545487,0-100
2606572,134,2024-05-31 16:17:28,Linea_12,VUELTA,CENMA,0.0,4.722222,7110.975413,0-100
2606569,134,2024-05-31 16:17:10,Linea_12,VUELTA,CENMA,0.0,6.666667,7105.197927,0-100
1866717,333,2024-02-26 08:10:45,Linea_12,VUELTA,MONTE MARÍA,176.278076,12.777778,7103.938123,100-300
3196322,136,2024-06-06 17:19:31,Linea_12,VUELTA,CENMA,0.0,4.444445,7103.516344,0-100


In [None]:
# Encontrar en data un tiempo específico y mostrar filas


            LINEA     DIR proxima_est_teorica DIR_init  dist_a_prox_m  \
2432779  Linea_12  VUELTA               CENMA      IDA            0.0   

         dist_estacion_m   vel_mps  Altitud (m)           s_m      dist_m  \
2432779       283.372009  6.944445       1426.0  12200.808594  225.311783   

         ...  dwell_same_xy_s  is_no_progress  progress_event  hour  dow  \
2432779  ...              0.0               0               1    19    1   

         is_weekend  is_peak  trip_id               Fecha ETA_proxima_est_s  
2432779           0        1      333 2024-08-27 19:20:50         7816552.0  

[1 rows x 21 columns]


In [None]:
# Realizar una predicción aislada
# Selecciona la primera fila válida para evitar ValueError
sample_X = X_va.iloc[0:1]
sample_pred = model.predict(sample_X, num_iteration=model.best_iteration)
sample_pred

ValueError: Input data must be 2 dimensional and non empty.

In [84]:
sample_X = X_va[X_va['LINEA'] == 'Linea_6'].iloc[456:457]
idx = sample_X.index[0]

# ¡Re-calcular la predicción para ESTA muestra!
sample_pred = model.predict(sample_X, num_iteration=model.best_iteration)[0]

sample_true = valid.loc[idx, target_col]
print('Muestra:')
print(sample_X)
print(f"Predicción: {sample_pred:.1f} s | Verdadero: {sample_true:.1f} s")

Muestra:
         dist_a_prox_m  dist_estacion_m  vel_mps  Altitud (m)          s_m  \
3721486      12.518419         8.348965      0.0       1521.0  5154.088867   

           dist_m  time_diff  dwell_same_xy_s  is_no_progress  progress_event  \
3721486  3.309954       60.0              0.0               0               1   

         hour  dow  is_weekend  is_peak    LINEA  DIR proxima_est_teorica  \
3721486    19    2           0        1  Linea_6  IDA          CAPUCHINAS   

        DIR_init  
3721486      IDA  
Predicción: 53.7 s | Verdadero: 60.0 s


In [98]:
print(train.query('LINEA=="Linea_18-B"').shape)

(13427, 21)
