# Model training

## LightGBM

In [2]:
import pandas as pd, numpy as np
from typing import Iterator, Tuple
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

### Carga de datos

In [3]:
# Obtener .parquet de datos muestreados
SAMPLE_PATH = 'D:/2025/UVG/Tesis/repos/backend/features_sampled_without_idle_rows/sample_features.parquet'

data = pd.read_parquet(SAMPLE_PATH)

# Renombrar columna "Altitud (m)" a "Altitud"
data = data.rename(columns={"Altitud (m)": "Altitud"})

data.head()

Unnamed: 0,Placa,trip_id,block_id,LINEA,DIR,proxima_est_teorica,Fecha,dist_a_prox_m,dist_estacion_m,vel_mps,...,dist_m,time_diff,dwell_same_xy_s,is_no_progress,progress_event,hour,dow,is_weekend,is_peak,ETA_proxima_est_s
0,49,7,1,Linea_12,IDA,MONTE MARÍA,2024-02-07 06:39:59,2507.478516,305.589294,0.0,...,246.906372,60.0,0.0,0,0,6,2,0,1,462.0
1,49,7,1,Linea_12,IDA,MONTE MARÍA,2024-02-07 06:40:51,2507.478516,300.03714,2.222222,...,241.114578,52.0,0.0,0,1,6,2,0,1,410.0
2,49,7,1,Linea_12,IDA,MONTE MARÍA,2024-02-07 06:41:22,2507.478516,282.31366,4.166667,...,222.698257,31.0,0.0,0,1,6,2,0,1,379.0
3,49,7,1,Linea_12,IDA,MONTE MARÍA,2024-02-07 06:42:22,2507.478516,236.359512,0.0,...,177.069855,60.0,0.0,0,1,6,2,0,1,319.0
4,49,7,1,Linea_12,IDA,MONTE MARÍA,2024-02-07 06:43:22,2249.54834,201.316711,10.833334,...,6.144639,60.0,0.0,0,1,6,2,0,1,259.0


In [9]:
# Preparar datos

# Columnas iniciales
str_cols = ["Placa","trip_id","block_id","LINEA","DIR","proxima_est_teorica"]
dt_cols  = ["Fecha"]
float_cols = [
    "dist_a_prox_m","dist_estacion_m","vel_mps","Altitud","s_m","dist_m",
    "time_diff","dwell_same_xy_s","ETA_proxima_est_s"
]
int_cols = ["hour","dow"]
boolish_cols = ["is_no_progress","progress_event","is_weekend","is_peak"]

# Convertir tipos
for c in str_cols:
    data[c] = data[c].astype("category")
for c in dt_cols:
    data[c] = pd.to_datetime(data[c])
for c in float_cols:
    data[c] = data[c].astype("float32")
for c in int_cols:
    data[c] = data[c].astype("int32")
for c in boolish_cols:
    data[c] = data[c].astype("bool")

### Train / valid / test split

In [5]:
df = data.copy()
dev_df = df[df["Fecha"] < "2025-04-01"]   # TODO lo anterior a abril 2025
test_df = df[df["Fecha"] >= "2025-04-01"] # TODO abril 2025 en adelante

In [25]:
def day_based_time_cv_full(
    df: pd.DataFrame,
    day_col: str = "Fecha",
    min_train_days: int = 30,   # entrena al menos 30 días antes del 1er fold
    valid_days: int = 2,        # ventana de validación por fold
    step_days: int = 7,         # cuánto avanzas la ventana por fold (stride)
    embargo_days: int = 0,      # buffer entre train y valid (opcional)
    max_splits = None,  # None = hasta agotar el timeline
) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    d = df.copy()
    d["__day__"] = pd.to_datetime(d[day_col]).dt.normalize()
    unique_days = np.array(sorted(d["__day__"].unique()))
    total_days = len(unique_days)

    start_valid = min_train_days + embargo_days
    splits = 0
    while start_valid + valid_days <= total_days:
        train_last = start_valid - embargo_days - 1
        valid_start = start_valid
        valid_end   = start_valid + valid_days  # exclusivo

        train_days = set(unique_days[:train_last+1])
        valid_days_set = set(unique_days[valid_start:valid_end])

        mask_train = d["__day__"].isin(train_days).values
        mask_valid = d["__day__"].isin(valid_days_set).values

        tr_idx = df.index[mask_train].values
        va_idx = df.index[mask_valid].values

        yield np.sort(tr_idx), np.sort(va_idx)

        splits += 1
        if (max_splits is not None) and (splits >= max_splits):
            break
        start_valid += step_days  # avanza la ventana

In [23]:
def summarize_splits(df, splits, day_col="Fecha", key_cols=("Placa","trip_id")):
    df = df.copy()
    df["__day__"] = pd.to_datetime(df[day_col]).dt.normalize()
    df["__trip_key__"] = list(zip(*[df[c].astype(str) for c in key_cols]))  # (Placa, trip_id)

    for i, (tr_idx, va_idx) in enumerate(splits, 1):
        tr_days = df.loc[tr_idx, "__day__"].unique()
        va_days = df.loc[va_idx, "__day__"].unique()

        print(f"\nFold {i}")
        print("  Train days:", tr_days.min(), "→", tr_days.max(), f"({len(tr_days)} días, {len(tr_idx):,} filas)")
        print("  Valid days:", va_days.min(), "→", va_days.max(), f"({len(va_days)} días, {len(va_idx):,} filas)")
        day_overlap = set(tr_days) & set(va_days)
        print("  Day overlap? ", "YES" if day_overlap else "NO")

        tr_keys = set(df.loc[tr_idx, "__trip_key__"].unique())
        va_keys = set(df.loc[va_idx, "__trip_key__"].unique())
        key_overlap = tr_keys & va_keys
        print("  Trip overlap (Placa,trip_id)? ", f"YES ({len(key_overlap)})" if key_overlap else "NO")


In [33]:
splits = list(day_based_time_cv_full(
    dev_df,
    day_col="Fecha",
    min_train_days=30,
    valid_days=5,
    step_days=30,        # un fold por dos meses
    embargo_days=0,     # buffer de 0 días
    max_splits=None     # None = hasta el final del timeline
))


# Verificar los splits
summarize_splits(dev_df, splits)


Fold 1
  Train days: 2024-01-01 00:00:00 → 2024-01-30 00:00:00 (30 días, 332,231 filas)
  Valid days: 2024-01-31 00:00:00 → 2024-02-04 00:00:00 (5 días, 67,868 filas)
  Day overlap?  NO
  Trip overlap (Placa,trip_id)?  NO

Fold 2
  Train days: 2024-01-01 00:00:00 → 2024-02-29 00:00:00 (60 días, 691,367 filas)
  Valid days: 2024-03-01 00:00:00 → 2024-03-05 00:00:00 (5 días, 55,245 filas)
  Day overlap?  NO
  Trip overlap (Placa,trip_id)?  NO

Fold 3
  Train days: 2024-01-01 00:00:00 → 2024-03-30 00:00:00 (90 días, 980,057 filas)
  Valid days: 2024-03-31 00:00:00 → 2024-04-04 00:00:00 (5 días, 56,996 filas)
  Day overlap?  NO
  Trip overlap (Placa,trip_id)?  NO

Fold 4
  Train days: 2024-01-01 00:00:00 → 2024-04-29 00:00:00 (120 días, 1,302,882 filas)
  Valid days: 2024-04-30 00:00:00 → 2024-05-04 00:00:00 (5 días, 54,200 filas)
  Day overlap?  NO
  Trip overlap (Placa,trip_id)?  NO

Fold 5
  Train days: 2024-01-01 00:00:00 → 2024-05-29 00:00:00 (150 días, 1,648,828 filas)
  Valid days:

### Entrenamiento

In [10]:
# Definir características y objetivo

categorical_features = ["LINEA","DIR","proxima_est_teorica"]
numeric_features = [
    "dist_a_prox_m","dist_estacion_m",
    "vel_mps","Altitud","s_m","dist_m",
    "time_diff","dwell_same_xy_s","hour","dow",
    "is_no_progress","progress_event","is_weekend","is_peak"
]

feature_cols = categorical_features + numeric_features
target_col = "ETA_proxima_est_s"

In [11]:
SEED = 25

Baseline

In [None]:
X_train = dev_df[feature_cols]
y_train = dev_df[target_col]
X_valid = test_df[feature_cols]
y_valid = test_df[target_col]

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)

In [8]:
# Definir parámetros
params = {
    "objective": "regression",
    "metric": ["mae","rmse"],
    "learning_rate": 0.05,
    "num_leaves": 64,
    "min_data_in_leaf": 200,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "max_bin": 255,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "verbose": 1
}

# Entrenar modelo
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=["train","valid"],
    num_boost_round=3000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=300, verbose=True),
        lgb.log_evaluation(period=100),
    ]
)


NameError: name 'train_data' is not defined

In [None]:
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Realizar una predicción de prueba
for i in range(5):
    print(f'Linea: {X_test["LINEA"].iloc[i]}, Siguiente estación: {X_test["proxima_est_teorica"].iloc[i]}, Predicción: {y_pred[i]:.2f}, Real: {y_test.iloc[i]:.2f}')

Linea: Linea_12, Siguiente estación: CENMA, Predicción: 328.60, Real: 600.00
Linea: Linea_12, Siguiente estación: CENMA, Predicción: 589.14, Real: 300.00
Linea: Linea_12, Siguiente estación: TRÉBOL DIRECCIÓN CENTRO, Predicción: 382.93, Real: 600.00
Linea: Linea_12, Siguiente estación: TRÉBOL DIRECCIÓN CENTRO, Predicción: 584.19, Real: 300.00
Linea: Linea_12, Siguiente estación: LAS CHARCAS DIRECCIÓN CENMA, Predicción: 362.56, Real: 300.00


MAE

In [None]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.2f} s")

MAE: 125.70 s


RSME

In [None]:
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.2f} s")

RMSE: 351.43 s


R2 - Coeficiente de determinación

In [None]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2:.3f}")

R²: 0.566


In [None]:
by_line = data.copy()
by_line["pred"] = model.predict(X, num_iteration=model.best_iteration)
by_line["abs_err"] = np.abs(by_line["pred"] - by_line["ETA_proxima_est_s"])
line_stats = by_line.groupby("LINEA", observed=False)["abs_err"].agg(["mean","median","count"]).sort_values("mean")
print('MAE por línea:')
print(line_stats)

  line_stats = by_line.groupby("LINEA")["abs_err"].agg(["mean","median","count"]).sort_values("mean")


                  mean      median    count
LINEA                                      
Linea_1      61.126804   32.019461   577277
Linea_13-A   94.980999   40.152969   371452
Linea_6     104.827698   40.983663   939870
Linea_12    105.028820   28.941709  1772805
Linea_2     112.369283   49.643148   239408
Linea_7     152.022106   58.874034    37876
Linea_18-A  167.657788   79.098083   806776
Linea_18-B  181.953693   87.914202   170019
Linea_13-B  212.699097  103.227633    13042


In [None]:
# Guardar modelo
model.save_model("lightgbm_baseline_model.txt")

<lightgbm.basic.Booster at 0x29567e01910>

Validación cruzada

In [14]:
# Para métricas

SLA_THRESH = [60, 120, 180]    # segundos

def compute_metrics(y_true, y_pred, sla_thresh=SLA_THRESH):
    mae  = float(mean_absolute_error(y_true, y_pred))
    rmse = float(root_mean_squared_error(y_true, y_pred))
    abs_err = np.abs(y_pred - y_true)
    sla = {f"sla_le_{t}s": float((abs_err <= t).mean()) for t in sla_thresh}
    return {"mae": mae, "rmse": rmse, **sla}


# Definir parámetros
params = {
    "objective": "regression",
    "metric": ["mae","rmse"],
    "learning_rate": 0.05,
    "num_leaves": 64,
    "min_data_in_leaf": 200,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "max_bin": 255,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "verbose": 1,
    "seed": SEED
}

In [44]:
fold_results = []
models = []
best_iters = []
fi_gain_list = []

for fold, (tr_idx, va_idx) in enumerate(splits, 1):
    
    print(f"\n=== Fold {fold} ===")
    
    X_train = dev_df.loc[tr_idx, feature_cols]
    y_train = dev_df.loc[tr_idx, target_col]
    X_valid = dev_df.loc[va_idx, feature_cols]
    y_valid = dev_df.loc[va_idx, target_col]
    
    lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features, free_raw_data=True)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features, free_raw_data=True)
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_valid],
        valid_names=[f"valid{fold}"],
        num_boost_round=3000,
        callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=False)]
    )
    
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    metrics = compute_metrics(y_valid.values, y_pred, SLA_THRESH)
    metrics["fold"] = fold
    metrics["best_iter"] = int(model.best_iteration)
    fold_results.append(metrics)
    best_iters.append(model.best_iteration)
    models.append(model)

    # importancias (gain)
    fi_gain = pd.DataFrame({
        "feature": feature_cols,
        "gain": model.feature_importance(importance_type="gain"),
        "fold": fold,
    })
    fi_gain_list.append(fi_gain)


=== Fold 1 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2108
[LightGBM] [Info] Number of data points in the train set: 332231, number of used features: 17
[LightGBM] [Info] Start training from score 309.223140

=== Fold 2 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026417 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2061
[LightGBM] [Info] Number of data points in the train set: 691367, number of used features: 17
[LightGBM] [Info] Start training from score 306.681998

=== Fold 3 ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042476 seconds.
You can set `force_row_wise=tr

In [45]:
cv_df = pd.DataFrame(fold_results).set_index("fold")
print("\n=== Métricas por fold ===")
print(cv_df)

print("\n=== Promedio CV ± std ===")
summary = cv_df.agg(["mean","std"])
print(summary)

avg_best_iter = int(np.mean(best_iters))
print(f"\nIteraciones promedio (best_iteration): {avg_best_iter}")

fi_gain_all = pd.concat(fi_gain_list, ignore_index=True)
fi_gain_mean = fi_gain_all.groupby("feature")["gain"].mean().sort_values(ascending=False)
print("\nTop-20 features por gain promedio:")
print(fi_gain_mean.head(20))


=== Métricas por fold ===
             mae        rmse  sla_le_60s  sla_le_120s  sla_le_180s  best_iter
fold                                                                         
1     123.375711  304.764493    0.595126     0.761360     0.835814        297
2     105.208006  272.522437    0.628238     0.796597     0.867970        253
3     118.302275  370.476806    0.649625     0.804513     0.865798        979
4     103.972052  284.459356    0.640867     0.802768     0.872251       1383
5     102.157242  270.759024    0.643892     0.803880     0.872084       2165
6     107.985908  298.637310    0.615176     0.785522     0.862946       1220
7     136.089878  352.630542    0.592287     0.754288     0.827319       1634
8     112.733356  292.282094    0.613649     0.784575     0.858344        595
9     125.435332  325.330707    0.610972     0.771686     0.843115        860
10    128.783631  331.325955    0.586054     0.754699     0.834402        264
11    130.325481  321.783620    0.559

In [48]:
# Guardar modelos entrenados por fold
import os

model_dir = "D:/2025/UVG/Tesis/repos/backend/models/lightgbm/cross_validation_models"
os.makedirs(model_dir, exist_ok=True)

for fold, model in enumerate(models, 1):
    model.save_model(f"{model_dir}/lgb_model_fold{fold}.txt")
    
# Guardar métricas CV
cv_df.to_csv(f"{model_dir}/cv_metrics.csv")
# Guardar importancias
fi_gain_all.to_csv(f"{model_dir}/cv_feature_importances_gain.csv", index=False)

# Guardar parámetros usados
import json
with open(f"{model_dir}/lgb_params.json", "w") as f:
    json.dump(params, f, indent=4)

Modelo final

In [12]:
final_rounds = int(np.median(best_iters))

lgb_train_full = lgb.Dataset(dev_df[feature_cols], label=dev_df[target_col], categorical_feature=categorical_features)

final_model = lgb.train(
    {**params, "metric": []},
    lgb_train_full,
    num_boost_round=final_rounds
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.183048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2118
[LightGBM] [Info] Number of data points in the train set: 4665551, number of used features: 17
[LightGBM] [Info] Start training from score 308.566816


In [17]:
import os, json

# Métricas en set de prueba
y_test_pred = final_model.predict(test_df[feature_cols], num_iteration=final_model.best_iteration)
y_test = test_df[target_col]

test_metrics = compute_metrics(y_test.values, y_test_pred, SLA_THRESH)
print("\n=== Métricas en set de prueba (LightGBM final) ===")

for k, v in test_metrics.items():
    print(f"{k}: {v*100:.2f}%" if k.startswith("sla_") else f"{k}: {v:.2f} s")
    
# Guardar modelo final
final_dir = "D:/2025/UVG/Tesis/repos/backend/models/lightgbm/final"
os.makedirs(final_dir, exist_ok=True)
final_model.save_model(f"{final_dir}/lgb_final_model.txt")

# Guardar métricas del set de prueba
with open(f"{final_dir}/test_metrics.json", "w") as f:
    json.dump({k: float(v) for k, v in test_metrics.items()}, f, indent=2)
    
# Guardar parámetros y metadatos del modelo final

final_meta = {
    "final_rounds": final_rounds,
    "params": params,
    "feature_cols": feature_cols,
    "target_col": target_col,
}

with open(f"{final_dir}/final_meta.json", "w") as f:
    json.dump(final_meta, f, indent=4)
    
pd.DataFrame({
    "Fecha": test_df["Fecha"].values,
    "Placa": test_df["Placa"].astype(str).values if "Placa" in test_df.columns else None,
    "ETA_true": test_df[target_col].values,
    "ETA_pred": y_test_pred,
    "abs_error": np.abs(test_df[target_col].values - y_test_pred)
}).to_csv(f"{final_dir}/test_predictions.csv", index=False)


=== Métricas en set de prueba (LightGBM final) ===
mae: 134.09 s
rmse: 381.64 s
sla_le_60s: 59.85%
sla_le_120s: 76.34%
sla_le_180s: 83.85%


In [18]:
# Desglose por línea
def group_report(df, y_true, y_pred, by=["LINEA","DIR"]):
    tmp = df.copy()
    tmp["y_true"] = y_true
    tmp["y_pred"] = y_pred
    tmp["abs_err"] = (y_true - y_pred).abs()
    agg = tmp.groupby(by).apply(
        lambda g: pd.Series({
            "MAE": g["abs_err"].mean(),
            "RMSE": (( (g["y_true"]-g["y_pred"])**2 ).mean())**0.5,
            "SLA<=60": (g["abs_err"]<=60).mean(),
            "SLA<=120": (g["abs_err"]<=120).mean(),
            "SLA<=180": (g["abs_err"]<=180).mean(),
            "n": len(g)
        })
    ).reset_index()
    return agg

seg = group_report(test_df, test_df[target_col], y_test_pred, by=["LINEA","DIR"])
print("\n=== TEST por LINEA,DIR ===")
print(seg.sort_values("MAE").head(10))
seg.to_csv(f"{final_dir}/test_segment_metrics.csv", index=False)


=== TEST por LINEA,DIR ===
         LINEA       DIR         MAE        RMSE   SLA<=60  SLA<=120  \
5     Linea_12    VUELTA   37.241484   73.890660  0.837830  0.960785   
0      Linea_1  CIRCULAR   66.873459  179.296865  0.721675  0.884076   
8   Linea_13-A    VUELTA   84.555965  214.185734  0.639465  0.822200   
16  Linea_18-B       IDA  112.268444  215.568132  0.585340  0.759561   
23     Linea_6    VUELTA  114.803768  321.494400  0.643081  0.798885   
19     Linea_2       IDA  119.450127  359.277722  0.513148  0.726772   
7   Linea_13-A       IDA  130.956218  402.555213  0.548346  0.773098   
26     Linea_7    VUELTA  132.455665  367.345494  0.598203  0.774069   
22     Linea_6       IDA  139.513441  371.884852  0.565573  0.746143   
20     Linea_2    VUELTA  165.661599  547.476555  0.656984  0.800239   

    SLA<=180        n  
5   0.985283  42739.0  
0   0.932820  33522.0  
8   0.900586  13982.0  
16  0.831445   2824.0  
23  0.860219  15603.0  
19  0.852742   7986.0  
7   0.86778

  agg = tmp.groupby(by).apply(
