# Obtener día con mayor cobertura

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# 1) Load: .parquet feature files (before sampling)
# NOTE(review): absolute, machine-specific path -- adjust it when running elsewhere.
DATA_DIR = Path('D:/2025/UVG/Tesis/repos/backend/features_ready_without_idle_rows/')
# glob() yields paths in filesystem order, which can differ between machines
# and runs. Sorting keeps the concatenated row order (and therefore any
# stable-sort tie-breaks downstream) reproducible.
files = sorted(DATA_DIR.glob("**/*.parquet"))

def load_concat(files, cols=None, limit=None):
    """Read parquet files and stack them row-wise into a single DataFrame.

    Parameters
    ----------
    files : iterable of path-like
        Parquet files to read, in the order they should be concatenated.
    cols : list of str, optional
        Forwarded to ``pd.read_parquet(columns=...)``; ``None`` reads every column.
    limit : int, optional
        Read at most this many files, counted from the start of ``files``.
        ``None`` means no limit.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all files read, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If there is nothing to read (``files`` is empty or ``limit`` <= 0).
    """
    selected = list(files)
    if limit is not None:
        # `is not None` (rather than truthiness) so that limit=0 means
        # "read no files" instead of silently meaning "no limit".
        selected = selected[:max(limit, 0)]

    if not selected:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # fail early with a message that points at the likely cause.
        raise ValueError(
            "load_concat: no input files to read "
            "(check the data directory and glob pattern)"
        )

    frames = [pd.read_parquet(path, columns=cols) for path in selected]
    return pd.concat(frames, ignore_index=True)

# Minimal set of columns needed for the coverage ranking and for the API:
COLS = [
    "Fecha","Placa","trip_id","LINEA","DIR","proxima_est_teorica",
    "dist_a_prox_m","dist_estacion_m","vel_mps","Altitud (m)","s_m","dist_m",
    "time_diff","dwell_same_xy_s","is_no_progress","progress_event","hour","dow","is_weekend","is_peak",
]

dfv = load_concat(files, cols=COLS)

# 2) Normalize the timestamp and add a calendar-date helper column
dfv["Fecha"] = pd.to_datetime(dfv["Fecha"])
dfv["date"] = dfv["Fecha"].dt.date

# 3) Rank days by coverage: one row per calendar day with its row count and
#    the number of distinct lines, units (Placa) and trips seen that day.
agg = (
    dfv.groupby("date")
       .agg(
           n_rows=("Placa","size"),
           n_lineas=("LINEA","nunique"),
           n_unidades=("Placa","nunique"),
           n_trips=("trip_id","nunique"),
       )
       .reset_index()
)

# Composite score, kept as an informational column only. Its weights overlap
# as soon as a day has >= 100 units, >= 100 trips or >= 100 rows, so a
# lower-priority count can outvote a higher-priority one. It is therefore NOT
# used to pick the day.
agg["score"] = (
    agg["n_lineas"]*1_000_000
    + agg["n_unidades"]*10_000
    + agg["n_trips"]*100
    + agg["n_rows"]
)

# Strict priority: lines first, then units, then trips, then rows.
# A lexicographic sort enforces this order regardless of magnitudes.
RANK_KEYS = ["n_lineas", "n_unidades", "n_trips", "n_rows"]

if agg.empty:
    raise ValueError("No days found in the loaded data; cannot choose a best day")

best_day = agg.sort_values(RANK_KEYS, ascending=False).iloc[0]["date"]
best_day


datetime.date(2024, 5, 13)

In [3]:
# Rows of the selected day only; .copy() so later edits never write into dfv.
golden = dfv.loc[dfv["date"].eq(best_day)].copy()

In [4]:
# Coverage summary of the selected day: row count plus the number of
# distinct lines, units and trips.
summary = dict(
    date=best_day,
    n_rows=golden.shape[0],
    n_lineas=golden["LINEA"].nunique(),
    n_unidades=golden["Placa"].nunique(),
    n_trips=golden["trip_id"].nunique(),
)
summary

{'date': datetime.date(2024, 5, 13),
 'n_rows': 110956,
 'n_lineas': 9,
 'n_unidades': 131,
 'n_trips': 105}

In [5]:
import pandas as pd
from pathlib import Path

# NOTE: DATA_DIR is rebound here to the per-unit CSV directory (it pointed at
# the parquet feature directory in the loading cell).
# NOTE(review): absolute, machine-specific path -- adjust it when running elsewhere.
DATA_DIR = Path(r"D:/2025/UVG/Tesis/repos/backend/data_with_features")
# Sorted so that the "keep last" de-duplication below does not depend on the
# order in which the filesystem happens to list the files.
unit_csvs = sorted(DATA_DIR.glob("*/**/*_trips_with_next_station.csv"))

# Time window: the calendar day covered by golden, as [day_min, day_max)
day_min = golden["Fecha"].min().floor("D")
day_max = day_min + pd.Timedelta(days=1)

usecols = ["Placa","Fecha","Latitud","Longitud"]  # trip_id is not needed here
dtypes  = {"Placa": "string"}
latlon_parts = []

for csv in unit_csvs:
    # Read in chunks and filter by the time window while reading, so only the
    # rows of the selected day are kept in memory.
    with pd.read_csv(
        csv,
        usecols=usecols,
        dtype=dtypes,
        parse_dates=["Fecha"],
        chunksize=200_000,
        on_bad_lines="skip",
        low_memory=True,
    ) as reader:
        for chunk in reader:
            # parse_dates leaves the column as object when a value cannot be
            # parsed, which would make the comparison below fail. Coerce
            # explicitly: unparseable values become NaT and are filtered out.
            chunk["Fecha"] = pd.to_datetime(chunk["Fecha"], errors="coerce")

            # Keep only rows inside the selected day before storing the chunk
            mask = (chunk["Fecha"] >= day_min) & (chunk["Fecha"] < day_max)
            chunk = chunk.loc[mask, :]
            if chunk.empty:
                continue
            latlon_parts.append(chunk)

if latlon_parts:
    latlon = pd.concat(latlon_parts, ignore_index=True)
else:
    # Typed empty frame, so that the join keys and coordinate columns keep
    # sensible dtypes even when no CSV row falls inside the window.
    latlon = pd.DataFrame({
        "Placa": pd.Series(dtype="string"),
        "Fecha": pd.Series(dtype="datetime64[ns]"),
        "Latitud": pd.Series(dtype="float64"),
        "Longitud": pd.Series(dtype="float64"),
    })

# Keep exactly ONE row per (Placa, Fecha): the last one seen for that timestamp.
latlon = (
    latlon.sort_values(["Placa","Fecha"])
          .drop_duplicates(subset=["Placa","Fecha"], keep="last")
          .reset_index(drop=True)
)

In [6]:
# Normalize golden's dtypes, attach coordinates, and save.

# --- Recommended dtypes before saving ---
STR_COLS = ["Placa", "LINEA", "DIR", "proxima_est_teorica"]
NUM_COLS = [
    "dist_a_prox_m","dist_estacion_m","vel_mps","Altitud (m)","s_m","dist_m",
    "time_diff","dwell_same_xy_s","hour","dow","is_no_progress",
    "progress_event","is_weekend","is_peak"
]

golden[STR_COLS] = golden[STR_COLS].astype("string")
for col in NUM_COLS:
    # errors="coerce": values that cannot be parsed as numbers become NaN
    golden[col] = pd.to_numeric(golden[col], errors="coerce")

golden["Fecha"]   = pd.to_datetime(golden["Fecha"], errors="coerce")
golden["trip_id"] = golden["trip_id"].astype("string")
# "Placa" is already cast to string via STR_COLS, so no second cast is needed.

golden = golden.sort_values(["Placa","Fecha"], kind="mergesort").reset_index(drop=True)

# Left join: every golden row is kept; Latitud/Longitud stay missing where no
# coordinate row matches. latlon is expected to hold one row per
# (Placa, Fecha); validate="many_to_one" raises instead of silently
# multiplying golden rows if that ever stops being true.
golden_ll = pd.merge(
    golden,
    latlon,
    on=["Placa","Fecha"],
    how="left",
    suffixes=("", "_ll"),
    validate="many_to_one",
)

# Save the result
golden_ll.to_parquet("demo_data_best_day.parquet")

In [8]:
# Sanity check: reload the file that was just written and preview its first 10 rows
saved_df = pd.read_parquet("demo_data_best_day.parquet")
saved_df.iloc[:10]

Unnamed: 0,Fecha,Placa,trip_id,LINEA,DIR,proxima_est_teorica,dist_a_prox_m,dist_estacion_m,vel_mps,Altitud (m),...,dwell_same_xy_s,is_no_progress,progress_event,hour,dow,is_weekend,is_peak,date,Latitud,Longitud
0,2024-05-13 05:23:54,100,122,Linea_12,IDA,MONTE MARÍA,2507.478516,292.295258,0.0,1417.0,...,0.0,0,0,5,0,0,0,2024-05-13,14.56223,-90.56408
1,2024-05-13 05:24:09,100,122,Linea_12,IDA,MONTE MARÍA,2507.478516,292.339996,1.666667,1415.0,...,15.0,1,0,5,0,0,0,2024-05-13,14.562217,-90.56402
2,2024-05-13 05:24:12,100,122,Linea_12,IDA,MONTE MARÍA,2507.478516,284.873291,2.222222,1416.0,...,0.0,0,1,5,0,0,0,2024-05-13,14.562275,-90.563965
3,2024-05-13 05:25:12,100,122,Linea_12,IDA,MONTE MARÍA,2507.478516,248.178253,0.0,1425.0,...,0.0,0,1,5,0,0,0,2024-05-13,14.562592,-90.56386
4,2024-05-13 05:25:23,100,122,Linea_12,IDA,MONTE MARÍA,2507.478516,242.709106,2.222222,1425.0,...,11.0,1,0,5,0,0,0,2024-05-13,14.56264,-90.56385
5,2024-05-13 05:26:01,100,122,Linea_12,IDA,MONTE MARÍA,2507.478516,237.709686,2.5,1424.0,...,49.0,1,0,5,0,0,0,2024-05-13,14.562677,-90.5638
6,2024-05-13 05:27:00,100,122,Linea_12,IDA,MONTE MARÍA,2038.004761,411.122314,12.5,1443.0,...,0.0,0,1,5,0,0,0,2024-05-13,14.568467,-90.56306
7,2024-05-13 05:27:18,100,122,Linea_12,IDA,MONTE MARÍA,1805.783813,637.791321,12.777778,1435.0,...,0.0,0,1,5,0,0,0,2024-05-13,14.570505,-90.56295
8,2024-05-13 05:28:18,100,122,Linea_12,IDA,MONTE MARÍA,1624.441895,731.351685,0.0,1434.0,...,0.0,0,1,5,0,0,0,2024-05-13,14.571305,-90.56441
9,2024-05-13 05:29:18,100,122,Linea_12,IDA,MONTE MARÍA,1531.211548,790.294495,11.944445,1436.0,...,0.0,0,1,5,0,0,0,2024-05-13,14.571705,-90.56517


In [4]:
# Per unit (Placa): first and last timestamp in the saved day, plus its line
saved_df = pd.read_parquet("demo_data_best_day.parquet")

# Rows are sorted by time within each unit, so "first"/"last" pick the
# earliest and latest timestamp. "linea" is the LINEA of the earliest row
# only; a unit that appears under several lines would show just that one.
check = saved_df.sort_values(["Placa","Fecha"]).groupby("Placa").agg(
    min_fecha=("Fecha","first"),
    max_fecha=("Fecha","last"),
    linea=("LINEA","first")
)
check.to_csv("demo_data_best_day_placa_date_range.csv")

# Kept as the last expression of the cell so the preview is actually rendered
check.head(10)