# Obtener día con mayor cobertura

In [1]:
import pandas as pd
from pathlib import Path

In [40]:
# 1) Load: .parquet files (before sampling)
DATA_DIR = Path('D:/2025/UVG/Tesis/repos/backend/features_ready_without_idle_rows/')
# glob yields paths in filesystem listing order; sorting makes the order in
# which the files are later concatenated reproducible across machines/runs.
files = sorted(DATA_DIR.glob("**/*.parquet"))

def load_concat(files, cols=None, limit=None):
    """Read parquet files and stack them into a single DataFrame.

    Parameters
    ----------
    files : iterable of path-like
        Parquet files to read, concatenated in iteration order.
    cols : list of str, optional
        Columns to read from each file; ``None`` reads every column.
    limit : int, optional
        Maximum number of files to read. ``None`` (default) reads all of
        them; ``0`` reads none.

    Returns
    -------
    pandas.DataFrame
        All rows read, with a fresh ``RangeIndex``. When no file is read
        (empty ``files`` or ``limit`` of 0), an empty frame whose columns
        are ``cols``.
    """
    dfs = []
    for i, f in enumerate(files):
        # Compare against None explicitly so that limit=0 is honoured
        # instead of being treated as "no limit".
        if limit is not None and i >= limit:
            break
        dfs.append(pd.read_parquet(f, columns=cols))
    if not dfs:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=cols)
    return pd.concat(dfs, ignore_index=True)

# Minimal set of columns needed for the coverage ranking and for the API:
COLS = [
    "Fecha",
    "Placa",
    "trip_id",
    "LINEA",
    "DIR",
    "proxima_est_teorica",
    "dist_a_prox_m",
    "dist_estacion_m",
    "vel_mps",
    "Altitud (m)",
    "s_m",
    "dist_m",
    "time_diff",
    "dwell_same_xy_s",
    "is_no_progress",
    "progress_event",
    "hour",
    "dow",
    "is_weekend",
    "is_peak",
]

# Read only those columns from every parquet file into one frame.
dfv = load_concat(files, cols=COLS)

# 2) Normalize the timestamp and add the helper calendar-date column
dfv["Fecha"] = dfv["Fecha"].pipe(pd.to_datetime)
dfv["date"] = dfv["Fecha"].dt.date

# 4) Rank days by coverage: one row per calendar day with its row count and
#    the number of distinct lines, units and trips seen that day.
agg = (
    dfv.groupby("date")
       .agg(
           n_rows=("Placa","size"),
           n_lineas=("LINEA","nunique"),
           n_unidades=("Placa","nunique"),
           n_trips=("trip_id","nunique"),
       )
       .reset_index()
)

# Fail with a clear message instead of an opaque indexing error below.
if agg.empty:
    raise ValueError("dfv has no rows; cannot pick a best day")

# Score: prioritize lines, then units, then trips, then rows.
# Fixed weights (1_000_000 / 10_000 / 100 / 1) only give that priority while
# every lower-priority count stays below the next weight. The recorded run has
# a day with 81,557 rows, so n_rows alone was worth more than eight units.
# Each weight is therefore derived from the observed maxima of the counts
# below it, so a higher-priority count always dominates the lower ones.
w_trips = int(agg["n_rows"].max()) + 1
w_unidades = w_trips * (int(agg["n_trips"].max()) + 1)
w_lineas = w_unidades * (int(agg["n_unidades"].max()) + 1)
agg["score"] = (
    agg["n_lineas"] * w_lineas
    + agg["n_unidades"] * w_unidades
    + agg["n_trips"] * w_trips
    + agg["n_rows"]
)

# idxmax returns the first maximum; agg is ordered by date, so a tie on all
# four counts resolves to the earliest day.
best_day = agg.loc[agg["score"].idxmax(), "date"]
best_day


datetime.date(2024, 1, 12)

In [41]:
# Keep only the rows of the selected day, as an independent copy of dfv.
golden = dfv.loc[dfv["date"].eq(best_day)].copy()

In [42]:
# Show a summary of the selected day: its date, row count and the number of
# distinct lines, units and trips.
summary = {
    "date": best_day,
    "n_rows": len(golden),
    **{
        label: golden[column].nunique()
        for label, column in (
            ("n_lineas", "LINEA"),
            ("n_unidades", "Placa"),
            ("n_trips", "trip_id"),
        )
    },
}
summary

{'date': datetime.date(2024, 1, 12),
 'n_rows': 81557,
 'n_lineas': 9,
 'n_unidades': 130,
 'n_trips': 37}

In [43]:
import pandas as pd
from pathlib import Path

# Folder with the per-unit CSVs that carry the GPS coordinates.
# Note: this rebinds DATA_DIR, which previously pointed at the parquet folder.
DATA_DIR = Path(r"D:/2025/UVG/Tesis/repos/backend/data_with_features")
# Sorted so the concatenation order below does not depend on the order in
# which the filesystem lists the files. That order feeds the keep="last"
# de-duplication at the end of this cell.
unit_csvs = sorted(DATA_DIR.glob("*/**/*_trips_with_next_station.csv"))

# Range: the calendar day covered by golden, as the half-open interval
# [day_min, day_max).
day_min = golden["Fecha"].min().floor("D")
day_max = day_min + pd.Timedelta(days=1)

usecols = ["Placa","Fecha","Latitud","Longitud"]  # no trip_id
dtypes  = {"Placa": "string"}
latlon_parts = []

for csv in unit_csvs:
    # Read in chunks so rows outside the day are discarded while reading
    # instead of holding every CSV fully in memory.
    for chunk in pd.read_csv(
        csv,
        usecols=usecols,
        dtype=dtypes,
        parse_dates=["Fecha"],
        chunksize=200_000,
        on_bad_lines="skip",
        low_memory=True,
    ):
        # Keep only the rows that fall inside the selected day
        mask = (chunk["Fecha"] >= day_min) & (chunk["Fecha"] < day_max)
        chunk = chunk.loc[mask, :]
        if not chunk.empty:
            latlon_parts.append(chunk)

if latlon_parts:
    latlon = pd.concat(latlon_parts, ignore_index=True)
else:
    # Typed empty frame (same column order as usecols). A bare
    # DataFrame(columns=usecols) would make every column object-typed.
    # Coordinates are assumed to be floats, as in the saved output preview.
    latlon = pd.DataFrame(
        {
            "Placa": pd.Series(dtype="string"),
            "Fecha": pd.Series(dtype="datetime64[ns]"),
            "Latitud": pd.Series(dtype="float64"),
            "Longitud": pd.Series(dtype="float64"),
        }
    )

# Keep ONE row per (Placa, Fecha): the last one recorded for that timestamp.
latlon = (
    latlon.sort_values(["Placa","Fecha"])
          .drop_duplicates(subset=["Placa","Fecha"], keep="last")
          .reset_index(drop=True)
)

In [44]:
# Enforce column types on golden

# --- Recommended types before saving ---
STR_COLS = ["Placa", "LINEA", "DIR", "proxima_est_teorica"]
NUM_COLS = [
    "dist_a_prox_m","dist_estacion_m","vel_mps","Altitud (m)","s_m","dist_m",
    "time_diff","dwell_same_xy_s","hour","dow","is_no_progress",
    "progress_event","is_weekend","is_peak"
]

# trip_id is cast together with the other text columns. The original cast
# Placa a second time further down, which had no effect.
text_cols = STR_COLS + ["trip_id"]
golden[text_cols] = golden[text_cols].astype("string")
for col in NUM_COLS:
    # Values that cannot be parsed as numbers become NaN instead of raising.
    golden[col] = pd.to_numeric(golden[col], errors="coerce")

golden["Fecha"] = pd.to_datetime(golden["Fecha"], errors="coerce")

golden = golden.sort_values(["Placa","Fecha"], kind="mergesort").reset_index(drop=True)
latlon = latlon.sort_values(["Placa","Fecha"], kind="mergesort").reset_index(drop=True)

# Attach coordinates to every golden row. validate="many_to_one" makes pandas
# raise if latlon ever holds more than one row per (Placa, Fecha), instead of
# silently multiplying golden rows.
golden_ll = pd.merge(
    golden,
    latlon,
    on=["Placa","Fecha"],
    how="left",
    suffixes=("", "_ll"),
    validate="many_to_one",
)

# Save the result
golden_ll.to_parquet("demo_data_best_day.parquet")

In [45]:
# Verify: reload the file that was just written and preview its first 50 rows.
saved_df = pd.read_parquet(path="demo_data_best_day.parquet")
saved_df.head(n=50)

Unnamed: 0,Fecha,Placa,trip_id,LINEA,DIR,proxima_est_teorica,dist_a_prox_m,dist_estacion_m,vel_mps,Altitud (m),...,dwell_same_xy_s,is_no_progress,progress_event,hour,dow,is_weekend,is_peak,date,Latitud,Longitud
0,2024-01-12 06:46:51,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,301.261414,0.0,1417.0,...,0.0,0,0,6,4,0,1,2024-01-12,14.562082,-90.56342
1,2024-01-12 06:47:44,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,299.510284,1.666667,1419.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.562105,-90.563675
2,2024-01-12 06:47:56,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,285.028656,2.5,1419.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.562275,-90.56397
3,2024-01-12 06:48:01,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,273.39859,1.666667,1421.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.562385,-90.56399
4,2024-01-12 06:49:01,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,239.12117,0.0,1424.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.562672,-90.56384
5,2024-01-12 06:49:53,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,183.249481,10.555556,1437.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.566428,-90.56327
6,2024-01-12 06:50:17,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,443.227844,9.722222,1444.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.568755,-90.563034
7,2024-01-12 06:50:41,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,641.29126,11.388889,1442.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.57054,-90.562996
8,2024-01-12 06:51:41,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,822.053101,13.888889,1448.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.57191,-90.56553
9,2024-01-12 06:52:41,100,11,Linea_18-A,IDA,PLAZA BARRIOS/FEGUA,128.349152,783.275391,11.388889,1441.0,...,0.0,0,1,6,4,0,1,2024-01-12,14.574683,-90.57183
