In [20]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
from datetime import datetime

In [None]:
# Root folder of the raw dump; load_all_data scans its sub-folders for *.json files.
BASE_PATH = Path("data/raw/trains_dataset")
# Destination of the interim parquet files written by this notebook.
OUTPUT_PATH = Path("data/interim")
# Create the output folder (and any missing parents); a no-op when it already exists.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

def load_all_data(base_path=BASE_PATH):
    """Load every train record found under ``base_path`` into one DataFrame.

    Each sub-folder of ``base_path`` is scanned for ``*.json`` files, and the
    list stored under the top-level ``"treni"`` key of each file is appended
    to the result. Files without that key contribute no rows. Files that are
    missing or contain invalid JSON are reported and skipped.

    Sub-folders named ``<month>_<year>`` (e.g. ``1_2024``) are visited in
    chronological order. A plain name sort would place ``10_2024`` before
    ``1_2024``. Folders whose name does not follow that pattern are visited
    last, in alphabetical order.

    Parameters
    ----------
    base_path : pathlib.Path
        Directory that contains the monthly folders.

    Returns
    -------
    pandas.DataFrame
        One row per collected record, in folder order and then file-path order.
    """

    def _chronological_key(folder):
        # "10_2024" -> (0, 2024, 10, name); non-conforming names sort after every valid month.
        try:
            month, year = folder.name.split("_")
            return (0, int(year), int(month), folder.name)
        except ValueError:
            return (1, 0, 0, folder.name)

    all_data = []

    month_folders = (entry for entry in base_path.iterdir() if entry.is_dir())
    for month_folder in sorted(month_folders, key=_chronological_key):
        print(f" Processing {month_folder.name}...")

        for file_path in tqdm(sorted(month_folder.glob("*.json"))):
            try:
                # Keep the file open only for as long as parsing needs it.
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, FileNotFoundError) as e:
                print(f" Errore nel file {file_path}: {e}")
                continue

            if "treni" in data:
                all_data.extend(data["treni"])

    return pd.DataFrame(all_data)

# Build the raw train table from every monthly folder under BASE_PATH.
df = load_all_data()

📂 Processing 10_2024...


100%|██████████| 31/31 [00:03<00:00,  8.57it/s]


📂 Processing 11_2024...


100%|██████████| 30/30 [00:06<00:00,  4.52it/s]


📂 Processing 12_2024...


100%|██████████| 31/31 [00:02<00:00, 14.25it/s]


📂 Processing 1_2024...


100%|██████████| 31/31 [00:05<00:00,  5.29it/s]


📂 Processing 2_2024...


100%|██████████| 29/29 [00:02<00:00, 13.81it/s]


📂 Processing 3_2024...


100%|██████████| 31/31 [00:02<00:00, 13.55it/s]


📂 Processing 4_2024...


100%|██████████| 30/30 [00:06<00:00,  4.47it/s]


📂 Processing 5_2024...


100%|██████████| 31/31 [00:02<00:00, 12.37it/s]


📂 Processing 6_2024...


100%|██████████| 30/30 [00:02<00:00, 13.41it/s]


📂 Processing 7_2024...


100%|██████████| 31/31 [00:07<00:00,  4.05it/s]


📂 Processing 8_2024...


100%|██████████| 31/31 [00:02<00:00, 14.19it/s]


📂 Processing 9_2024...


100%|██████████| 30/30 [00:02<00:00, 12.01it/s]


In [None]:
# Short keys used in the raw train records -> descriptive column names.
column_mapping = {
    "_id": "train_id",
    "n": "train_number",
    "p": "departure_station",
    "rp": "train_departure_delay",
    "a": "arrival_station",
    # NOTE(review): the sample records shown later in this notebook carry delay-like
    # values in "ra" (e.g. "-3"), and the stop-level mapping calls "ra" an arrival
    # delay, so "train_arrival_platform" looks like a misnomer. Confirm before
    # renaming, because the name ends up in the saved parquet files.
    "ra": "train_arrival_platform",
    "dl": "delay_info",
    "c": "train_class",
    "oo": "origin_station",
    "od": "final_destination",
    "op": "scheduled_departure_time",
    "oa": "scheduled_arrival_time",
    "pr": "train_status",  # "Soppresso" means cancelled
    "sub": "train_subclass",
    "sea": "extended_final_destination",
    "cn": "connected_train",
    "oae": "official_scheduled_arrival",
    "oaz": "adjusted_scheduled_arrival",
    "opz": "adjusted_scheduled_departure",
    "ope": "official_planned_departure",
    "sep": "starting_extended_point",
    "fr": "route_stops"
}

# Rename in place; mapping keys that are not columns of df are ignored by rename.
df.rename(columns=column_mapping, inplace=True)

In [None]:
def safe_to_datetime(series, min_timestamp=1703980800, max_timestamp=1735756800):
    """Convert Unix timestamps in seconds to datetimes, blanking implausible values.

    Values are first coerced to numbers. Anything non-numeric, or outside the
    inclusive range ``[min_timestamp, max_timestamp]``, becomes ``NaT``. This
    includes the ``0`` placeholder used in the raw records.

    Parameters
    ----------
    series : pandas.Series
        Timestamps as numbers or numeric strings. A series that already has a
        datetime dtype is returned unchanged, so the conversion can be re-run.
    min_timestamp : int, optional
        Smallest accepted timestamp. The default 1703980800 is
        2023-12-31 00:00:00 UTC.
    max_timestamp : int, optional
        Largest accepted timestamp. The default 1735756800 is
        2025-01-01 18:40:00 UTC.

    Returns
    -------
    pandas.Series
        Datetime series with ``NaT`` wherever the input was rejected.
    """
    # Already a datetime series: return it as is.
    if pd.api.types.is_datetime64_any_dtype(series):
        return series

    seconds = pd.to_numeric(series, errors="coerce")

    # between() is inclusive on both ends and False for NaN.
    in_range = seconds.between(min_timestamp, max_timestamp)

    return pd.to_datetime(seconds.where(in_range), unit="s", errors="coerce")

# Train-level columns that hold Unix timestamps.
time_columns = [
    "scheduled_departure_time",
    "scheduled_arrival_time",
    "adjusted_scheduled_arrival",
    "adjusted_scheduled_departure",
]

# Replace each raw timestamp column with its datetime version.
for column_name in time_columns:
    raw_values = df[column_name]
    df[column_name] = safe_to_datetime(raw_values)

In [None]:
def convert_hhmmss_to_datetime(time_series, reference_series):
    """Attach an ``HH:MM:SS`` clock time to the calendar date of a reference datetime.

    ``time_series`` is parsed with the ``%H:%M:%S`` format and
    ``reference_series`` as datetimes. The date part of the reference and the
    parsed time are joined as text and parsed again as a full timestamp.
    Rows where either part is missing or unparseable come out as ``NaT``.
    """
    dates = pd.to_datetime(reference_series, errors="coerce")
    clock_times = pd.to_datetime(time_series, format="%H:%M:%S", errors="coerce").dt.time

    date_text = dates.dt.strftime("%Y-%m-%d")
    clock_text = clock_times.astype(str)
    # A missing date yields NaN here, and a missing time yields text that fails the final parse.
    stamp_text = date_text + " " + clock_text

    return pd.to_datetime(stamp_text, format="%Y-%m-%d %H:%M:%S", errors="coerce")

# Pairs of (HH:MM:SS column to convert, datetime column that supplies its calendar date).
for clock_column, date_column in (
    ("official_scheduled_arrival", "scheduled_arrival_time"),
    ("official_planned_departure", "scheduled_departure_time"),
):
    df[clock_column] = convert_hhmmss_to_datetime(df[clock_column], df[date_column])

In [None]:
# One row per (train, stop): each element of the route_stops list gets its own row,
# with the train-level columns repeated alongside it.
df_exploded = df.explode("route_stops").reset_index(drop=True)

# Drop rows that have no stop to normalize (route_stops is missing).
# The index is not reset after this filter, so its labels may have gaps.
df_exploded = df_exploded[df_exploded["route_stops"].notna()]

# Flatten each stop record into one column per key.
df_stops = pd.json_normalize(df_exploded["route_stops"])

# Short keys used in the stop records -> descriptive column names.
# Here "ra"/"rp" are named as arrival/departure delays.
stop_column_mapping = {
    "n": "stop_name",
    "ra": "stop_arrival_delay",
    "rp": "stop_departure_delay",
    "br": "actual_platform",
    "bp": "planned_platform",
    "oa": "stop_arrival_time",
    "op": "stop_departure_time"
}

df_stops.rename(columns=stop_column_mapping, inplace=True)

# Carry the train identifiers over by position (.values) rather than by index label:
# df_exploded still has the labels left by the filter above, and both frames hold
# the same stops in the same order.
df_stops["train_id"] = df_exploded["train_id"].values
df_stops["train_number"] = df_exploded["train_number"].values

# Rebinds time_columns (previously the train-level list) to the stop-level timestamp columns.
time_columns = ["stop_arrival_time", "stop_departure_time"]
for col in time_columns:
    df_stops[col] = safe_to_datetime(df_stops[col])

In [None]:
# Write both tables to OUTPUT_PATH as parquet, leaving the index out.
for table, file_name in ((df, "train_data.parquet"), (df_stops, "train_stops.parquet")):
    table.to_parquet(OUTPUT_PATH / file_name, index=False)

print("Dati salvati con successo")

✅ Dati salvati con successo!


## Pulizia

In [None]:
# Folder holding the interim parquet files read below.
INTERIM_PATH = Path("data/interim")
# Folder for processed outputs; created here (with parents) if it does not exist yet.
PROCESSED_PATH = Path("data/processed")
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

# Reload both interim tables from disk as the starting point for cleaning.
df_cleaned = pd.read_parquet(INTERIM_PATH / "train_data.parquet")
df_stops = pd.read_parquet(INTERIM_PATH / "train_stops.parquet")

In [None]:
# Missing values per column in the train table.
df_cleaned.isna().sum()


train_id                              0
train_number                          0
departure_station                     0
train_departure_delay                 0
arrival_station                       0
train_arrival_platform                0
train_status                    2820398
train_class                           0
scheduled_departure_time              0
scheduled_arrival_time                0
route_stops                         115
delay_info                      2802422
origin_station                  2802126
final_destination               2802126
connected_train                 2813986
train_subclass                  2754555
adjusted_scheduled_arrival      2844542
extended_final_destination      2842527
official_scheduled_arrival      2842534
starting_extended_point         2842341
official_planned_departure      2842341
adjusted_scheduled_departure    2844861
dtype: int64

In [113]:
# Missing values per column in the stops table.
df_stops.isna().sum()

stop_name                      0
stop_arrival_delay             0
stop_departure_delay           0
stop_arrival_time        2806447
stop_departure_time      2829411
actual_platform         32566314
planned_platform        32614781
train_id                       0
train_number                   0
dtype: int64

In [114]:
# (rows, columns) of the train table before any cleaning step.
df_cleaned.shape

(2845355, 22)

In [115]:
# (rows, columns) of the stops table before any cleaning step.
df_stops.shape

(32995541, 9)

In [None]:
# Row counts before cleaning; these are the baseline for the loss report further down.
initial_rows_df, initial_rows_df_stops = len(df_cleaned), len(df_stops)

Rimuoviamo i treni soppressi

In [None]:
# Keep every train whose status is not "Soppresso" (cancelled).
# Rows with a missing status do not match and are therefore kept.
is_suppressed = df_cleaned["train_status"].eq("Soppresso")
df_cleaned = df_cleaned[~is_suppressed]

In [None]:
# Columns discarded because they are mostly missing or carry no useful signal.
columns_to_drop = [
    "train_status",
    "delay_info",  # free text, not useful for prediction
    "origin_station",  # few valid values and redundant
    "final_destination",  # few valid values and redundant
    "connected_train",
    "train_subclass",
    "adjusted_scheduled_arrival",
    "extended_final_destination",
    "official_scheduled_arrival",
    "starting_extended_point",
    "official_planned_departure",
    "adjusted_scheduled_departure",
]
df_cleaned = df_cleaned.drop(columns=columns_to_drop)

train_id                    0
train_number                0
departure_station           0
train_departure_delay       0
arrival_station             0
train_arrival_platform      0
train_class                 0
scheduled_departure_time    0
scheduled_arrival_time      0
route_stops                 0
dtype: int64


In [None]:
# Missing values per column after dropping the sparse columns.
print(df_cleaned.isna().sum())

In [None]:
# Platform columns are missing in more than 99% of the stops, so they are discarded.
columns_to_drop_stops = [
    "actual_platform",
    "planned_platform",
]
df_stops = df_stops.drop(columns=columns_to_drop_stops)

Il problema è che alcune fermate previste sono state cancellate e quindi il treno non ha effettuato la sosta in quelle stazioni. Nell'esempio, vediamo che il treno 2855 doveva arrivare a Milano Rogoredo, ma è stato cancellato su quella tratta e termina il viaggio a Milano Centrale.

<div style="max-height: 300px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;" 
     onmouseover="this.style.overflowY='auto'" 
     onmouseout="this.style.overflowY='hidden'">

```json
 {
      "_id": "2855-1704086040-Q09MSUNP",
      "n": "2855",
      "p": "COLICO",
      "rp": "1",
      "a": "MILANO CENTRALE",
      "ra": "-3",
      "dl": "Treno cancellato da SESTO S. GIOVANNI a MILANO ROGOREDO. Il treno oggi arriva a MILANO CENTRALE.",
      "c": "REG",
      "oo": "COLICO",
      "od": "MILANO ROGOREDO",
      "cn": "73467,MILANO GRECO PIRELLI",
      "op": 1704086040,
      "oa": 1704093120,
      "fr": [
        {
          "n": "COLICO",
          "ra": "N",
          "rp": "1",
          "oa": 0,
          "op": 1704086040
        },
        {
          "n": "PIONA",
          "ra": "-2",
          "rp": "1",
          "oa": 1704086340,
          "op": 1704086400
        },
        {
          "n": "DORIO",
          "ra": "0",
          "rp": "-1",
          "oa": 1704086700,
          "op": 1704086760
        },
        {
          "n": "DERVIO",
          "ra": "-3",
          "rp": "0",
          "oa": 1704087060,
          "op": 1704087120
        },
        {
          "n": "BELLANO TARTAVELLE TERME",
          "ra": "-1",
          "rp": "5",
          "oa": 1704087480,
          "op": 1704087540
        },
        {
          "n": "VARENNA ESINO",
          "ra": "4",
          "rp": "5",
          "oa": 1704087780,
          "op": 1704087840
        },
        {
          "n": "FIUMELATTE",
          "ra": "4",
          "rp": "4",
          "oa": 1704087990,
          "op": 1704088020
        },
        {
          "n": "LIERNA",
          "ra": "3",
          "rp": "4",
          "oa": 1704088320,
          "op": 1704088380
        },
        {
          "n": "OLCIO",
          "ra": "3",
          "rp": "3",
          "oa": 1704088590,
          "op": 1704088620
        },
        {
          "n": "MANDELLO DEL LARIO",
          "ra": "1",
          "rp": "2",
          "oa": 1704088860,
          "op": 1704088920
        },
        {
          "n": "ABBADIA LARIANA",
          "ra": "1",
          "rp": "6",
          "oa": 1704089160,
          "op": 1704089400
        },
        {
          "n": "LECCO",
          "ra": "5",
          "rp": "4",
          "oa": 1704089820,
          "op": 1704090000
        },
        {
          "n": "CALOLZIOCORTE OLGINATE",
          "ra": "2",
          "rp": "3",
          "oa": 1704090420,
          "op": 1704090480
        },
        {
          "n": "CERNUSCO-MERATE",
          "ra": "1",
          "rp": "1",
          "oa": 1704091080,
          "op": 1704091140
        },
        {
          "n": "CARNATE USMATE",
          "ra": "-3",
          "rp": "-1",
          "oa": 1704091560,
          "op": 1704091620
        },
        {
          "n": "MONZA",
          "ra": "-2",
          "rp": "1",
          "oa": 1704092160,
          "op": 1704092220
        },
        {
          "n": "SESTO S. GIOVANNI",
          "ra": "-1",
          "rp": "1",
          "oa": 1704092520,
          "op": 1704092580
        },
        {
          "n": "MILANO LAMBRATE",
          "ra": "S",
          "rp": "S",
          "oa": 1704093240,
          "op": 1704093300
        },
        {
          "n": "MILANO FORLANINI",
          "ra": "S",
          "rp": "S",
          "oa": 1704093540,
          "op": 1704093600
        },
        {
          "n": "MILANO ROGOREDO",
          "ra": "S",
          "rp": "S",
          "oa": 1704093960,
          "op": 0
        },
        {
          "n": "MILANO CENTRALE",
          "ra": "-3",
          "rp": "N",
          "oa": 1704093120,
          "op": 0
        }
      ]
    }
```

</div>

Soluzione: cancelliamo le fermate saltate.

In [None]:
# A stop was skipped when either delay field carries the marker "S" (in any letter case).
arrival_skipped = df_stops["stop_arrival_delay"].astype(str).str.upper().eq("S")
departure_skipped = df_stops["stop_departure_delay"].astype(str).str.upper().eq("S")

# Keep only the stops that are flagged on neither side.
df_stops = df_stops[~(arrival_skipped | departure_skipped)]

print(f"Fermate saltate rimosse. Il dataset finale contiene {len(df_stops)} fermate.")


✅ Fermate saltate rimosse. Il dataset finale contiene 32545672 fermate.


Le fermate che hanno ra = "n.d." (ritardo all'arrivo non disponibile) o rp = "n.d." (ritardo alla partenza non disponibile) rappresentano un caso particolare.
Nell'esempio, la fermata di Serravalle Scrivia ha ra = "n.d.", il che significa che il ritardo all'arrivo in quella stazione non è noto.

<div style="max-height: 300px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">

```json
{
      "_id": "2116-1704086820-R0VOT1ZBIFAuUFJJTkNJUEU=",
      "n": "2116",
      "p": "GENOVA P.PRINCIPE",
      "rp": "2",
      "a": "TORINO P.NUOVA",
      "ra": "-3",
      "c": "REG",
      "op": 1704086820,
      "oa": 1704094200,
      "fr": [
        {
          "n": "GENOVA P.PRINCIPE",
          "ra": "N",
          "rp": "2",
          "oa": 0,
          "op": 1704086820
        },
        {
          "n": "RONCO SCR.",
          "ra": "0",
          "rp": "2",
          "oa": 1704088200,
          "op": 1704088260
        },
        {
          "n": "ARQUATA SCRIVIA",
          "ra": "1",
          "rp": "4",
          "oa": 1704088740,
          "op": 1704088800
        },
        {
          "n": "SERRAVALLE SCRIVIA",
          "ra": "n.d.",
          "rp": "1",
          "oa": 1704089160,
          "op": 1704089220
        },
        ...
      ]
    }
```

</div>

Soluzione: Sostituire "n.d." con NaN (np.nan). 
Questo permette ai modelli di machine learning di gestire il valore come mancante senza distorcere i dati.

In [None]:
import numpy as np

# Make both delay columns numeric: "n.d." (not available) becomes NaN first,
# then any value that is still non-numeric is coerced to NaN as well.
for delay_column in ("stop_arrival_delay", "stop_departure_delay"):
    delay_values = df_stops[delay_column].replace("n.d.", np.nan)
    df_stops[delay_column] = pd.to_numeric(delay_values, errors="coerce")

print(f"Fermate con 'n.d.' gestite correttamente. Il dataset finale contiene {len(df_stops)} fermate.")

✅ Fermate con 'n.d.' gestite correttamente. Il dataset finale contiene 32545672 fermate.


Come gestire l'orario di arrivo uguale a zero nella stazione di partenza e l'orario di partenza uguale a zero nella stazione di arrivo:
- Lasciare i valori mancanti come NaT (Not a Time): Nei dataframe di pandas, NaT è lo standard per indicare un timestamp mancante, esattamente come NaN per i numeri. I modelli avanzati di machine learning possono gestire NaT senza problemi, mentre un valore errato come "0" potrebbe compromettere le previsioni.

- Aggiungere una colonna is_terminal_stop: Questa colonna booleana (True/False) indica se la fermata è la stazione iniziale o finale della tratta. In questo modo, possiamo identificare facilmente le fermate dove i valori di arrivo/partenza sono mancanti in modo legittimo.

In [None]:
# Flag a stop as terminal when its arrival time or its departure time is missing.
missing_arrival = df_stops["stop_arrival_time"].isna()
missing_departure = df_stops["stop_departure_time"].isna()
df_stops["is_terminal_stop"] = missing_arrival | missing_departure

# Make sure both time columns are datetimes, with NaT for anything that cannot be parsed.
for time_column in ("stop_arrival_time", "stop_departure_time"):
    df_stops[time_column] = pd.to_datetime(df_stops[time_column], errors="coerce")

stop_name                     0
stop_arrival_delay            0
stop_departure_delay          0
stop_arrival_time       2765491
stop_departure_time     2783550
train_id                      0
train_number                  0
is_terminal_stop              0
dtype: int64


In [None]:
# Missing values per column in the cleaned stops table.
print(df_stops.isna().sum())

In [None]:
# Row counts once cleaning is done, compared below with the initial counts.
final_rows_df, final_rows_df_stops = len(df_cleaned), len(df_stops)

In [None]:
# Absolute number of rows removed from each table by the cleaning steps.
rows_lost_df = initial_rows_df - final_rows_df
rows_lost_stops = initial_rows_df_stops - final_rows_df_stops

# Share of the initial rows that was removed, as a percentage.
loss_percentage_df = (1 - final_rows_df / initial_rows_df) * 100
loss_percentage_stops = (1 - final_rows_df_stops / initial_rows_df_stops) * 100

# NOTE(review): the second message says "righe" where the first says "rows".
print(f"Data lost in the train dataset: {rows_lost_df} rows ({loss_percentage_df:.2f}%)")
print(f"Data lost in the train stops dataset: {rows_lost_stops} righe ({loss_percentage_stops:.2f}%)")

Dati persi nel dataset dei treni: 24957 righe (0.88%)
Dati persi nel dataset delle fermate: 449869 righe (1.36%)


In [None]:
# The cleaned tables are written next to the interim files.
INTERIM_PATH = Path("data/interim")
INTERIM_PATH.mkdir(parents=True, exist_ok=True)

for table, file_name in (
    (df_cleaned, "train_data_cleaned.parquet"),
    (df_stops, "train_stops_cleaned.parquet"),
):
    table.to_parquet(INTERIM_PATH / file_name, index=False)

print("Cleaned datasets successfully saved in 'data/interim'")


Dataset puliti salvati con successo in 'data/interim'
