RICAVARE I DATI GEOGRAFICI

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import googlemaps
import time
from tqdm import tqdm
import plotly.express as px
from plotly.subplots import make_subplots
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from meteostat import Hourly, Stations
from datetime import datetime, timezone
import networkx as nx
from geopy.distance import geodesic


In [2]:
load_dotenv()
api_key = os.getenv('GOOGLE_API_KEY')

In [3]:
gmaps = googlemaps.Client(key=api_key)

In [3]:
INTERIM_PATH = Path("data/interim")

In [4]:
PROCESSED_PATH = Path("data/processed")

In [5]:
df = pd.read_parquet(INTERIM_PATH / "train_data_cleaned.parquet")

In [6]:
print(df.shape)

(27973425, 15)


In [None]:
# unique_stops = df['stop_name'].unique()
# print(f"Number of unique stops: {len(unique_stops)}")

# def get_coordinates_from_google(stop_name):
#     try:
#         geocode_result = gmaps.geocode(stop_name)
#         if geocode_result:
#             lat = geocode_result[0]['geometry']['location']['lat']
#             lon = geocode_result[0]['geometry']['location']['lng']
#             return lat, lon
#         return None, None
#     except Exception as e:
#         print(f"Error retrieving for {stop_name}: {e}")
#         return None, None

# coordinates_list = []

# for stop in tqdm(unique_stops, desc="Retrieving coordinates", unit="stop"):
#     lat, lon = get_coordinates_from_google(stop)
#     coordinates_list.append({'stop_name': stop, 'latitude': lat, 'longitude': lon})
    
#     # Pause between requests to avoid exceeding the request limit
#     time.sleep(1)

# coordinates_df = pd.DataFrame(coordinates_list)
# print(coordinates_df.head())

In [14]:
# Funzione per ottenere le coordinate di una fermata
def get_coordinates_from_google(stop_name, first_stop_name=None):
    try:
        # Se esiste una fermata iniziale, la includiamo nella query per migliorare la precisione
        if first_stop_name:
            query = f"{stop_name}, Italia, stazione, {first_stop_name}"
        else:
            query = f"{stop_name}, Italia, stazione"
        
        geocode_result = gmaps.geocode(query, region="it")

        if geocode_result:
            lat = geocode_result[0]['geometry']['location']['lat']
            lon = geocode_result[0]['geometry']['location']['lng']
            return lat, lon
        return None, None
    except Exception as e:
        print(f"Error retrieving for {stop_name}: {e}")
        return None, None

In [15]:
train_groups = df.groupby('train_number')['stop_name'].apply(lambda x: list(dict.fromkeys(x))).to_dict()

In [16]:
print(f"Number of trains: {len(train_groups)}")
print("First 20 train stop lists:")
for train, stops in list(train_groups.items())[:20]:
    print(f"Train {train}: {stops}")

Number of trains: 16928
First 20 train stop lists:
Train 10: ['MILANO CENTRALE', 'COMO S. GIOVANNI', 'CHIASSO']
Train 10001: ['CREMONA', 'VILLETTA MALAGNINO', 'GAZZO PIEVE S. GIACOMO', 'TORRE DE` PICENARDI', 'PIADENA', 'BOZZOLO', 'MARCARIA', 'S.MICHELE IN BOSCO', 'OSPITALETTO MANTOVANO', 'CASTELLUCCHIO', 'MANTOVA']
Train 10002: ['PIADENA', 'TORRE DE` PICENARDI', 'GAZZO PIEVE S. GIACOMO', 'VILLETTA MALAGNINO', 'CREMONA']
Train 10004: ['BOZZOLO', 'PIADENA', 'CREMONA', 'MANTOVA', 'CASTELLUCCHIO', 'MARCARIA']
Train 10005: ['CREMONA', 'VILLETTA MALAGNINO', 'GAZZO PIEVE S. GIACOMO', 'TORRE DE` PICENARDI', 'PIADENA', 'BOZZOLO', 'MARCARIA', 'S.MICHELE IN BOSCO', 'OSPITALETTO MANTOVANO', 'CASTELLUCCHIO', 'MANTOVA']
Train 10006: ['BOZZOLO', 'PIADENA', 'TORRE DE` PICENARDI', 'GAZZO PIEVE S. GIACOMO', 'VILLETTA MALAGNINO', 'CREMONA', 'MANTOVA', 'CASTELLUCCHIO', 'OSPITALETTO MANTOVANO', 'S.MICHELE IN BOSCO', 'MARCARIA']
Train 10007: ['CREMONA', 'VILLETTA MALAGNINO', 'GAZZO PIEVE S. GIACOMO', 'TORRE

In [17]:
# Dizionario per evitare richieste duplicate
station_coordinates = {}

# Lista per salvare le coordinate
coordinates_list_google = []

total_stops = sum(len(stops) for stops in train_groups.values())

In [None]:
with tqdm(total=total_stops, desc="Retrieving coordinates", unit="stop") as pbar:
    for train_number, stops in train_groups.items():
        # La prima fermata è la nostra "fermata di riferimento"
        first_stop_name = stops[0]

        # Se la fermata non è già stata trovata, la cerchiamo
        if first_stop_name not in station_coordinates:
            lat, lon = get_coordinates_from_google(first_stop_name)
            station_coordinates[first_stop_name] = (lat, lon)
        else:
            lat, lon = station_coordinates[first_stop_name]

        coordinates_list_google.append({'train_number': train_number, 'stop_name': first_stop_name, 'latitude': lat, 'longitude': lon})
        pbar.update(1)

        # Ora cerchiamo le coordinate delle fermate successive
        for stop in stops[1:]:
            if stop not in station_coordinates:
                lat, lon = get_coordinates_from_google(stop, first_stop_name)
                station_coordinates[stop] = (lat, lon)
            else:
                lat, lon = station_coordinates[stop]

            coordinates_list_google.append({'train_number': train_number, 'stop_name': stop, 'latitude': lat, 'longitude': lon})
            pbar.update(1)

        # Pausa tra le richieste per evitare di superare il limite delle API
        time.sleep(0.2)

Retrieving coordinates: 100%|██████████| 185884/185884 [1:02:19<00:00, 49.71stop/s]


In [19]:
coordinates_df = pd.DataFrame(coordinates_list_google)

In [20]:
coordinates_df.to_parquet(PROCESSED_PATH / "coordinates_df_google.parquet", index=False)

In [21]:
coordinates_df.head()

Unnamed: 0,train_number,stop_name,latitude,longitude
0,10,MILANO CENTRALE,45.483971,9.206097
1,10,COMO S. GIOVANNI,45.80902,9.072796
2,10,CHIASSO,45.487137,9.204822
3,10001,CREMONA,45.133313,10.022704
4,10001,VILLETTA MALAGNINO,45.13687,10.11202


Create map for train stops distribution using density_mapbox

In [22]:
fig_stops = px.density_map(
    coordinates_df,
    lat='latitude',
    lon='longitude',
    hover_name="stop_name",
    title="Train Stops Distribution",
    radius=10,
    opacity=0.6,
    zoom=6,
    map_style="carto-positron")
fig_stops.update_layout(height=900)
fig_stops.update_layout(width=1200)

fig_stops.show()

In [8]:
coordinates_df = pd.read_parquet(PROCESSED_PATH / "coordinates_df_google.parquet")

In [9]:
coordinates_df.loc[coordinates_df['stop_name'] == 'BARCELLONA', ['latitude', 'longitude']] = [38.1538275359411, 15.199637501662098]

In [None]:
fig_stops = px.density_map(
    coordinates_df,
    lat='latitude',
    lon='longitude',
    hover_name="stop_name",
    title="Train Stops Distribution",
    radius=10,
    opacity=0.6,
    zoom=6,
    map_style="carto-positron")
fig_stops.update_layout(height=900)
fig_stops.update_layout(width=1200)

fig_stops.show()

In [None]:
# coordinates_df_google.to_parquet(INTERIM_PATH / "coordinates_df_google.parquet", index=False)

# print("Coordinates datasets successfully saved in 'data/interim'")

Coordinates datasets successfully saved in 'data/interim'


In [None]:
df_with_coordinates = pd.merge(df, coordinates_df, on='stop_name', how='left')

In [None]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,is_terminal_stop,latitude,longitude
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,0.0,1.0,NaT,2024-09-30 22:38:00,True,44.505907,11.343369
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,2.0,2.0,2024-09-30 22:45:00,2024-09-30 22:46:00,False,44.478898,11.416181
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,3.0,2.0,2024-09-30 22:51:00,2024-09-30 22:52:00,False,44.450703,11.487476
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,3.0,4.0,2024-09-30 22:58:00,2024-09-30 22:59:00,False,44.407242,11.597668
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,3.0,4.0,2024-09-30 23:07:00,2024-09-30 23:08:00,False,44.359228,11.718817


In [None]:
# Calculation and visualization of missing values
missing_values = df_with_coordinates.isnull().sum()
missing_percentage = (missing_values / len(df_with_coordinates)) * 100

missing_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Percentage": missing_percentage.round(2)
})

missing_summary = missing_summary[missing_summary["Missing Values"] > 0]
print("\n Null values in the final dataset:")
print(missing_summary)


 Null values in the final dataset:
                     Missing Values  Percentage
stop_arrival_time           2756872        9.86
stop_departure_time         2774884        9.92
latitude                     285069        1.02
longitude                    285069        1.02


spiegare il drop delle righe senza coordinate

In [None]:
df_with_coordinates = df_with_coordinates.dropna(subset=["latitude", "longitude"])

In [None]:
# Calculation and visualization of missing values
missing_values = df_with_coordinates.isnull().sum()
missing_percentage = (missing_values / len(df_with_coordinates)) * 100

missing_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Percentage": missing_percentage.round(2)
})

missing_summary = missing_summary[missing_summary["Missing Values"] > 0]
print("\n Null values in the final dataset:")
print(missing_summary)


 Null values in the final dataset:
                     Missing Values  Percentage
stop_arrival_time           2736143        9.88
stop_departure_time         2753115        9.94


In [None]:
df_with_coordinates.to_parquet(INTERIM_PATH / "train_data_with_coordinates.parquet", index=False)

print("Datasets successfully saved in 'data/interim'")

Datasets successfully saved in 'data/interim'


passiamo alla mappa dei delay

In [None]:
stop_delays = df_with_coordinates.groupby(["stop_name", "latitude", "longitude"])["stop_arrival_delay"].mean().reset_index()

fig_delays = px.density_map(
    stop_delays, 
    lat="latitude", 
    lon="longitude", 
    hover_name="stop_name", 
    title="Average Train Delay Distribution by Stop", 
    radius=10, 
    opacity=0.6, 
    zoom=6,
    map_style="carto-positron",
)
fig_delays.update_layout(height=900)
fig_delays.update_layout(width=1200)

fig_delays.show()

**Time-Based Features**

In [69]:
df_with_coordinates = pd.read_parquet(INTERIM_PATH / "train_data_with_coordinates.parquet")

In [None]:
df_with_coordinates["hour"] = df_with_coordinates["scheduled_departure_time"].dt.hour
df_with_coordinates["day_of_week"] = df_with_coordinates["scheduled_departure_time"].dt.dayofweek  # Monday=0, Sunday=6
df_with_coordinates["is_weekend"] = df_with_coordinates["day_of_week"].isin([5, 6]).astype(int)
df_with_coordinates["month"] = df_with_coordinates["scheduled_departure_time"].dt.month

# Define rush hours (e.g., 7-9 AM, 5-7 PM)
df_with_coordinates["is_rush_hour"] = df_with_coordinates["hour"].isin([7, 8, 9, 17, 18, 19]).astype(int)

In [None]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,stop_arrival_time,stop_departure_time,is_terminal_stop,latitude,longitude,hour,day_of_week,is_weekend,month,is_rush_hour
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,NaT,2024-09-30 22:38:00,True,44.505907,11.343369,22,0,0,9,0
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,2024-09-30 22:45:00,2024-09-30 22:46:00,False,44.478898,11.416181,22,0,0,9,0
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,2024-09-30 22:51:00,2024-09-30 22:52:00,False,44.450703,11.487476,22,0,0,9,0
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,2024-09-30 22:58:00,2024-09-30 22:59:00,False,44.407242,11.597668,22,0,0,9,0
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,2024-09-30 23:07:00,2024-09-30 23:08:00,False,44.359228,11.718817,22,0,0,9,0


**Station-Specific Features**

In [None]:
# Count how often each station appears (proxy for congestion)
station_counts = df_with_coordinates["stop_name"].value_counts()
df_with_coordinates["station_traffic"] = df_with_coordinates["stop_name"].map(station_counts)

# Define high-traffic stations (above median frequency)
median_traffic = df_with_coordinates["station_traffic"].median()
df_with_coordinates["is_high_traffic_station"] = (df_with_coordinates["station_traffic"] >= median_traffic).astype(int)

In [None]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,is_terminal_stop,latitude,longitude,hour,day_of_week,is_weekend,month,is_rush_hour,station_traffic,is_high_traffic_station
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,True,44.505907,11.343369,22,0,0,9,0,161005,1
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,False,44.478898,11.416181,22,0,0,9,0,14336,0
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,False,44.450703,11.487476,22,0,0,9,0,14878,0
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,False,44.407242,11.597668,22,0,0,9,0,27351,1
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,False,44.359228,11.718817,22,0,0,9,0,42396,1


**Delay Propagation Features**

In [None]:
# Compute difference between arrival and departure delay
df_with_coordinates["delay_change"] = df_with_coordinates["stop_arrival_delay"] - df_with_coordinates["stop_departure_delay"]

# Flag increasing delay
df_with_coordinates["is_delay_increasing"] = (df_with_coordinates["delay_change"] > 0).astype(int)

# Rolling delay average (captures delay trends within a train's route)
df_with_coordinates["rolling_arrival_delay"] = df_with_coordinates.groupby("train_id")["stop_arrival_delay"].shift(1).rolling(3).mean()
df_with_coordinates["rolling_departure_delay"] = df_with_coordinates.groupby("train_id")["stop_departure_delay"].shift(1).rolling(3).mean()

In [None]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,day_of_week,is_weekend,month,is_rush_hour,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,0,0,9,0,161005,1,-1.0,0,,
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,0,0,9,0,14336,0,0.0,0,,
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,0,0,9,0,14878,0,1.0,1,,
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,0,0,9,0,27351,1,-1.0,0,1.666667,1.666667
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,0,0,9,0,42396,1,-1.0,0,2.666667,2.666667


**Historical Delay Trends**

In [None]:
df_with_coordinates["historical_avg_delay"] = df_with_coordinates.groupby(["stop_name", "hour"])["stop_arrival_delay"].transform("mean")

In [None]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,is_weekend,month,is_rush_hour,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,0,9,0,161005,1,-1.0,0,,,0.297568
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,0,9,0,14336,0,0.0,0,,,11.009852
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,0,9,0,14878,0,1.0,1,,,11.737624
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,0,9,0,27351,1,-1.0,0,1.666667,1.666667,10.546798
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,0,9,0,42396,1,-1.0,0,2.666667,2.666667,9.186275


**Length of stay by individual station**

In [None]:
# Planned stop duration
df_with_coordinates["planned_stop_duration"] = (df_with_coordinates["stop_departure_time"] - df_with_coordinates["stop_arrival_time"]).dt.total_seconds() / 60

# Convert delays (minutes) to timedelta
df_with_coordinates["stop_departure_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["stop_departure_delay"], unit="m")
df_with_coordinates["stop_arrival_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["stop_arrival_delay"], unit="m")

df_with_coordinates["actual_stop_duration"] = (
    (df_with_coordinates["stop_departure_time"] + df_with_coordinates["stop_departure_delay_timedelta"]) - 
    (df_with_coordinates["stop_arrival_time"] + df_with_coordinates["stop_arrival_delay_timedelta"])
).dt.total_seconds() / 60

df_with_coordinates["planned_vs_actual_stop_duration_ratio"] = df_with_coordinates["actual_stop_duration"] / df_with_coordinates["planned_stop_duration"]

# Handle division by zero or NaN values (avoid infinities)
df_with_coordinates["planned_vs_actual_stop_duration_ratio"] = df_with_coordinates["planned_vs_actual_stop_duration_ratio"].replace([np.inf, -np.inf], np.nan)
df_with_coordinates["planned_vs_actual_stop_duration_ratio"] = df_with_coordinates["planned_vs_actual_stop_duration_ratio"].fillna(1)  # Default to 1 when missing data

# Drop temporary columns
df_with_coordinates = df_with_coordinates.drop(columns=["stop_departure_delay_timedelta", "stop_arrival_delay_timedelta"])

In [None]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,161005,1,-1.0,0,,,0.297568,,,1.0
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,14336,0,0.0,0,,,11.009852,1.0,1.0,1.0
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,14878,0,1.0,1,,,11.737624,1.0,0.0,0.0
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,27351,1,-1.0,0,1.666667,1.666667,10.546798,1.0,2.0,2.0
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,42396,1,-1.0,0,2.666667,2.666667,9.186275,1.0,2.0,2.0


In [None]:
df_with_coordinates.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio
count,27688360.0,27688360.0,27688356,27688356,27688360.0,27688360.0,24952213,24935241,27688360.0,27688360.0,...,27688360.0,27688360.0,27688360.0,27688360.0,19388000.0,19388000.0,27688360.0,22199110.0,22199110.0,27688360.0
mean,2.636891,2.440116,2024-06-29 19:03:39.081762816,2024-06-29 20:52:43.393782528,2.897059,3.987473,2024-06-29 20:03:47.983377920,2024-06-29 19:59:20.907042304,43.6538,11.29434,...,41073.11,0.5000799,-1.090415,0.1278731,3.249732,4.60491,2.897059,1.398718,2.706198,1.966224
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.732,6.709861,...,1.0,0.0,-278.0,0.0,-10.0,-10.0,-10.0,-437799.0,-437799.0,-371.0
25%,1.0,-2.0,2024-03-29 05:54:00,2024-03-29 07:40:00,0.0,1.0,2024-03-29 06:28:00,2024-03-29 06:22:00,41.90129,9.175534,...,11663.0,0.0,-2.0,0.0,0.0,1.333333,1.63364,1.0,1.0,1.0
50%,1.0,1.0,2024-06-27 05:02:00,2024-06-27 06:50:00,1.0,2.0,2024-06-27 05:40:00,2024-06-27 05:35:00,44.4507,11.1966,...,23860.0,1.0,-1.0,0.0,1.666667,2.666667,2.694897,1.0,2.0,2.0
75%,3.0,3.0,2024-10-01 05:44:00,2024-10-01 07:35:00,4.0,5.0,2024-10-01 07:38:00,2024-10-01 07:32:00,45.48475,12.66206,...,51728.0,1.0,0.0,0.0,4.0,5.333333,3.934874,1.0,3.0,3.0
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00242,18.34687,...,255991.0,1.0,300.0,1.0,296.6667,297.0,295.0,482408.0,482409.0,351.0
std,4.664859,9.438597,,,7.439274,7.157521,,,2.241428,2.424543,...,44140.16,0.5,3.872542,0.3339485,7.127459,7.306664,1.94673,285.2545,285.263,1.83483


**Total Travel Duration Features**

In [None]:
# Planned travel duration (entire journey)
df_with_coordinates["scheduled_total_duration"] = (df_with_coordinates["scheduled_arrival_time"] - df_with_coordinates["scheduled_departure_time"]).dt.total_seconds() / 60

# Convert delays (minutes) to timedelta
df_with_coordinates["train_departure_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["train_departure_delay"], unit="m")
df_with_coordinates["train_arrival_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["train_arrival_delay"], unit="m")

df_with_coordinates["actual_total_duration"] = (
    (df_with_coordinates["scheduled_departure_time"] + df_with_coordinates["train_departure_delay_timedelta"]) - 
    (df_with_coordinates["scheduled_arrival_time"] + df_with_coordinates["train_arrival_delay_timedelta"])
).dt.total_seconds() / 60

# Ratio between actual and planned total duration
df_with_coordinates["planned_vs_actual_total_ratio"] = df_with_coordinates["actual_total_duration"] / df_with_coordinates["scheduled_total_duration"]

# Handle division by zero or NaN values
df_with_coordinates["planned_vs_actual_total_ratio"] = df_with_coordinates["planned_vs_actual_total_ratio"].replace([np.inf, -np.inf], np.nan)
df_with_coordinates["planned_vs_actual_total_ratio"] = df_with_coordinates["planned_vs_actual_total_ratio"].fillna(1)  # Default to 1 when missing data

# Drop temporary columns
df_with_coordinates = df_with_coordinates.drop(columns=["train_departure_delay_timedelta", "train_arrival_delay_timedelta"])

In [None]:
df_with_coordinates.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio
count,27016230.0,27016230.0,27016228,27016228,27016230.0,27016230.0,24357569,24339834,27016230.0,27016230.0,...,27016230.0,18722680.0,18722680.0,27016230.0,21681180.0,21681180.0,27016230.0,27016230.0,27016230.0,27016230.0
mean,2.650029,2.43478,2024-06-29 21:30:35.706082560,2024-06-29 23:20:18.895126784,2.90515,4.005559,2024-06-29 22:13:22.531706624,2024-06-29 22:42:49.601634048,43.69838,11.22981,...,0.1247293,3.279785,4.641935,2.90515,1.407259,2.721434,1.964246,109.7198,-109.5046,-0.9915271
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.7204,6.703205,...,0.0,-10.0,-10.0,-10.0,-437799.0,-437799.0,-371.0,-3705.0,-4313.0,-30.66667
25%,1.0,-2.0,2024-03-29 06:39:00,2024-03-29 08:26:00,0.0,1.0,2024-03-29 07:08:00,2024-03-29 07:18:00,41.92781,9.187344,...,0.0,0.0,1.333333,1.643377,1.0,1.0,1.0,61.0,-125.0,-1.015873
50%,1.0,1.0,2024-06-27 07:09:00,2024-06-27 08:57:00,1.0,2.0,2024-06-27 07:36:00,2024-06-27 08:08:00,44.48262,11.14876,...,0.0,1.666667,2.666667,2.718004,1.0,2.0,2.0,83.0,-83.0,-0.986014
75%,3.0,3.0,2024-10-01 08:53:00,2024-10-01 10:45:00,4.0,5.0,2024-10-01 10:24:00,2024-10-01 10:52:22.500000,45.48588,12.6184,...,0.0,4.0,5.333333,3.950429,1.0,3.0,3.0,126.0,-60.0,-0.9565217
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00374,18.36933,...,1.0,296.6667,296.6667,200.0,482408.0,482409.0,351.0,4312.0,3707.0,185.0
std,4.693856,9.47785,,,7.472914,7.178877,,,2.2452,2.362223,...,0.3304117,7.168221,7.347001,1.933122,288.6414,288.6499,1.831921,99.04845,100.8322,0.1366434


**Extreme Delay Flag**

In [None]:
# Define extreme delay threshold (e.g., top 5% of delays)
extreme_delay_threshold = df_with_coordinates["stop_arrival_delay"].quantile(0.95)
df_with_coordinates["is_extreme_delay"] = (df_with_coordinates["stop_arrival_delay"] >= extreme_delay_threshold).astype(int)

In [None]:
df_with_coordinates.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay
count,27688360.0,27688360.0,27688356,27688356,27688360.0,27688360.0,24952213,24935241,27688360.0,27688360.0,...,19388000.0,19388000.0,27688360.0,22199110.0,22199110.0,27688360.0,27688360.0,27688360.0,27688360.0,27688360.0
mean,2.636891,2.440116,2024-06-29 19:03:39.081762816,2024-06-29 20:52:43.393782528,2.897059,3.987473,2024-06-29 20:03:47.983377920,2024-06-29 19:59:20.907042304,43.6538,11.29434,...,3.249732,4.60491,2.897059,1.398718,2.706198,1.966224,109.0719,-108.8751,-0.991959,0.0553044
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.732,6.709861,...,-10.0,-10.0,-10.0,-437799.0,-437799.0,-371.0,-3705.0,-4313.0,-30.66667,0.0
25%,1.0,-2.0,2024-03-29 05:54:00,2024-03-29 07:40:00,0.0,1.0,2024-03-29 06:28:00,2024-03-29 06:22:00,41.90129,9.175534,...,0.0,1.333333,1.63364,1.0,1.0,1.0,61.0,-124.0,-1.016129,0.0
50%,1.0,1.0,2024-06-27 05:02:00,2024-06-27 06:50:00,1.0,2.0,2024-06-27 05:40:00,2024-06-27 05:35:00,44.4507,11.1966,...,1.666667,2.666667,2.694897,1.0,2.0,2.0,83.0,-83.0,-0.9861111,0.0
75%,3.0,3.0,2024-10-01 05:44:00,2024-10-01 07:35:00,4.0,5.0,2024-10-01 07:38:00,2024-10-01 07:32:00,45.48475,12.66206,...,4.0,5.333333,3.934874,1.0,3.0,3.0,126.0,-60.0,-0.9570552,0.0
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00242,18.34687,...,296.6667,297.0,295.0,482408.0,482409.0,351.0,4312.0,3707.0,185.0,1.0
std,4.664859,9.438597,,,7.439274,7.157521,,,2.241428,2.424543,...,7.127459,7.306664,1.94673,285.2545,285.263,1.83483,98.52571,100.303,0.1357423,0.2285735


In [None]:
df_with_coordinates.to_parquet(INTERIM_PATH / "train_data_fe.parquet", index=False)

print("Datasets successfully saved in 'data/interim'")

Datasets successfully saved in 'data/interim'


**BUILD GRAPH**

Struttura del Grafo
Nodi: Ogni stazione ferroviaria è un nodo del grafo.

Archi: Una connessione tra due nodi esiste se un treno viaggia tra quelle stazioni.

Pesi degli archi: Possiamo assegnare come peso il ritardo medio accumulato su quella tratta.

Feature Estratte dal Grafo
Dopo aver costruito il grafo, possiamo calcolare:
- Degree Centrality → Misura la connettività di una stazione. Stazioni più connesse potrebbero essere più soggette a congestione.
- PageRank → Misura l'importanza della stazione nella rete ferroviaria (es. snodi principali come Milano Centrale avranno valori alti).
- Betweenness Centrality → Quante volte una stazione è "di passaggio" nei percorsi più brevi tra due stazioni?
- Shortest Path to Hub → Quanto una stazione è lontana dalla più importante del sistema?

In [None]:
import pandas as pd
import networkx as nx
from pathlib import Path
import geopandas as gpd
import keplergl

In [None]:
INTERIM_PATH = Path("data/interim")
PROCESSED_PATH = Path("data/processed")

In [None]:
df = pd.read_parquet(INTERIM_PATH / "train_data_fe.parquet")

Currently, the same train route (e.g., Bologna → Faenza) is added multiple times (once per day).
But the railway infrastructure doesn’t change daily → A train_number follows the same path every day.
By keeping only one occurrence per train_number, we:

Reduce computational cost significantly.

Avoid redundant edges (e.g., Bologna → Faenza being added 365 times).

In [None]:
# Consideriamo solo i dati del 2 gennaio 2024
df = df[df["scheduled_departure_time"].dt.date == pd.to_datetime("2024-01-02").date()]

In [None]:
# Ordinare il dataset per train_number e orario di partenza
df.sort_values(by=["train_number", "scheduled_departure_time"], inplace=True)

In [None]:
G = nx.DiGraph() 

In [None]:
# Dizionario per tenere traccia degli archi e dei numeri di treno associati
edges_dict = {}

# Iteriamo per ogni treno per costruire gli archi
for train_number, stops in df.groupby("train_number"):

    previous_station = None

    for _, row in stops.iterrows():
        current_station = row["stop_name"]
        # print(f"Current station: {current_station}")

        # Creiamo un nodo se non esiste (usiamo la prima apparizione della stazione)
        if current_station not in G:
            G.add_node(current_station)

        # Se c'è una stazione precedente, creiamo un arco orientato
        if previous_station:
            edge = (previous_station, current_station)

            # Se l'arco esiste già, aggiungiamo il numero del treno alla lista
            if edge in edges_dict:
                edges_dict[edge].add(train_number)
            else:
                edges_dict[edge] = {train_number}  # Inizializza con il primo treno

        # Aggiorniamo la stazione precedente
        previous_station = current_station

In [None]:
for (station_a, station_b), train_numbers in edges_dict.items():
    G.add_edge(station_a, station_b, trains=",".join(map(str, train_numbers)))

print(f"Graph successfully created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

Graph successfully created with 1812 nodes and 5544 edges.


In [None]:
# Salviamo il grafo in formato GraphML per un'eventuale visualizzazione con Gephi o NetworkX
nx.write_graphml(G, "data/processed/train_network.graphml")

print("Graph saved successfully as 'train_network.graphml'")

Graph saved successfully as 'train_network.graphml'


In [None]:
import pandas as pd
import networkx as nx
import geopandas as gpd
from shapely.geometry import LineString
from keplergl import KeplerGl

# =====================================================
# 📌 1. LOAD GRAPH AND COORDINATES
# =====================================================

# Carica il grafo
G = nx.read_graphml("data/processed/train_network.graphml")

# Creiamo un dizionario con le coordinate delle stazioni
station_coords = df.groupby("stop_name")[["latitude", "longitude"]].first().reset_index()

# =====================================================
# 📌 2. CREATE NODE AND EDGE DATAFRAMES
# =====================================================

# DataFrame per i nodi (stazioni)
nodes_df = pd.DataFrame({
    "station": list(G.nodes),
    "latitude": [station_coords.loc[station_coords["stop_name"] == n, "latitude"].values[0] for n in G.nodes],
    "longitude": [station_coords.loc[station_coords["stop_name"] == n, "longitude"].values[0] for n in G.nodes]
})

# DataFrame per gli archi (tratte ferroviarie) con formato GeoPandas
edges_list = []
for station_a, station_b, attr in G.edges(data=True):
    if station_a in station_coords["stop_name"].values and station_b in station_coords["stop_name"].values:
        lat_a = station_coords.loc[station_coords["stop_name"] == station_a, "latitude"].values[0]
        lon_a = station_coords.loc[station_coords["stop_name"] == station_a, "longitude"].values[0]
        lat_b = station_coords.loc[station_coords["stop_name"] == station_b, "latitude"].values[0]
        lon_b = station_coords.loc[station_coords["stop_name"] == station_b, "longitude"].values[0]

        edges_list.append({
            "start_station": station_a,
            "end_station": station_b,
            "trains": attr["trains"],  # Treni che percorrono questa tratta
            "geometry": LineString([(lon_a, lat_a), (lon_b, lat_b)])  # Linea GeoPandas
        })

# Creiamo il GeoDataFrame per le linee
edges_gdf = gpd.GeoDataFrame(edges_list, geometry="geometry", crs="EPSG:4326")  # EPSG:4326 = coordinate geografiche

# =====================================================
# 📌 3. CREATE KEPLERGL MAP
# =====================================================

# Creiamo la mappa
m = KeplerGl(height=800, data={"Stations": nodes_df, "Train Routes": edges_gdf})

# Modifica il livello di zoom iniziale e la posizione centrale sulla mappa
m.config = {
    "version": "v1",
    "config": {
        "mapState": {
            "bearing": 0,
            "latitude": 42.5,  # Centro Italia
            "longitude": 12.5,
            "pitch": 0,
            "zoom": 6  # Aumentiamo lo zoom iniziale
        }
    }
}

# Salviamo la mappa in formato HTML
m.save_to_html(file_name="data/processed/train_network_kepler.html")

# Mostriamo la mappa
m


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to data/processed/train_network_kepler.html!


KeplerGl(config={'version': 'v1', 'config': {'mapState': {'bearing': 0, 'latitude': 42.5, 'longitude': 12.5, '…

In [53]:
import pandas as pd
import networkx as nx
import folium
import geopandas as gpd
from folium.plugins import MarkerCluster

In [None]:
import pandas as pd
import networkx as nx

# =====================================================
# 📌 1. COMPUTE GRAPH-BASED FEATURES
# =====================================================

# Degree Centrality (how connected a station is)
degree_centrality = nx.degree_centrality(G)

# PageRank (importance of a station in the network)
pagerank = nx.pagerank(G, alpha=0.85)

# Betweenness Centrality (how often a station is used as a transit point)
betweenness = nx.betweenness_centrality(G, normalized=True, weight="weight")

# Identify the main hub (station with highest degree centrality)
main_hub = max(degree_centrality, key=degree_centrality.get)

# Compute shortest path from each station to the hub
shortest_paths = nx.shortest_path_length(G, source=main_hub, weight="weight")

# =====================================================
# 📌 2. CREATE FEATURE DATAFRAME
# =====================================================

# Convert features into a DataFrame
graph_features = pd.DataFrame({
    "stop_name": list(G.nodes),
    "degree_centrality": [degree_centrality[node] for node in G.nodes],
    "pagerank": [pagerank[node] for node in G.nodes],
    "betweenness": [betweenness[node] for node in G.nodes],
    "shortest_path_to_hub": [shortest_paths.get(node, None) for node in G.nodes]
})

# Normalize features
graph_features["degree_centrality"] /= graph_features["degree_centrality"].max()
graph_features["pagerank"] /= graph_features["pagerank"].max()
graph_features["betweenness"] /= graph_features["betweenness"].max()
graph_features["shortest_path_to_hub"] /= graph_features["shortest_path_to_hub"].max()

# =====================================================
# 📌 3. SAVE PROCESSED DATA
# =====================================================

graph_features.to_csv("data/processed/train_graph_features.csv", index=False)

print("✅ Graph features extracted and saved successfully!")


**COMPUTE GRAPH FEATURES**

In [60]:
# Degree Centrality (how connected a station is)
degree_centrality = nx.degree_centrality(G)
df["degree_centrality"] = df["stop_name"].map(degree_centrality)
df["degree_centrality"] /= df["degree_centrality"].max()

In [62]:
# Ensure all edge weights are non-negative
min_delay = min([data["weight"] for _, _, data in G.edges(data=True)])
for u, v, data in G.edges(data=True):
    data["weight"] = (data["weight"] - min_delay) + 1  # Shift to be >= 1

In [None]:
# Remove self-loops
G.remove_edges_from(nx.selfloop_edges(G))

# Compute PageRank on each connected component
pagerank = {}
for component in nx.weakly_connected_components(G):
    subgraph = G.subgraph(component)
    pr = nx.pagerank(subgraph, alpha=0.85)
    pagerank.update(pr)

df["pagerank"] = df["stop_name"].map(pagerank)
df["pagerank"] /= df["pagerank"].max()  # Normalize

In [64]:
# Betweenness Centrality (how often a station is used as a transit point)
betweenness_centrality = nx.betweenness_centrality(G, weight="weight")

df["betweenness_centrality"] = df["stop_name"].map(betweenness_centrality)
df["betweenness_centrality"] /= df["betweenness_centrality"].max()


In [None]:
# Shortest Path to Hub (how far each station is from the most connected one)
main_hub = max(degree_centrality, key=degree_centrality.get)
shortest_paths = nx.shortest_path_length(G, source=main_hub, weight="weight")

df["shortest_path_to_hub"] = df["stop_name"].map(lambda x: shortest_paths.get(x, {}).get(main_hub, None))
df["shortest_path_to_hub"] /= df["shortest_path_to_hub"].max()

In [14]:
df.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay,degree_centrality,pagerank,betweenness_centrality,shortest_path_to_hub
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,,1.0,47.0,-45.0,-0.957447,0,0.721354,0.814516,0.508636,0.043524
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,1.0,1.0,47.0,-45.0,-0.957447,0,0.033854,0.046908,5e-06,0.082242
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,0.0,0.0,47.0,-45.0,-0.957447,0,0.036458,0.050415,7e-06,0.074098
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,2.0,2.0,47.0,-45.0,-0.957447,0,0.072917,0.10778,0.000734,0.074735
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,2.0,2.0,47.0,-45.0,-0.957447,0,0.125,0.17119,0.000529,0.072611


In [24]:
df.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay,degree_centrality,pagerank,betweenness_centrality,shortest_path_to_hub
count,27016230.0,27016230.0,27016228,27016228,27016230.0,27016230.0,24357569,24339834,27016230.0,27016230.0,...,21681180.0,27016230.0,27016230.0,27016230.0,27016230.0,27016230.0,27016230.0,27016230.0,27016230.0,27016220.0
mean,2.650029,2.43478,2024-06-29 21:30:35.706082560,2024-06-29 23:20:18.895126784,2.90515,4.005559,2024-06-29 22:13:22.531706624,2024-06-29 22:42:49.601634048,43.69838,11.22981,...,2.721434,1.964246,109.7198,-109.5046,-0.9915271,0.0556433,0.1256309,0.1480478,0.04891237,0.07140042
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.7204,6.703205,...,-437799.0,-371.0,-3705.0,-4313.0,-30.66667,0.0,0.002604167,0.01120632,0.0,0.0
25%,1.0,-2.0,2024-03-29 06:39:00,2024-03-29 08:26:00,0.0,1.0,2024-03-29 07:08:00,2024-03-29 07:18:00,41.92781,9.187344,...,1.0,1.0,61.0,-125.0,-1.015873,0.0,0.03385417,0.04872916,0.0001292126,0.05435244
50%,1.0,1.0,2024-06-27 07:09:00,2024-06-27 08:57:00,1.0,2.0,2024-06-27 07:36:00,2024-06-27 08:08:00,44.48262,11.14876,...,2.0,2.0,83.0,-83.0,-0.986014,0.0,0.05989583,0.07997885,0.004255855,0.07303609
75%,3.0,3.0,2024-10-01 08:53:00,2024-10-01 10:45:00,4.0,5.0,2024-10-01 10:24:00,2024-10-01 10:52:22.500000,45.48588,12.6184,...,3.0,3.0,126.0,-60.0,-0.9565217,0.0,0.1510417,0.1744808,0.02241674,0.08211253
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00374,18.36933,...,482409.0,351.0,4312.0,3707.0,185.0,1.0,1.0,1.0,1.0,1.0
std,4.693856,9.47785,,,7.472914,7.178877,,,2.2452,2.362223,...,288.6499,1.831921,99.04845,100.8322,0.1366434,0.2292316,0.1631748,0.1715631,0.1361272,0.0262301


**WEATHER DATA INTEGRATION**

In [6]:
df = pd.read_parquet(PROCESSED_PATH / "train_data_fe_kg.parquet")

In [9]:
df.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay,pagerank,degree_centrality,betweenness_centrality,shortest_path_to_hub
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,,1.0,47.0,-45.0,-0.957447,0,0.792493,0.712082,0.500754,0.042304
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,1.0,1.0,47.0,-45.0,-0.957447,0,0.046737,0.033419,0.000492,0.081072
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,0.0,0.0,47.0,-45.0,-0.957447,0,0.048414,0.03856,2.1e-05,0.072917
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,2.0,2.0,47.0,-45.0,-0.957447,0,0.102761,0.071979,0.000682,0.07483
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,2.0,2.0,47.0,-45.0,-0.957447,0,0.171091,0.125964,0.002284,0.072866


In [13]:
from meteostat import Hourly, Stations
from datetime import datetime, timezone

def get_weather_meteostat(lat, lon, timestamp):
    try:
        date = timestamp.to_pydatetime().replace(tzinfo=timezone.utc)

        # Find the nearest weather station
        stations = Stations().nearby(lat, lon).fetch(1)
        if stations.empty:
            return None

        station_id = stations.index[0]

        # Retrieve hourly weather data for the specific date
        data = Hourly(station_id, start=date, end=date).fetch()

        if data.empty:
            return None

        row = data.iloc[0]
        return {
            "temperature": row["temp"],  # Temperature in °C
            "precipitation": row["prcp"],  # Precipitation in mm
            "snow": row["snow"],  # Snow in cm
            "humidity": row["rhum"],  # Humidity in %
            "wind_speed": row["wspd"],  # Wind speed in km/h
            "pressure": row["pres"],  # Atmospheric pressure in hPa
            "weather_code": row["coco"]  # Weather condition code
        }

    except Exception as e:
        print(f"Error retrieving weather data for {lat}, {lon}: {e}")
        return None

In [None]:
df["stop_arrival_time"] = pd.to_datetime(df["stop_arrival_time"], errors="coerce")

weather_data = []

for index, row in tqdm(df.iterrows(), total=len(df)):
    lat, lon, timestamp = row["latitude"], row["longitude"], row["stop_arrival_time"]

    weather = get_weather_meteostat(lat, lon, timestamp)

    if weather:
        weather_data.append(weather)
    else:
        weather_data.append(
            {
                "temperature": None, 
                "precipitation": None, 
                "snow": None,
                "humidity": None, 
                "wind_speed": None, 
                "pressure": None, 
                "weather_code": None})

In [None]:
# Create a DataFrame for weather data and merge it with the main dataset
weather_df = pd.DataFrame(weather_data)

In [None]:
weather_df.head()

In [None]:
weather_df.describe()

In [None]:
df_with_weather = pd.concat([df_with_coordinates, weather_df], axis=1)

In [None]:
df_with_weather.to_parquet(PROCESSED_PATH / "train_data_with_weather.parquet", index=False)

print("Dataset with weather data saved successfully!")

In [None]:
# =====================================================
# 📌 6. DEFINE TARGET VARIABLE (y) FOR ML
# =====================================================

df["next_stop_arrival_delay"] = df.groupby("train_id")["stop_arrival_delay"].shift(-1)


In [None]:
# =====================================================
# 📌 7. FEATURE SELECTION & SAVE FINAL DATASET
# =====================================================

# Drop unnecessary columns (keep timestamps for time-series models)
drop_cols = ["train_id", "train_number", "stop_departure_time", "stop_arrival_time"]
df.drop(columns=drop_cols, inplace=True)

# Save dataset
df.to_csv("data/processed/train_dataset_with_knowledge_graph.csv", index=False)

print("✅ Feature extraction with Knowledge Graph completed! Processed dataset saved.")

**Drop Unnecessary Features**
You dropped:  
```python
drop_cols = ["train_id", "train_number", "scheduled_departure_time", 
             "scheduled_arrival_time", "stop_departure_time", "stop_arrival_time", 
             "total_journey_start"]
```
- **Possible issue**:  
  - If you're using **LSTM or RNN**, you might **need timestamps** (`scheduled_departure_time`) for time-series modeling.  
  - Instead of dropping `train_id`, you could use it for **cross-validation grouping** (e.g., ensuring train sequences stay in the same fold).  

**1. Historical Delay Trends (Inspired by Literature)**
**Idea from [Real-Time Passenger Train Delay Prediction (Amtrak Study)](11)**  
- **Why?** If a train was delayed at `t-1`, it is more likely to be delayed at `t`.  
- **How?** Compute **past mean delays** per train, per station, per hour.  


**2. Weather Data Integration (Inspired by [Dynamic Delay Predictions Study](12))**  
**Why?** Weather (rain, snow) affects train delays.  
**How?** If you have external data, join with historical weather features:  

If `weather_df` contains features like `"rain_mm"`, `"temperature"`, etc., these could be useful.  

**3. Interaction Features**
Instead of raw congestion values, use **ratios** to station congestion:  
```python
df["relative_congestion"] = df["station_traffic"] / df["station_traffic"].max()
```
This normalizes congestion across different regions.  
