RICAVARE I DATI GEOGRAFICI

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import googlemaps
import time
from tqdm import tqdm
import plotly.express as px
from plotly.subplots import make_subplots
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from meteostat import Hourly, Stations
from datetime import datetime, timezone
import networkx as nx

In [99]:
load_dotenv()
api_key = os.getenv('GOOGLE_API_KEY')

In [100]:
gmaps = googlemaps.Client(key=api_key)

In [2]:
INTERIM_PATH = Path("data/interim")

In [3]:
PROCESSED_PATH = Path("data/processed")

In [10]:
df = pd.read_parquet(INTERIM_PATH / "train_data_cleaned.parquet")

In [102]:
print(df.shape)

(27973425, 15)


In [103]:
unique_stops = df['stop_name'].unique()
print(f"Number of unique stops: {len(unique_stops)}")

Number of unique stops: 2357


In [104]:
def get_coordinates_from_google(stop_name):
    try:
        geocode_result = gmaps.geocode(stop_name, region="it")
        if geocode_result:
            lat = geocode_result[0]['geometry']['location']['lat']
            lon = geocode_result[0]['geometry']['location']['lng']
            return lat, lon
        return None, None
    except Exception as e:
        print(f"Error retrieving for {stop_name}: {e}")
        return None, None

In [105]:
coordinates_list_google = []

for stop in tqdm(unique_stops, desc="Retrieving coordinates", unit="stop"):
    lat, lon = get_coordinates_from_google(stop)
    coordinates_list_google.append({'stop_name': stop, 'latitude': lat, 'longitude': lon})
    
    # Pause between requests to avoid exceeding the request limit
    # time.sleep(1)

Retrieving coordinates: 100%|██████████| 2357/2357 [03:34<00:00, 10.98stop/s]


In [106]:
coordinates_df_google = pd.DataFrame(coordinates_list_google)
print(coordinates_df_google.head())

               stop_name   latitude  longitude
0           BOLOGNA C.LE  44.505907  11.343369
1    S.LAZZARO DI SAVENA  44.468974  11.421816
2     OZZANO DELL'EMILIA  44.446347  11.472402
3  CASTEL S.PIETRO TERME  44.399624  11.589728
4                  IMOLA  44.351305  11.712926


Create map for train stops distribution using density_mapbox

In [107]:
fig_stops = px.density_map(
    coordinates_df_google,
    lat='latitude',
    lon='longitude',
    hover_name="stop_name",
    title="Train Stops Distribution",
    radius=10,
    opacity=0.6,
    zoom=6,
    map_style="carto-positron")
fig_stops.update_layout(height=900)
fig_stops.update_layout(width=1200)

fig_stops.show()

Ci sono svariati errori...

Proviamo con Nominatim per vedere se è più preciso

In [108]:
geolocator = Nominatim(user_agent="train_stops_locator")

def get_coordinates_from_nominatim(stop_name):
    try:
        location = geolocator.geocode(stop_name, country_codes="it", timeout=10)
        if location:
            return location.latitude, location.longitude
        return None, None
    except GeocoderTimedOut:
        print(f"Timeout for {stop_name}")
        return None, None
    except Exception as e:
        print(f"Error retrieving for {stop_name}: {e}")
        return None, None


In [109]:
coordinates_list_nominatim = []

for stop in tqdm(unique_stops, desc="Retrieving coordinates", unit="stop"):
    lat, lon = get_coordinates_from_nominatim(stop)
    coordinates_list_nominatim.append({'stop_name': stop, 'latitude': lat, 'longitude': lon})
    
    # Per evitare di sovraccaricare Nominatim (rispettare le politiche di utilizzo)
    # time.sleep(1)


Retrieving coordinates: 100%|██████████| 2357/2357 [39:18<00:00,  1.00s/stop]


In [110]:
coordinates_df_nominatim = pd.DataFrame(coordinates_list_nominatim)     # Nominatim
print(coordinates_df_nominatim.head())

               stop_name   latitude  longitude
0           BOLOGNA C.LE  44.505878  11.343343
1    S.LAZZARO DI SAVENA  44.471567  11.404859
2     OZZANO DELL'EMILIA  44.444980  11.476050
3  CASTEL S.PIETRO TERME  44.401270  11.585499
4                  IMOLA  44.353515  11.714123


In [111]:
INTERIM_PATH = Path("data/interim")
INTERIM_PATH.mkdir(parents=True, exist_ok=True)

coordinates_df_nominatim.to_parquet(INTERIM_PATH / "coordinates_df_nominatim.parquet", index=False)

print("Coordinates datasets successfully saved in 'data/interim'")


Coordinates datasets successfully saved in 'data/interim'


In [None]:
# coordinates_df_nominatim = pd.read_parquet(INTERIM_PATH / "coordinates_df_nominatim.parquet")

In [112]:
fig_stops = px.density_map(
    coordinates_df_nominatim,
    lat='latitude',
    lon='longitude',
    hover_name="stop_name",
    title="Train Stops Distribution",
    radius=10,
    opacity=0.6,
    zoom=6,
    map_style="carto-positron")
fig_stops.update_layout(height=900)
fig_stops.update_layout(width=1200)

fig_stops.show()

Molto più preciso, mergiamo il dataset delle coordinate con il dataset originale e lo salviamo 

In [12]:
df_with_coordinates = pd.merge(df, coordinates_df_nominatim, on='stop_name', how='left')

In [13]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,is_terminal_stop,latitude,longitude
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,0.0,1.0,NaT,2024-09-30 22:38:00,True,44.505878,11.343343
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,2.0,2.0,2024-09-30 22:45:00,2024-09-30 22:46:00,False,44.471567,11.404859
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,3.0,2.0,2024-09-30 22:51:00,2024-09-30 22:52:00,False,44.44498,11.47605
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,3.0,4.0,2024-09-30 22:58:00,2024-09-30 22:59:00,False,44.40127,11.585499
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,3.0,4.0,2024-09-30 23:07:00,2024-09-30 23:08:00,False,44.353515,11.714123


In [14]:
print(df_with_coordinates.columns)

Index(['train_id', 'train_number', 'departure_station',
       'train_departure_delay', 'arrival_station', 'train_arrival_delay',
       'train_class', 'scheduled_departure_time', 'scheduled_arrival_time',
       'stop_name', 'stop_arrival_delay', 'stop_departure_delay',
       'stop_arrival_time', 'stop_departure_time', 'is_terminal_stop',
       'latitude', 'longitude'],
      dtype='object')


In [24]:
# Calculation and visualization of missing values
missing_values = df_with_coordinates.isnull().sum()
missing_percentage = (missing_values / len(df_with_coordinates)) * 100

missing_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Percentage": missing_percentage.round(2)
})

missing_summary = missing_summary[missing_summary["Missing Values"] > 0]
print("\n Null values in the final dataset:")
print(missing_summary)


 Null values in the final dataset:
                         Missing Values  Percentage
stop_arrival_time               2756872        9.86
stop_departure_time             2774884        9.92
latitude                         957197        3.42
longitude                        957197        3.42
rolling_arrival_delay           8301316       29.68
rolling_departure_delay         8301316       29.68
planned_stop_duration           5531747       19.78
actual_stop_duration            5531747       19.78


In [116]:
df_with_coordinates.to_parquet(INTERIM_PATH / "train_data_with_coordinates.parquet", index=False)

print("Datasets successfully saved in 'data/interim'")

Datasets successfully saved in 'data/interim'


passiamo alla mappa dei delay

In [117]:
stop_delays = df_with_coordinates.groupby(["stop_name", "latitude", "longitude"])["stop_arrival_delay"].mean().reset_index()

fig_delays = px.density_map(
    stop_delays, 
    lat="latitude", 
    lon="longitude", 
    hover_name="stop_name", 
    title="Average Train Delay Distribution by Stop", 
    radius=10, 
    opacity=0.6, 
    zoom=6,
    map_style="carto-positron",
)
fig_delays.update_layout(height=900)
fig_delays.update_layout(width=1200)

fig_delays.show()

**Time-Based Features**

In [4]:
df_with_coordinates = pd.read_parquet(INTERIM_PATH / "train_data_with_coordinates.parquet")

In [5]:
df_with_coordinates["hour"] = df_with_coordinates["scheduled_departure_time"].dt.hour
df_with_coordinates["day_of_week"] = df_with_coordinates["scheduled_departure_time"].dt.dayofweek  # Monday=0, Sunday=6
df_with_coordinates["is_weekend"] = df_with_coordinates["day_of_week"].isin([5, 6]).astype(int)
df_with_coordinates["month"] = df_with_coordinates["scheduled_departure_time"].dt.month

# Define rush hours (e.g., 7-9 AM, 5-7 PM)
df_with_coordinates["is_rush_hour"] = df_with_coordinates["hour"].isin([7, 8, 9, 17, 18, 19]).astype(int)

In [6]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,stop_arrival_time,stop_departure_time,is_terminal_stop,latitude,longitude,hour,day_of_week,is_weekend,month,is_rush_hour
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,NaT,2024-09-30 22:38:00,True,44.505878,11.343343,22,0,0,9,0
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,2024-09-30 22:45:00,2024-09-30 22:46:00,False,44.471567,11.404859,22,0,0,9,0
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,2024-09-30 22:51:00,2024-09-30 22:52:00,False,44.44498,11.47605,22,0,0,9,0
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,2024-09-30 22:58:00,2024-09-30 22:59:00,False,44.40127,11.585499,22,0,0,9,0
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,2024-09-30 23:07:00,2024-09-30 23:08:00,False,44.353515,11.714123,22,0,0,9,0


**Station-Specific Features**

In [7]:
# Count how often each station appears (proxy for congestion)
station_counts = df_with_coordinates["stop_name"].value_counts()
df_with_coordinates["station_traffic"] = df_with_coordinates["stop_name"].map(station_counts)

# Define high-traffic stations (above median frequency)
median_traffic = df_with_coordinates["station_traffic"].median()
df_with_coordinates["is_high_traffic_station"] = (df_with_coordinates["station_traffic"] >= median_traffic).astype(int)

In [19]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,is_terminal_stop,latitude,longitude,hour,day_of_week,is_weekend,month,is_rush_hour,station_traffic,is_high_traffic_station
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,True,44.505878,11.343343,22,0,0,9,0,161005,1
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,False,44.471567,11.404859,22,0,0,9,0,14336,0
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,False,44.44498,11.47605,22,0,0,9,0,14878,0
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,False,44.40127,11.585499,22,0,0,9,0,27351,1
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,False,44.353515,11.714123,22,0,0,9,0,42396,1


**Delay Propagation Features**

In [8]:
# Compute difference between arrival and departure delay
df_with_coordinates["delay_change"] = df_with_coordinates["stop_arrival_delay"] - df_with_coordinates["stop_departure_delay"]

# Flag increasing delay
df_with_coordinates["is_delay_increasing"] = (df_with_coordinates["delay_change"] > 0).astype(int)

# Rolling delay average (captures delay trends within a train's route)
df_with_coordinates["rolling_arrival_delay"] = df_with_coordinates.groupby("train_id")["stop_arrival_delay"].shift(1).rolling(3).mean()
df_with_coordinates["rolling_departure_delay"] = df_with_coordinates.groupby("train_id")["stop_departure_delay"].shift(1).rolling(3).mean()

In [21]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,day_of_week,is_weekend,month,is_rush_hour,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,0,0,9,0,161005,1,-1.0,0,,
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,0,0,9,0,14336,0,0.0,0,,
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,0,0,9,0,14878,0,1.0,1,,
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,0,0,9,0,27351,1,-1.0,0,1.666667,1.666667
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,0,0,9,0,42396,1,-1.0,0,2.666667,2.666667


**Historical Delay Trends**

In [9]:
df_with_coordinates["historical_avg_delay"] = df_with_coordinates.groupby(["stop_name", "hour"])["stop_arrival_delay"].transform("mean")

In [23]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,is_weekend,month,is_rush_hour,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,0,9,0,161005,1,-1.0,0,,,0.297568
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,0,9,0,14336,0,0.0,0,,,11.009852
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,0,9,0,14878,0,1.0,1,,,11.737624
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,0,9,0,27351,1,-1.0,0,1.666667,1.666667,10.546798
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,0,9,0,42396,1,-1.0,0,2.666667,2.666667,9.186275


**Length of stay by individual station**

In [10]:
# Planned stop duration
df_with_coordinates["planned_stop_duration"] = (df_with_coordinates["stop_departure_time"] - df_with_coordinates["stop_arrival_time"]).dt.total_seconds() / 60

# Convert delays (minutes) to timedelta
df_with_coordinates["stop_departure_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["stop_departure_delay"], unit="m")
df_with_coordinates["stop_arrival_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["stop_arrival_delay"], unit="m")

df_with_coordinates["actual_stop_duration"] = (
    (df_with_coordinates["stop_departure_time"] + df_with_coordinates["stop_departure_delay_timedelta"]) - 
    (df_with_coordinates["stop_arrival_time"] + df_with_coordinates["stop_arrival_delay_timedelta"])
).dt.total_seconds() / 60

df_with_coordinates["planned_vs_actual_stop_duration_ratio"] = df_with_coordinates["actual_stop_duration"] / df_with_coordinates["planned_stop_duration"]

# Handle division by zero or NaN values (avoid infinities)
df_with_coordinates["planned_vs_actual_stop_duration_ratio"] = df_with_coordinates["planned_vs_actual_stop_duration_ratio"].replace([np.inf, -np.inf], np.nan)
df_with_coordinates["planned_vs_actual_stop_duration_ratio"] = df_with_coordinates["planned_vs_actual_stop_duration_ratio"].fillna(1)  # Default to 1 when missing data

# Drop temporary columns
df_with_coordinates = df_with_coordinates.drop(columns=["stop_departure_delay_timedelta", "stop_arrival_delay_timedelta"])

In [25]:
df_with_coordinates.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,161005,1,-1.0,0,,,0.297568,,,1.0
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,14336,0,0.0,0,,,11.009852,1.0,1.0,1.0
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,14878,0,1.0,1,,,11.737624,1.0,0.0,0.0
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,27351,1,-1.0,0,1.666667,1.666667,10.546798,1.0,2.0,2.0
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,42396,1,-1.0,0,2.666667,2.666667,9.186275,1.0,2.0,2.0


In [26]:
df_with_coordinates.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,station_traffic,is_high_traffic_station,delay_change,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio
count,27973420.0,27973420.0,27973425,27973425,27973420.0,27973420.0,25216553,25198541,27016230.0,27016230.0,...,27973420.0,27973420.0,27973420.0,27973420.0,19672110.0,19672110.0,27973420.0,22441680.0,22441680.0,27973420.0
mean,2.640689,2.445902,2024-06-29 19:12:30.748817152,2024-06-29 21:01:46.929380096,2.90757,3.99469,2024-06-29 20:09:50.362128384,2024-06-29 20:04:28.495725568,43.69838,11.22981,...,41144.55,0.5004196,-1.08712,0.1273379,3.262654,4.613763,2.90757,1.396357,2.699687,1.962036
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.7204,6.703205,...,1.0,0.0,-278.0,0.0,-10.0,-10.0,-10.0,-437799.0,-437799.0,-371.0
25%,1.0,-1.0,2024-03-29 06:05:00,2024-03-29 07:51:00,0.0,1.0,2024-03-29 06:38:00,2024-03-29 06:31:00,41.92781,9.187344,...,11672.0,0.0,-2.0,0.0,0.0,1.333333,1.638084,1.0,1.0,1.0
50%,1.0,1.0,2024-06-27 05:20:00,2024-06-27 07:10:00,1.0,2.0,2024-06-27 05:56:00,2024-06-27 05:50:00,44.48262,11.14876,...,23942.0,1.0,-1.0,0.0,1.666667,2.666667,2.703022,1.0,2.0,2.0
75%,3.0,3.0,2024-10-01 05:31:00,2024-10-01 07:20:00,4.0,5.0,2024-10-01 07:18:00,2024-10-01 07:11:00,45.48588,12.6184,...,51728.0,1.0,0.0,0.0,4.0,5.333333,3.953846,1.0,3.0,3.0
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00374,18.36933,...,255991.0,1.0,300.0,1.0,296.6667,297.0,295.0,482408.0,482409.0,351.0
std,4.677534,9.453813,,,7.457632,7.174272,,,2.2452,2.362223,...,44074.1,0.4999998,3.871372,0.3333511,7.154554,7.329608,1.95351,283.7087,283.7173,1.830868


**Total Travel Duration Features**

In [11]:
# Planned travel duration (entire journey)
df_with_coordinates["scheduled_total_duration"] = (df_with_coordinates["scheduled_arrival_time"] - df_with_coordinates["scheduled_departure_time"]).dt.total_seconds() / 60

# Convert delays (minutes) to timedelta
df_with_coordinates["train_departure_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["train_departure_delay"], unit="m")
df_with_coordinates["train_arrival_delay_timedelta"] = pd.to_timedelta(df_with_coordinates["train_arrival_delay"], unit="m")

df_with_coordinates["actual_total_duration"] = (
    (df_with_coordinates["scheduled_departure_time"] + df_with_coordinates["train_departure_delay_timedelta"]) - 
    (df_with_coordinates["scheduled_arrival_time"] + df_with_coordinates["train_arrival_delay_timedelta"])
).dt.total_seconds() / 60

# Ratio between actual and planned total duration
df_with_coordinates["planned_vs_actual_total_ratio"] = df_with_coordinates["actual_total_duration"] / df_with_coordinates["scheduled_total_duration"]

# Handle division by zero or NaN values
df_with_coordinates["planned_vs_actual_total_ratio"] = df_with_coordinates["planned_vs_actual_total_ratio"].replace([np.inf, -np.inf], np.nan)
df_with_coordinates["planned_vs_actual_total_ratio"] = df_with_coordinates["planned_vs_actual_total_ratio"].fillna(1)  # Default to 1 when missing data

# Drop temporary columns
df_with_coordinates = df_with_coordinates.drop(columns=["train_departure_delay_timedelta", "train_arrival_delay_timedelta"])

In [28]:
df_with_coordinates.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,is_delay_increasing,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio
count,27973420.0,27973420.0,27973425,27973425,27973420.0,27973420.0,25216553,25198541,27016230.0,27016230.0,...,27973420.0,19672110.0,19672110.0,27973420.0,22441680.0,22441680.0,27973420.0,27973420.0,27973420.0,27973420.0
mean,2.640689,2.445902,2024-06-29 19:12:30.748817152,2024-06-29 21:01:46.929380096,2.90757,3.99469,2024-06-29 20:09:50.362128384,2024-06-29 20:04:28.495725568,43.69838,11.22981,...,0.1273379,3.262654,4.613763,2.90757,1.396357,2.699687,1.962036,109.2697,-109.0749,-0.9919872
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.7204,6.703205,...,0.0,-10.0,-10.0,-10.0,-437799.0,-437799.0,-371.0,-3705.0,-4313.0,-30.66667
25%,1.0,-1.0,2024-03-29 06:05:00,2024-03-29 07:51:00,0.0,1.0,2024-03-29 06:38:00,2024-03-29 06:31:00,41.92781,9.187344,...,0.0,0.0,1.333333,1.638084,1.0,1.0,1.0,61.0,-124.0,-1.016129
50%,1.0,1.0,2024-06-27 05:20:00,2024-06-27 07:10:00,1.0,2.0,2024-06-27 05:56:00,2024-06-27 05:50:00,44.48262,11.14876,...,0.0,1.666667,2.666667,2.703022,1.0,2.0,2.0,83.0,-83.0,-0.9861751
75%,3.0,3.0,2024-10-01 05:31:00,2024-10-01 07:20:00,4.0,5.0,2024-10-01 07:18:00,2024-10-01 07:11:00,45.48588,12.6184,...,0.0,4.0,5.333333,3.953846,1.0,3.0,3.0,126.0,-60.0,-0.9571429
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00374,18.36933,...,1.0,296.6667,297.0,295.0,482408.0,482409.0,351.0,4312.0,3707.0,185.0
std,4.677534,9.453813,,,7.457632,7.174272,,,2.2452,2.362223,...,0.3333511,7.154554,7.329608,1.95351,283.7087,283.7173,1.830868,99.03627,100.8161,0.1353091


**Extreme Delay Flag**

In [12]:
# Define extreme delay threshold (e.g., top 5% of delays)
extreme_delay_threshold = df_with_coordinates["stop_arrival_delay"].quantile(0.95)
df_with_coordinates["is_extreme_delay"] = (df_with_coordinates["stop_arrival_delay"] >= extreme_delay_threshold).astype(int)

In [30]:
df_with_coordinates.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay
count,27973420.0,27973420.0,27973425,27973425,27973420.0,27973420.0,25216553,25198541,27016230.0,27016230.0,...,19672110.0,19672110.0,27973420.0,22441680.0,22441680.0,27973420.0,27973420.0,27973420.0,27973420.0,27973420.0
mean,2.640689,2.445902,2024-06-29 19:12:30.748817152,2024-06-29 21:01:46.929380096,2.90757,3.99469,2024-06-29 20:09:50.362128384,2024-06-29 20:04:28.495725568,43.69838,11.22981,...,3.262654,4.613763,2.90757,1.396357,2.699687,1.962036,109.2697,-109.0749,-0.9919872,0.05555891
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.7204,6.703205,...,-10.0,-10.0,-10.0,-437799.0,-437799.0,-371.0,-3705.0,-4313.0,-30.66667,0.0
25%,1.0,-1.0,2024-03-29 06:05:00,2024-03-29 07:51:00,0.0,1.0,2024-03-29 06:38:00,2024-03-29 06:31:00,41.92781,9.187344,...,0.0,1.333333,1.638084,1.0,1.0,1.0,61.0,-124.0,-1.016129,0.0
50%,1.0,1.0,2024-06-27 05:20:00,2024-06-27 07:10:00,1.0,2.0,2024-06-27 05:56:00,2024-06-27 05:50:00,44.48262,11.14876,...,1.666667,2.666667,2.703022,1.0,2.0,2.0,83.0,-83.0,-0.9861751,0.0
75%,3.0,3.0,2024-10-01 05:31:00,2024-10-01 07:20:00,4.0,5.0,2024-10-01 07:18:00,2024-10-01 07:11:00,45.48588,12.6184,...,4.0,5.333333,3.953846,1.0,3.0,3.0,126.0,-60.0,-0.9571429,0.0
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00374,18.36933,...,296.6667,297.0,295.0,482408.0,482409.0,351.0,4312.0,3707.0,185.0,1.0
std,4.677534,9.453813,,,7.457632,7.174272,,,2.2452,2.362223,...,7.154554,7.329608,1.95351,283.7087,283.7173,1.830868,99.03627,100.8161,0.1353091,0.2290679


In [31]:
df_with_coordinates.to_parquet(INTERIM_PATH / "train_data_fe.parquet", index=False)

print("Datasets successfully saved in 'data/interim'")

Datasets successfully saved in 'data/interim'


**BUILD KNOWLEDGE GRAPH**

Struttura del Grafo
Nodi: Ogni stazione ferroviaria è un nodo del grafo.

Archi: Una connessione tra due nodi esiste se un treno viaggia tra quelle stazioni.

Pesi degli archi: Possiamo assegnare come peso il ritardo medio accumulato su quella tratta.

Feature Estratte dal Grafo
Dopo aver costruito il grafo, possiamo calcolare:
- Degree Centrality → Misura la connettività di una stazione. Stazioni più connesse potrebbero essere più soggette a congestione.
- PageRank → Misura l'importanza della stazione nella rete ferroviaria (es. snodi principali come Milano Centrale avranno valori alti).
- Betweenness Centrality → Quante volte una stazione è "di passaggio" nei percorsi più brevi tra due stazioni?
- Shortest Path to Hub → Quanto una stazione è lontana dalla più importante del sistema?

In [13]:
df = pd.read_parquet(INTERIM_PATH / "train_data_fe.parquet")

In [45]:
df.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,rolling_arrival_delay,rolling_departure_delay,historical_avg_delay,planned_stop_duration,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay
count,27973420.0,27973420.0,27973425,27973425,27973420.0,27973420.0,25216553,25198541,27016230.0,27016230.0,...,19672110.0,19672110.0,27973420.0,22441680.0,22441680.0,27973420.0,27973420.0,27973420.0,27973420.0,27973420.0
mean,2.640689,2.445902,2024-06-29 19:12:30.748817152,2024-06-29 21:01:46.929380096,2.90757,3.99469,2024-06-29 20:09:50.362128384,2024-06-29 20:04:28.495725568,43.69838,11.22981,...,3.262654,4.613763,2.90757,1.396357,2.699687,1.962036,109.2697,-109.0749,-0.9919872,0.05555891
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.7204,6.703205,...,-10.0,-10.0,-10.0,-437799.0,-437799.0,-371.0,-3705.0,-4313.0,-30.66667,0.0
25%,1.0,-1.0,2024-03-29 06:05:00,2024-03-29 07:51:00,0.0,1.0,2024-03-29 06:38:00,2024-03-29 06:31:00,41.92781,9.187344,...,0.0,1.333333,1.638084,1.0,1.0,1.0,61.0,-124.0,-1.016129,0.0
50%,1.0,1.0,2024-06-27 05:20:00,2024-06-27 07:10:00,1.0,2.0,2024-06-27 05:56:00,2024-06-27 05:50:00,44.48262,11.14876,...,1.666667,2.666667,2.703022,1.0,2.0,2.0,83.0,-83.0,-0.9861751,0.0
75%,3.0,3.0,2024-10-01 05:31:00,2024-10-01 07:20:00,4.0,5.0,2024-10-01 07:18:00,2024-10-01 07:11:00,45.48588,12.6184,...,4.0,5.333333,3.953846,1.0,3.0,3.0,126.0,-60.0,-0.9571429,0.0
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00374,18.36933,...,296.6667,297.0,295.0,482408.0,482409.0,351.0,4312.0,3707.0,185.0,1.0
std,4.677534,9.453813,,,7.457632,7.174272,,,2.2452,2.362223,...,7.154554,7.329608,1.95351,283.7087,283.7173,1.830868,99.03627,100.8161,0.1353091,0.2290679


In [14]:
# Sort by train number and departure time to get correct stop order
df_sorted = df.sort_values(by=["train_number", "scheduled_departure_time"])

In [15]:
# Create a directed graph
G = nx.DiGraph()

In [16]:
# Iterate over each train route
for train_id, group in df_sorted.groupby("train_number"):
    stops = group["stop_name"].tolist()  # Get list of stops in correct order
    delays = group["stop_arrival_delay"].tolist()  # Corresponding delays

    # Add edges between consecutive stops
    for i in range(len(stops) - 1):
        dep_station = stops[i]
        arr_station = stops[i + 1]
        delay = delays[i + 1]  # Arrival delay at next station

        if G.has_edge(dep_station, arr_station):
            # Average the delay if multiple trains exist on this route
            G[dep_station][arr_station]['weight'] = (G[dep_station][arr_station]['weight'] + delay) / 2
        else:
            G.add_edge(dep_station, arr_station, weight=delay)

print(f"✅ Corrected Graph built with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

✅ Corrected Graph built with 2357 nodes and 19832 edges.


**COMPUTE GRAPH FEATURES**

In [17]:
# Degree Centrality (how connected a station is)
degree_centrality = nx.degree_centrality(G)
df["degree_centrality"] = df["stop_name"].map(degree_centrality)
df["degree_centrality"] /= df["degree_centrality"].max()

In [18]:
# Ensure all edge weights are non-negative
min_delay = min([data["weight"] for _, _, data in G.edges(data=True)])
for u, v, data in G.edges(data=True):
    data["weight"] = (data["weight"] - min_delay) + 1  # Shift to be >= 1

# Remove self-loops
G.remove_edges_from(nx.selfloop_edges(G))

# Compute PageRank on each connected component
pagerank = {}
for component in nx.weakly_connected_components(G):
    subgraph = G.subgraph(component)
    pr = nx.pagerank(subgraph, alpha=0.85, max_iter=1000)
    pagerank.update(pr)

df["pagerank"] = df["stop_name"].map(pagerank)
df["pagerank"] /= df["pagerank"].max()  # Normalize

In [19]:
# Betweenness Centrality (how often a station is used as a transit point)
betweenness_centrality = nx.betweenness_centrality(G, weight="weight")

df["betweenness_centrality"] = df["stop_name"].map(betweenness_centrality)
df["betweenness_centrality"] /= df["betweenness_centrality"].max()


In [20]:
# Shortest Path to Hub (how far each station is from the most connected one)
main_hub = max(degree_centrality, key=degree_centrality.get)
shortest_paths = dict(nx.shortest_path_length(G, weight="weight"))

df["shortest_path_to_hub"] = df["stop_name"].map(lambda x: shortest_paths.get(x, {}).get(main_hub, None))
df["shortest_path_to_hub"] /= df["shortest_path_to_hub"].max()

In [55]:
df.head()

Unnamed: 0,train_id,train_number,departure_station,train_departure_delay,arrival_station,train_arrival_delay,train_class,scheduled_departure_time,scheduled_arrival_time,stop_name,...,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay,pagerank,degree_centrality,betweenness_centrality,shortest_path_to_hub
0,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,...,,1.0,47.0,-45.0,-0.957447,0,0.792493,0.712082,0.500754,0.042304
1,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,...,1.0,1.0,47.0,-45.0,-0.957447,0,0.046737,0.033419,0.000492,0.081072
2,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,...,0.0,0.0,47.0,-45.0,-0.957447,0,0.048414,0.03856,2.1e-05,0.072917
3,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,...,2.0,2.0,47.0,-45.0,-0.957447,0,0.102761,0.071979,0.000682,0.07483
4,17431-1727735880-Qk9MT0dOQSBDLkxF,17431,BOLOGNA C.LE,1.0,FAENZA,-1.0,REG,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,...,2.0,2.0,47.0,-45.0,-0.957447,0,0.171091,0.125964,0.002284,0.072866


In [57]:
df.describe()

Unnamed: 0,train_departure_delay,train_arrival_delay,scheduled_departure_time,scheduled_arrival_time,stop_arrival_delay,stop_departure_delay,stop_arrival_time,stop_departure_time,latitude,longitude,...,actual_stop_duration,planned_vs_actual_stop_duration_ratio,scheduled_total_duration,actual_total_duration,planned_vs_actual_total_ratio,is_extreme_delay,pagerank,degree_centrality,betweenness_centrality,shortest_path_to_hub
count,27973420.0,27973420.0,27973425,27973425,27973420.0,27973420.0,25216553,25198541,27016230.0,27016230.0,...,22441680.0,27973420.0,27973420.0,27973420.0,27973420.0,27973420.0,27973420.0,27973420.0,27973420.0,27973420.0
mean,2.640689,2.445902,2024-06-29 19:12:30.748817152,2024-06-29 21:01:46.929380096,2.90757,3.99469,2024-06-29 20:09:50.362128384,2024-06-29 20:04:28.495725568,43.69838,11.22981,...,2.699687,1.962036,109.2697,-109.0749,-0.9919872,0.05555891,0.1390824,0.121741,0.04691481,0.07184706
min,-9.0,-10.0,2023-12-31 23:02:00,2023-12-31 23:28:00,-10.0,-10.0,2023-12-31 23:13:00,2023-12-31 23:02:00,36.7204,6.703205,...,-437799.0,-371.0,-3705.0,-4313.0,-30.66667,0.0,0.01020242,0.002570694,0.0,0.0
25%,1.0,-1.0,2024-03-29 06:05:00,2024-03-29 07:51:00,0.0,1.0,2024-03-29 06:38:00,2024-03-29 06:31:00,41.92781,9.187344,...,1.0,1.0,61.0,-124.0,-1.016129,0.0,0.04590678,0.03341902,0.0001550106,0.05442239
50%,1.0,1.0,2024-06-27 05:20:00,2024-06-27 07:10:00,1.0,2.0,2024-06-27 05:56:00,2024-06-27 05:50:00,44.48262,11.14876,...,2.0,2.0,83.0,-83.0,-0.9861751,0.0,0.07488127,0.05912596,0.004030275,0.07366231
75%,3.0,3.0,2024-10-01 05:31:00,2024-10-01 07:20:00,4.0,5.0,2024-10-01 07:18:00,2024-10-01 07:11:00,45.48588,12.6184,...,3.0,3.0,126.0,-60.0,-0.9571429,0.0,0.1623505,0.1336761,0.02136336,0.08484347
max,289.0,300.0,2024-12-31 22:55:00,2025-01-01 13:40:00,300.0,299.0,2025-01-01 13:40:00,2025-01-01 13:34:00,47.00374,18.36933,...,482409.0,351.0,4312.0,3707.0,185.0,1.0,1.0,1.0,1.0,1.0
std,4.677534,9.453813,,,7.457632,7.174272,,,2.2452,2.362223,...,283.7173,1.830868,99.03627,100.8161,0.1353091,0.2290679,0.1654614,0.1599821,0.133578,0.0265453


In [58]:
df.to_parquet(PROCESSED_PATH / "train_data_fe_kg.parquet", index=False)

print("Datasets successfully saved in 'data/interim'")

Datasets successfully saved in 'data/interim'


In [25]:
df = df.dropna(subset=["latitude", "longitude"])

In [28]:
import folium

# Keep only the top 30% most connected stations
high_traffic_threshold = df["degree_centrality"].quantile(0.70)  
df_filtered = df[df["degree_centrality"] >= high_traffic_threshold]

df_filtered = df_filtered.dropna(subset=["latitude", "longitude"])

# Reduce edges: only keep connections between high-traffic stations
filtered_edges = []
for dep_station, arr_station in G.edges():
    if dep_station in df_filtered["stop_name"].values and arr_station in df_filtered["stop_name"].values:
        filtered_edges.append((dep_station, arr_station))

print(f"Reduced network: {len(df_filtered)} stations and {len(filtered_edges)} connections.")

Reduced network: 8243207 stations and 2521 connections.


In [29]:
map_center = [42.5, 12.5]  # Approximate center of Italya
m = folium.Map(location=map_center, zoom_start=6, tiles="cartodbpositron")

for _, row in df_filtered.iterrows():
    station_name = row["stop_name"]
    lat, lon = row["latitude"], row["longitude"]
    
    folium.CircleMarker(
        location=(lat, lon),
        radius=4,
        color="blue",
        fill=True,
        fill_color="blue",
        fill_opacity=0.6,
        popup=station_name
    ).add_to(m)

for dep_station, arr_station in filtered_edges:
    dep_row = df_filtered[df_filtered["stop_name"] == dep_station]
    arr_row = df_filtered[df_filtered["stop_name"] == arr_station]

    if not dep_row.empty and not arr_row.empty:
        lat1, lon1 = dep_row.iloc[0]["latitude"], dep_row.iloc[0]["longitude"]
        lat2, lon2 = arr_row.iloc[0]["latitude"], arr_row.iloc[0]["longitude"]

        folium.PolyLine(
            locations=[(lat1, lon1), (lat2, lon2)],
            color="gray",
            weight=1,  # Thin lines for lightweight visualization
            opacity=0.5
        ).add_to(m)

m

KeyboardInterrupt: 

**WEATHER DATA INTEGRATION**

In [None]:
from meteostat import Hourly, Stations
from datetime import datetime, timezone

def get_weather_meteostat(lat, lon, timestamp):
    try:
        # Convert timestamp to datetime format
        date = datetime.utcfromtimestamp(timestamp).replace(tzinfo=timezone.utc)

        # Find the nearest weather station
        stations = Stations().nearby(lat, lon).fetch(1)
        if stations.empty:
            return None

        station_id = stations.index[0]

        # Retrieve hourly weather data for the specific date
        data = Hourly(station_id, start=date, end=date).fetch()

        if data.empty:
            return None

        row = data.iloc[0]
        return {
            "temperature": row["temp"],  # Temperature in °C
            "precipitation": row["prcp"],  # Precipitation in mm
            "snow": row["snow"],  # Snow in cm
            "humidity": row["rhum"],  # Humidity in %
            "wind_speed": row["wspd"],  # Wind speed in km/h
            "pressure": row["pres"],  # Atmospheric pressure in hPa
            "weather_code": row["coco"]  # Weather condition code
        }

    except Exception as e:
        print(f"Error retrieving weather data for {lat}, {lon}: {e}")
        return None

In [None]:
weather_data = []

for index, row in df_with_coordinates.iterrows():
    lat, lon, timestamp = row["latitude"], row["longitude"], row["arrival_timestamp"]

    weather = get_weather_meteostat(lat, lon, timestamp)

    if weather:
        weather_data.append(weather)
    else:
        weather_data.append(
            {
                "temperature": None, 
                "precipitation": None, 
                "snow": None,
                "humidity": None, 
                "wind_speed": None, 
                "pressure": None, 
                "weather_code": None})

# Create a DataFrame for weather data and merge it with the main dataset
weather_df = pd.DataFrame(weather_data)

In [None]:
weather_df.head()

In [None]:
weather_df.describe()

In [None]:
df_with_weather = pd.concat([df_with_coordinates, weather_df], axis=1)

In [None]:
df_with_weather.to_parquet(PROCESSED_PATH / "train_data_with_weather.parquet", index=False)

print("Dataset with weather data saved successfully!")

In [None]:
# =====================================================
# 📌 6. DEFINE TARGET VARIABLE (y) FOR ML
# =====================================================

df["next_stop_arrival_delay"] = df.groupby("train_id")["stop_arrival_delay"].shift(-1)


In [None]:
# =====================================================
# 📌 7. FEATURE SELECTION & SAVE FINAL DATASET
# =====================================================

# Drop unnecessary columns (keep timestamps for time-series models)
drop_cols = ["train_id", "train_number", "stop_departure_time", "stop_arrival_time"]
df.drop(columns=drop_cols, inplace=True)

# Save dataset
df.to_csv("data/processed/train_dataset_with_knowledge_graph.csv", index=False)

print("✅ Feature extraction with Knowledge Graph completed! Processed dataset saved.")

**Drop Unnecessary Features**
You dropped:  
```python
drop_cols = ["train_id", "train_number", "scheduled_departure_time", 
             "scheduled_arrival_time", "stop_departure_time", "stop_arrival_time", 
             "total_journey_start"]
```
- **Possible issue**:  
  - If you're using **LSTM or RNN**, you might **need timestamps** (`scheduled_departure_time`) for time-series modeling.  
  - Instead of dropping `train_id`, you could use it for **cross-validation grouping** (e.g., ensuring train sequences stay in the same fold).  

**1. Historical Delay Trends (Inspired by Literature)**
**Idea from [Real-Time Passenger Train Delay Prediction (Amtrak Study)](11)**  
- **Why?** If a train was delayed at `t-1`, it is more likely to be delayed at `t`.  
- **How?** Compute **past mean delays** per train, per station, per hour.  


**2. Weather Data Integration (Inspired by [Dynamic Delay Predictions Study](12))**  
**Why?** Weather (rain, snow) affects train delays.  
**How?** If you have external data, join with historical weather features:  

If `weather_df` contains features like `"rain_mm"`, `"temperature"`, etc., these could be useful.  

**3. Interaction Features**
Instead of raw congestion values, use **ratios** to station congestion:  
```python
df["relative_congestion"] = df["station_traffic"] / df["station_traffic"].max()
```
This normalizes congestion across different regions.  
