# Feature Engineering

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import googlemaps
import time
from tqdm import tqdm
import plotly.express as px
from plotly.subplots import make_subplots
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from meteostat import Hourly, Stations
from datetime import datetime, timezone
import networkx as nx
from geopy.distance import geodesic


In [44]:
# load_dotenv()
# api_key = os.getenv('GOOGLE_API_KEY')
# gmaps = googlemaps.Client(key=api_key)

In [2]:
INTERIM_PATH = Path("data/interim")
PROCESSED_PATH = Path("data/processed")

In [3]:
stazioni_df = pd.read_csv("data/raw/stazioni_italiane.csv")

In [4]:
stazioni_dict = {}

for _, row in stazioni_df.iterrows():
    long_name_lower = str(row["long_name"]).strip().lower()
    
    stazioni_dict[long_name_lower] = (row["latitude"], row["longitude"])

In [5]:
df = pd.read_parquet(INTERIM_PATH / "train_data_cleaned.parquet")

In [6]:
df["stop_name_lower"] = df["stop_name"].str.strip().str.lower()

In [7]:
df["latitude"] = df["stop_name_lower"].map(lambda stop: stazioni_dict.get(stop, (None, None))[0])
df["longitude"] = df["stop_name_lower"].map(lambda stop: stazioni_dict.get(stop, (None, None))[1])

In [8]:
missing_stops = set(df[df["latitude"].isna()]["stop_name"])
print(len(missing_stops))

376


In [9]:
df = df.dropna(subset=["latitude", "longitude"])

In [10]:
df = df.drop(["stop_name_lower"], axis=1)

In [54]:
df.head()

Unnamed: 0,train_number,departure_station,train_departure_delay,arrival_station,scheduled_departure_time,scheduled_arrival_time,stop_name,stop_arrival_delay,stop_departure_delay,stop_departure_time,is_terminal_stop,latitude,longitude
0,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,0.0,1.0,2024-09-30 22:38:00,True,44.50626,11.342267
1,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,2.0,2.0,2024-09-30 22:46:00,False,44.48097,11.416652
2,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,3.0,2.0,2024-09-30 22:52:00,False,44.450797,11.487297
3,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,3.0,4.0,2024-09-30 22:59:00,False,44.407376,11.597681
4,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,3.0,4.0,2024-09-30 23:08:00,False,44.359418,11.718723


In [55]:
df.to_parquet(INTERIM_PATH / "train_data_cleaned_with_coordinates.parquet", index=False)

Google api for coordinates extraction, no more implemented due to temporal cost

In [56]:
# def get_coordinates_from_google(stop_name):
#     try:
#         query = f"Stazione ferroviaria {stop_name}, Italia"
#         geocode_result = gmaps.geocode(query, region="it")

#         if geocode_result:
#             lat = geocode_result[0]['geometry']['location']['lat']
#             lon = geocode_result[0]['geometry']['location']['lng']
#             return lat, lon
#         return None, None
#     except Exception as e:
#         print(f"Errore nel recupero di {stop_name}: {e}")
#         return None, None

# new_coordinates_list = []

# # Cerca le coordinate mancanti con Google Maps
# for stop in tqdm(missing_stops, desc="Retrieving missing coordinates", unit="stop"):
#     lat, lon = get_coordinates_from_google(stop)
#     new_coordinates_list.append({"stop_name": stop, "latitude": lat, "longitude": lon})
#     time.sleep(0.2)  # Pausa per rispettare i limiti API

# new_coordinates_df = pd.DataFrame(new_coordinates_list)

# df = df.merge(new_coordinates_df, on="stop_name", how="left", suffixes=("", "_new"))

# # Se le coordinate iniziali sono NaN, usiamo quelle nuove
# df["latitude"] = df["latitude"].fillna(df["latitude_new"])
# df["longitude"] = df["longitude"].fillna(df["longitude_new"])

# df.drop(columns=["latitude_new", "longitude_new", "stop_name_lower"], inplace=True)

# df = df.merge(new_coordinates_df, on="stop_name", how="left", suffixes=("", "_new"))

# # Se le coordinate iniziali sono NaN, usiamo quelle nuove
# df["latitude"] = df["latitude"].fillna(df["latitude_new"])
# df["longitude"] = df["longitude"].fillna(df["longitude_new"])

# df.drop(columns=["latitude_new", "longitude_new", "stop_name_lower"], inplace=True)

# df.to_parquet(INTERIM_PATH / "train_data_with_coordinates.parquet", index=False)

Create map for train stops distribution using density_mapbox

In [17]:
import folium
import pandas as pd
from IPython.display import display

In [18]:
df_with_coordinates = pd.read_parquet(INTERIM_PATH / "train_data_cleaned_with_coordinates.parquet")

In [19]:
df_small = df_with_coordinates[['stop_name', 'latitude', 'longitude']]
df_small = df_small.drop_duplicates()

In [20]:
fig_stops = px.density_map(
    df_small,
    lat='latitude',
    lon='longitude',
    hover_name="stop_name",
    title="Train Stops Distribution",
    radius=10,
    opacity=0.6,
    zoom=6,
    map_style="carto-positron")
fig_stops.update_layout(height=900)
fig_stops.update_layout(width=1200)

fig_stops.show()

Screenshot inserted as the original density map cannot be viewed on github:

![image](figures/stops_density_map.png)

In [21]:
# Calculation and visualization of missing values
missing_values = df_with_coordinates.isnull().sum()
missing_percentage = (missing_values / len(df_with_coordinates)) * 100

missing_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Percentage": missing_percentage.round(2)
})

missing_summary = missing_summary[missing_summary["Missing Values"] > 0]
print("\n Null values in the final dataset:")
print(missing_summary)


 Null values in the final dataset:
                     Missing Values  Percentage
stop_departure_time         2544922       10.42


## **Time-Based Features**

In [22]:
df_with_coordinates = pd.read_parquet(INTERIM_PATH / "train_data_cleaned_with_coordinates.parquet")

In [23]:
df_with_coordinates["hour"] = df_with_coordinates["scheduled_departure_time"].dt.hour
df_with_coordinates["day_of_week"] = df_with_coordinates["scheduled_departure_time"].dt.dayofweek  # Monday=0, Sunday=6
df_with_coordinates["is_weekend"] = df_with_coordinates["day_of_week"].isin([5, 6]).astype(int)
df_with_coordinates["month"] = df_with_coordinates["scheduled_departure_time"].dt.month

# Define rush hours (e.g., 7-9 AM, 5-7 PM)
df_with_coordinates["is_rush_hour"] = df_with_coordinates["hour"].isin([7, 8, 9, 17, 18, 19]).astype(int)

## **Station-Specific Features**

In [24]:
# Count how often each station appears (proxy for congestion)
station_counts = df_with_coordinates["stop_name"].value_counts()
df_with_coordinates["station_traffic"] = df_with_coordinates["stop_name"].map(station_counts)

# Define high-traffic stations (above median frequency)
median_traffic = df_with_coordinates["station_traffic"].median()
df_with_coordinates["is_high_traffic_station"] = (df_with_coordinates["station_traffic"] >= median_traffic).astype(int)

df_with_coordinates.drop(columns=["station_traffic"], inplace=True)

In [25]:
df_with_coordinates.head()

Unnamed: 0,train_number,departure_station,train_departure_delay,arrival_station,scheduled_departure_time,scheduled_arrival_time,stop_name,stop_arrival_delay,stop_departure_delay,stop_departure_time,is_terminal_stop,latitude,longitude,hour,day_of_week,is_weekend,month,is_rush_hour,is_high_traffic_station
0,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,BOLOGNA C.LE,0.0,1.0,2024-09-30 22:38:00,True,44.50626,11.342267,22,0,0,9,0,1
1,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,S.LAZZARO DI SAVENA,2.0,2.0,2024-09-30 22:46:00,False,44.48097,11.416652,22,0,0,9,0,0
2,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,OZZANO DELL'EMILIA,3.0,2.0,2024-09-30 22:52:00,False,44.450797,11.487297,22,0,0,9,0,0
3,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,CASTEL S.PIETRO TERME,3.0,4.0,2024-09-30 22:59:00,False,44.407376,11.597681,22,0,0,9,0,1
4,17431,BOLOGNA C.LE,1.0,FAENZA,2024-09-30 22:38:00,2024-09-30 23:25:00,IMOLA,3.0,4.0,2024-09-30 23:08:00,False,44.359418,11.718723,22,0,0,9,0,1


In [16]:
df_with_coordinates.to_parquet(PROCESSED_PATH / "train_data_fe.parquet", index=False)

print("Datasets successfully saved in 'data/interim'")

Datasets successfully saved in 'data/interim'


## **GRAPH BUILD for FEATURE EXTRACTION**

The railway network is modeled as a graph where each train station is a node, and an edge exists between two nodes if a train operates between them. 
The edges are weighted based on the average positive delay (`avg_positive_delay`), meaning only the delays contribute to the structure of the network's importance.
Advances (`avg_early_arrival`) are recorded, but do not affect metrics such as PageRank or Betweenness Centrality

The extracted features provide insights into the network’s connectivity, efficiency, and congestion points. 
Each metric serves a distinct purpose in understanding how delays propagate and which stations play a critical role in this process.
We hope that such features can help models learning.



In [26]:
import pandas as pd
import networkx as nx
import geopandas as gpd
from shapely.geometry import LineString
from pathlib import Path
from tqdm import tqdm
from keplergl import KeplerGl

In [27]:
PROCESSED_PATH = Path("data/processed")

In [28]:
df = pd.read_parquet(PROCESSED_PATH / "train_data_fe.parquet")
df.sort_values(by=["train_number", "scheduled_departure_time"], inplace=True)

In [29]:
G = nx.DiGraph()
edges_dict = {}

In [30]:
for train_number, stops in tqdm(df.groupby("train_number"), desc="Processing Trains"):
    previous_station = None

    for _, row in stops.iterrows():
        current_station = row["stop_name"]

        # Add node if it doesn't exist
        if current_station not in G:
            G.add_node(current_station)

        # If there is a previous station, create a directed edge
        if previous_station:
            edge = (previous_station, current_station)
            delay = row["stop_arrival_delay"]

            # Efficiently update edge weights
            if edge in edges_dict:
                edges_dict[edge]["trains"].add(train_number)
                edges_dict[edge]["delay_sum"] += max(0, delay)  # Only positive delays
                edges_dict[edge]["early_sum"] += abs(min(0, delay))  # Only early arrivals
                edges_dict[edge]["count"] += 1
            else:
                edges_dict[edge] = {
                    "trains": {train_number},
                    "delay_sum": max(0, delay),  # Only positive delays
                    "early_sum": abs(min(0, delay)),  # Only early arrivals
                    "count": 1
                }

        previous_station = current_station

Processing Trains: 100%|██████████| 16585/16585 [05:57<00:00, 46.40it/s] 


In [31]:
for (station_a, station_b), data in edges_dict.items():
    avg_positive_delay = data["delay_sum"] / data["count"]
    avg_early_arrival = data["early_sum"] / data["count"]

    G.add_edge(station_a, station_b, 
               trains=",".join(map(str, data["trains"])), 
               avg_positive_delay=avg_positive_delay, 
               avg_early_arrival=avg_early_arrival)

nx.write_graphml(G, "other/train_network.graphml")
print("Graph saved successfully as 'train_network.graphml'")

Graph saved successfully as 'train_network.graphml'


**Degree Centrality and Network Congestion**

Degree centrality purely measures the number of direct connections a station has, disregarding delays. 
This metric is useful for identifying hubs that serve as connection points in the network. However, high-degree stations are not necessarily the most congested unless delays propagate significantly through them.

In [32]:
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G, weight="avg_positive_delay")
closeness_centrality = nx.closeness_centrality(G)
pagerank_centrality = nx.pagerank(G, alpha=0.85, weight="avg_positive_delay")

In [33]:
df["degree_centrality"] = df["stop_name"].map(degree_centrality)
df["betweenness_centrality"] = df["stop_name"].map(betweenness_centrality)
df["closeness_centrality"] = df["stop_name"].map(closeness_centrality)
df["pagerank"] = df["stop_name"].map(pagerank_centrality)

In [34]:
# Normalization
for col in ["degree_centrality", "betweenness_centrality", "closeness_centrality", "pagerank"]:
    df[col] = df[col] / df[col].max()
    df[col] = df[col].fillna(0)

In [35]:
df.head()

Unnamed: 0,train_number,departure_station,train_departure_delay,arrival_station,scheduled_departure_time,scheduled_arrival_time,stop_name,stop_arrival_delay,stop_departure_delay,stop_departure_time,...,hour,day_of_week,is_weekend,month,is_rush_hour,is_high_traffic_station,degree_centrality,betweenness_centrality,closeness_centrality,pagerank
5009319,10,MILANO CENTRALE,1.0,CHIASSO,2024-12-15 06:10:00,2024-12-15 06:58:00,MILANO CENTRALE,0.0,1.0,2024-12-15 06:10:00,...,6,6,1,12,0,1,0.830239,0.0001694587,0.98366,0.382947
5009320,10,MILANO CENTRALE,1.0,CHIASSO,2024-12-15 06:10:00,2024-12-15 06:58:00,COMO S. GIOVANNI,-4.0,1.0,2024-12-15 06:54:00,...,6,6,1,12,0,1,0.145889,4.19083e-23,0.728461,0.208147
5009321,10,MILANO CENTRALE,1.0,CHIASSO,2024-12-15 06:10:00,2024-12-15 06:58:00,CHIASSO,-1.0,0.0,NaT,...,6,6,1,12,0,1,0.098143,5.428734000000001e-17,0.740528,0.033407
5061361,10,MILANO CENTRALE,2.0,CHIASSO,2024-12-16 06:10:00,2024-12-16 06:58:00,MILANO CENTRALE,0.0,2.0,2024-12-16 06:10:00,...,6,0,0,12,0,1,0.830239,0.0001694587,0.98366,0.382947
5061362,10,MILANO CENTRALE,2.0,CHIASSO,2024-12-16 06:10:00,2024-12-16 06:58:00,COMO S. GIOVANNI,3.0,3.0,2024-12-16 06:54:00,...,6,0,0,12,0,1,0.145889,4.19083e-23,0.728461,0.208147


In [36]:
df.to_parquet(PROCESSED_PATH / "train_data_fe_graph.parquet", index=False)
print("Graph features extracted and saved successfully")

Graph features extracted and saved successfully


In [37]:
station_coords = df.groupby("stop_name")[["latitude", "longitude"]].first().reset_index()

# Nodes (stations)
nodes_df = pd.DataFrame({
    "station": list(G.nodes),
    "latitude": [station_coords.loc[station_coords["stop_name"] == n, "latitude"].values[0] for n in G.nodes],
    "longitude": [station_coords.loc[station_coords["stop_name"] == n, "longitude"].values[0] for n in G.nodes]
})

# Edges (train routes)
edges_list = []
for station_a, station_b, attr in G.edges(data=True):
    if station_a in station_coords["stop_name"].values and station_b in station_coords["stop_name"].values:
        lat_a = station_coords.loc[station_coords["stop_name"] == station_a, "latitude"].values[0]
        lon_a = station_coords.loc[station_coords["stop_name"] == station_a, "longitude"].values[0]
        lat_b = station_coords.loc[station_coords["stop_name"] == station_b, "latitude"].values[0]
        lon_b = station_coords.loc[station_coords["stop_name"] == station_b, "longitude"].values[0]

        edges_list.append({
            "start_station": station_a,
            "end_station": station_b,
            "trains": attr["trains"],
            "average_positive_delay": attr["avg_positive_delay"],
            "average_early_arrival": attr["avg_early_arrival"],
            "geometry": LineString([(lon_a, lat_a), (lon_b, lat_b)])
        })

edges_gdf = gpd.GeoDataFrame(edges_list, geometry="geometry", crs="EPSG:4326")

m = KeplerGl(height=800, data={"Stations": nodes_df, "Train Routes": edges_gdf})

m.config = {
    "version": "v1",
    "config": {
        "mapState": {
            "bearing": 0,
            "latitude": 42.5,
            "longitude": 12.5,
            "pitch": 0,
            "zoom": 6
        }
    }
}

m.save_to_html(file_name="other/train_network_kepler.html")
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to other/train_network_kepler.html!


KeplerGl(config={'version': 'v1', 'config': {'mapState': {'bearing': 0, 'latitude': 42.5, 'longitude': 12.5, '…

Screenshot inserted as the original density map cannot be viewed on github:

![image](figures/graph.png)

## **WEATHER DATA INTEGRATION** (not yet implemented due to time limitation)

Accurately predicting train delays requires integrating relevant weather variables that significantly impact railway operations. Recent studies, such as the one referenced ([arXiv:2203.06956](https://arxiv.org/abs/2203.06956)), highlight several key meteorological factors that influence train performance. These include **temperature, humidity, precipitation, snow depth, and wind speed**, each contributing to different types of disruptions in the railway system.  

- **Air Temperature:** Affects the expansion and contraction of railway materials, influencing track stability and increasing the risk of ice formation. Extreme temperatures can lead to track misalignments, affecting train speed and safety.  
- **Relative Humidity:** High humidity levels contribute to fog formation, reducing visibility for train operators. Additionally, in cold conditions, it increases the likelihood of ice accumulation on tracks.  
- **Precipitation (Rain & Snow):** Heavy rainfall can flood tracks and disrupt electrical components, while snow accumulation can lead to traction issues and blocked tracks.  
- **Snow Depth:** Significant snow accumulation can obstruct railway lines, leading to delays or cancellations. Maintenance efforts must be increased to ensure safe passage.  
- **Wind Speed:** Strong winds pose risks, especially on bridges or open tracks, where they can impact train stability and force speed reductions for safety.  


**Methodology for Integrating Weather Data into the Delay Model**  

To systematically incorporate weather data, we adopt an approach that aligns meteorological conditions with train performance at each station.  

We extract all unique latitude-longitude pairs from the dataset, ensuring that each station's location is accurately represented. This ensures that meteorological data is correctly assigned to the stations most affected by specific weather patterns.  

For each extracted geolocation, we retrieve weather conditions at 12:00 PM every day throughout 2024. This fixed timestamp provides a standardized snapshot of prevailing weather conditions.   

Once collected, the weather data is stored in a new DataFrame. Each row corresponds to a specific station, date, and its associated weather features, including temperature, humidity, precipitation, snow depth, and wind speed.  


To analyze the impact of weather on delays, we merge the weather dataset with the original train dataset based on latitude, longitude, and date.

In [None]:
# import pandas as pd
# import requests
# import time
# from tqdm import tqdm
# from pathlib import Path

In [None]:
# INTERIM_PATH = Path("data/interim")
# PROCESSED_PATH = Path("data/processed")

In [None]:
# API_URL = "https://api.open-meteo.com/v1/forecast"

In [None]:
# df = pd.read_parquet(PROCESSED_PATH / "train_data_fe_kg.parquet")
# stations_df = df[["stop_name", "latitude", "longitude"]].drop_duplicates().reset_index(drop=True)

In [None]:
# weather_data = []

# months_2024 = [(f"2024-{str(m).zfill(2)}-01", f"2024-{str(m).zfill(2)}-{(datetime(2024, m, 1) + timedelta(days=31)).replace(day=1) - timedelta(days=1)}") for m in range(1, 13)]

# total_requests = len(stations_df) * len(months_2024)

# for _, row in tqdm(stations_df.iterrows(), total=len(stations_df), desc="Fetching Weather Data for Each Station"):
#     lat, lon = row["latitude"], row["longitude"]

#     for start_date, end_date in months_2024:
#         params = {
#             "latitude": lat,
#             "longitude": lon,
#             "hourly": ["temperature_2m", "relative_humidity_2m", "precipitation", "snow_depth", "windspeed_10m"],
#             "timezone": "Europe/Rome",
#             "start": start_date,
#             "end": end_date
#         }

#         retries = 3
#         for attempt in range(retries):
#             try:
#                 response = requests.get(API_URL, params=params, timeout=10)

#                 if response.status_code == 200:
#                     data = response.json()

#                     # Extract only 12 PM data
#                     for i, timestamp in enumerate(data["hourly"]["time"]):
#                         if timestamp.endswith("12:00"):
#                             weather_data.append({
#                                 "date": timestamp[:10],
#                                 "latitude": lat,
#                                 "longitude": lon,
#                                 "temperature": data["hourly"]["temperature_2m"][i],
#                                 "humidity": data["hourly"]["relative_humidity_2m"][i],
#                                 "precipitation": data["hourly"]["precipitation"][i],
#                                 "snow_depth": data["hourly"]["snow_depth"][i],
#                                 "wind_speed": data["hourly"]["windspeed_10m"][i]
#                             })
#                     break

#                 else:
#                     print(f"API error for {lat}, {lon} ({start_date} - {end_date}): {response.status_code}")

#             except requests.exceptions.RequestException as e:
#                 print(f"Attempt {attempt+1} failed for {lat}, {lon} ({start_date} - {end_date}): {e}")
#                 time.sleep(2 ** attempt)  # Esponential Backoff: 2s, 4s, 8s

#             if attempt == retries - 1:
#                 print(f"Failed to fetch weather data for {lat}, {lon} ({start_date} - {end_date}) after {retries} attempts.")

#         time.sleep(1)

In [None]:
# weather_df = pd.DataFrame(weather_data)

In [None]:
# weather_df.to_parquet(INTERIM_PATH / "weather_data.parquet", index=False)

In [None]:
# from pathlib import Path

# PROCESSED_PATH = Path("data/processed")
# INTERIM_PATH = Path("data/interim")

# df = pd.read_parquet(INTERIM_PATH / "train_data_fe.parquet")
# df_graph = pd.read_parquet(INTERIM_PATH / "train_data_fe_kg.parquet")

In [None]:
# df.columns


In [None]:
# df_graph.columns

In [None]:
# drop_cols = [
#     "train_id", "train_number", "departure_station", "arrival_station", "stop_name",
#     "scheduled_departure_time", "scheduled_arrival_time", "stop_arrival_time", "stop_departure_time",
#     "is_terminal_stop", "actual_stop_duration", "actual_total_duration", "scheduled_total_duration", "planned_stop_duration",
#     "is_extreme_delay"
# ]

# df_clean = df.drop(columns=drop_cols)
# df_graph_clean = df_graph.drop(columns=drop_cols)

In [None]:
# df_clean.to_parquet(PROCESSED_PATH / "train_data_final.parquet", index=False)
# df_graph_clean.to_parquet(PROCESSED_PATH / "train_data_fe_graph_final.parquet", index=False)