### Train view: a first test analysing the data from the perspective of a single train journey


**Parameters:**

inputs:

- `train origin and destination`
- `date`


outputs:

- `overall train journey`: the incidents and delays the train encountered along its route

In [1]:
# Import the libraries used by the train-view analysis in this notebook
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime, timedelta
import sys
import matplotlib.pyplot as plt
import geopandas as gpd
import json
import folium
from folium import plugins


print(" Libraries imported successfully for station_view analysis")

 Libraries imported successfully for station_view analysis


In [2]:
# Load every .parquet file found one directory level below processed_data into a single DataFrame.
# Each file is read with pandas' default engine first and retried with fastparquet if that fails.
# The parent directory name is stored as STANOX and the file name (without extension) as DAY.

data_dir = '../processed_data' if not os.path.isdir('processed_data') else 'processed_data'
# sorted() keeps the load order (and therefore the row order of all_data) reproducible between runs
all_parquet_files = sorted(glob.glob(os.path.join(data_dir, '*', '*.parquet')))

list_df = []
skipped_files = []
for file in all_parquet_files:
    try:
        df = pd.read_parquet(file)
    except Exception as default_engine_error:
        try:
            df = pd.read_parquet(file, engine='fastparquet')
        except Exception as fastparquet_error:
            # Keep going, but report why the file was dropped instead of discarding the errors
            skipped_files.append(file)
            print(
                f"Skipping {file}: default engine failed ({default_engine_error!r}); "
                f"fastparquet failed ({fastparquet_error!r})"
            )
            continue
    stanox = os.path.basename(os.path.dirname(file))
    day = os.path.splitext(os.path.basename(file))[0]
    df['STANOX'] = stanox
    df['DAY'] = day
    list_df.append(df)

if list_df:
    all_data = pd.concat(list_df, ignore_index=True)
    print(f"Loaded {len(all_data)} rows from {len(list_df)} files. Skipped {len(skipped_files)} files.")
else:
    all_data = pd.DataFrame()
    print("No data loaded.")

Loaded 10450237 rows from 2599 files. Skipped 0 files.


Column names of the combined DataFrame, listed here for quick reference during the test.

In [3]:
all_data.columns.tolist()  # Show the column names of the combined DataFrame as a plain list

['TRAIN_SERVICE_CODE',
 'PLANNED_ORIGIN_LOCATION_CODE',
 'PLANNED_ORIGIN_GBTT_DATETIME',
 'PLANNED_DEST_LOCATION_CODE',
 'PLANNED_DEST_GBTT_DATETIME',
 'PLANNED_CALLS',
 'ACTUAL_CALLS',
 'PFPI_MINUTES',
 'INCIDENT_REASON',
 'INCIDENT_NUMBER',
 'EVENT_TYPE',
 'SECTION_CODE',
 'DELAY_DAY',
 'EVENT_DATETIME',
 'INCIDENT_START_DATETIME',
 'ENGLISH_DAY_TYPE',
 'STATION_ROLE',
 'DFT_CATEGORY',
 'PLATFORM_COUNT',
 'DATASET_TYPE',
 'WEEKDAY',
 'STANOX',
 'DAY']

In [11]:
def train_view(all_data, origin_code, destination_code, input_date_str):
    """
    List the incidents recorded on one date for trains between an origin-destination (OD) pair.

    The rows for the OD pair are selected, PLANNED_CALLS is recomputed as
    ACTUAL_CALLS - PFPI_MINUTES (a missing PFPI_MINUTES counts as 0; the stored
    PLANNED_CALLS is kept when ACTUAL_CALLS cannot be parsed as HHMM), and the rows
    whose INCIDENT_START_DATETIME falls on the requested date are displayed and
    returned. `all_data` itself is not modified: the selection and the datetime
    parsing are done on a copy of the OD subset only.

    Parameters
    ----------
    all_data : pd.DataFrame
        Must contain:
        ['TRAIN_SERVICE_CODE', 'PLANNED_ORIGIN_LOCATION_CODE', 'PLANNED_ORIGIN_GBTT_DATETIME',
         'PLANNED_DEST_LOCATION_CODE', 'PLANNED_DEST_GBTT_DATETIME', 'PLANNED_CALLS', 'ACTUAL_CALLS',
         'PFPI_MINUTES', 'INCIDENT_REASON', 'INCIDENT_NUMBER', 'EVENT_TYPE', 'SECTION_CODE', 'DELAY_DAY',
         'EVENT_DATETIME', 'INCIDENT_START_DATETIME', 'ENGLISH_DAY_TYPE', 'STATION_ROLE', 'DFT_CATEGORY',
         'PLATFORM_COUNT', 'DATASET_TYPE', 'WEEKDAY', 'STANOX', 'DAY']
        An existing 'OD_PAIR' column ("<origin>_<destination>") is used when present.
    origin_code : str or int
        Origin location code.
    destination_code : str or int
        Destination location code.
    input_date_str : str
        Date in the format 'DD-MMM-YYYY' (e.g., '13-JUN-2024').

    Returns
    -------
    pd.DataFrame or str
        DataFrame of matching incidents (with corrected PLANNED_CALLS), or a message
        string when the OD pair is absent or has no incidents on that date.

    Raises
    ------
    ValueError
        If input_date_str cannot be parsed as 'DD-MMM-YYYY'.
    """

    # --- Validate the date before doing any work on the (large) dataset ---
    input_date = pd.to_datetime(input_date_str, format='%d-%b-%Y', errors='coerce')
    if pd.isna(input_date):
        raise ValueError(
            f"Invalid date {input_date_str!r}: expected format 'DD-MMM-YYYY' (e.g., '13-JUN-2024')."
        )

    od_pair = f"{origin_code}_{destination_code}"

    # --- Select the OD pair with a boolean mask so the caller's DataFrame is left untouched ---
    if 'OD_PAIR' in all_data.columns:
        od_mask = all_data['OD_PAIR'] == od_pair
    else:
        od_mask = (
            (all_data['PLANNED_ORIGIN_LOCATION_CODE'].astype(str) == str(origin_code))
            & (all_data['PLANNED_DEST_LOCATION_CODE'].astype(str) == str(destination_code))
        )

    if not od_mask.any():
        message = f"OD pair {od_pair} not found in dataset."
        print(message)
        return message

    # --- Subset for this OD pair; dates are parsed on the subset only, not on every row ---
    trains_between = all_data.loc[od_mask].copy()
    trains_between['INCIDENT_START_DATETIME'] = pd.to_datetime(
        trains_between['INCIDENT_START_DATETIME'], errors='coerce'
    )

    # --- Correct PLANNED_CALLS using ACTUAL_CALLS - PFPI_MINUTES ---
    trains_between['ACTUAL_CALLS_dt'] = pd.to_datetime(
        trains_between['ACTUAL_CALLS'], format='%H%M', errors='coerce'
    )
    trains_between['PFPI_MINUTES_num'] = pd.to_numeric(
        trains_between['PFPI_MINUTES'], errors='coerce'
    )
    trains_between['CORRECTED_PLANNED_CALLS_dt'] = (
        trains_between['ACTUAL_CALLS_dt']
        - pd.to_timedelta(trains_between['PFPI_MINUTES_num'].fillna(0), unit='m')
    )
    # Rows whose ACTUAL_CALLS could not be parsed keep their original PLANNED_CALLS
    trains_between['PLANNED_CALLS'] = trains_between[
        'CORRECTED_PLANNED_CALLS_dt'
    ].dt.strftime('%H%M').fillna(trains_between['PLANNED_CALLS'])

    print(f" Train journeys between {origin_code} and {destination_code}: {len(trains_between)} records found.")
    print(f"Unique train service codes: {trains_between['TRAIN_SERVICE_CODE'].dropna().unique().tolist()}")

    # --- Filter incidents by date ---
    incidents_on_date = trains_between[
        trains_between['INCIDENT_START_DATETIME'].dt.date == input_date.date()
    ].copy()

    if incidents_on_date.empty:
        message = f" No incidents found for OD pair {od_pair} on {input_date_str}."
        print(message)
        return message

    message = f" {len(incidents_on_date)} incident(s) found for OD pair {od_pair} on {input_date_str}:"
    print(message)

    # --- Columns to show (helper columns created above are left out) ---
    cols_to_show = [
        'TRAIN_SERVICE_CODE', 'PLANNED_ORIGIN_LOCATION_CODE', 'PLANNED_ORIGIN_GBTT_DATETIME',
        'PLANNED_DEST_LOCATION_CODE', 'PLANNED_DEST_GBTT_DATETIME', 'PLANNED_CALLS', 'ACTUAL_CALLS',
        'PFPI_MINUTES', 'INCIDENT_REASON', 'INCIDENT_NUMBER', 'EVENT_TYPE', 'SECTION_CODE', 'DELAY_DAY',
        'EVENT_DATETIME', 'INCIDENT_START_DATETIME', 'ENGLISH_DAY_TYPE', 'STATION_ROLE', 'DFT_CATEGORY',
        'PLATFORM_COUNT', 'DATASET_TYPE', 'WEEKDAY', 'STANOX', 'DAY'
    ]

    # Display filtered columns only (in the preferred order) and return the same view
    incidents_view = incidents_on_date[cols_to_show]
    display(incidents_view)
    return incidents_view


### OD pair + DATE

In [12]:
# Incidents on 13-JUN-2024 for both directions between location codes 32530 and 11231
result_1 = train_view(all_data, '32530', '11231', '13-JUN-2024')
result_2 = train_view(all_data, '11231', '32530', '13-JUN-2024')


  all_data['INCIDENT_START_DATETIME'] = pd.to_datetime(


 Train journeys between 32530 and 11231: 17626 records found.
Unique train service codes: ['12357820']
 10 incident(s) found for OD pair 32530_11231 on 13-JUN-2024:


Unnamed: 0,TRAIN_SERVICE_CODE,PLANNED_ORIGIN_LOCATION_CODE,PLANNED_ORIGIN_GBTT_DATETIME,PLANNED_DEST_LOCATION_CODE,PLANNED_DEST_GBTT_DATETIME,PLANNED_CALLS,ACTUAL_CALLS,PFPI_MINUTES,INCIDENT_REASON,INCIDENT_NUMBER,...,EVENT_DATETIME,INCIDENT_START_DATETIME,ENGLISH_DAY_TYPE,STATION_ROLE,DFT_CATEGORY,PLATFORM_COUNT,DATASET_TYPE,WEEKDAY,STANOX,DAY
27940,12357820,32530,1029,11231,1302,1202,1205,3.0,VH,551271.0,...,13-JUN-2024 12:05,2024-06-13 12:02:00,[TH],,,,SINGLE_DAY,TH,11720,TH
29857,12357820,32530,1329,11231,1605,1503,1508,5.0,XA,551377.0,...,13-JUN-2024 15:08,2024-06-13 13:05:00,[TH],,,,SINGLE_DAY,TH,11720,TH
1234961,12357820,32530,529,11231,755,701,710,9.0,M8,550428.0,...,13-JUN-2024 07:10,2024-06-13 06:19:00,[TH],,,,SINGLE_DAY,TH,30120,TH
1240227,12357820,32530,1229,11231,1505,1346,1349,3.0,JL,550229.0,...,13-JUN-2024 13:49,2024-06-13 03:18:00,[TH],,,,SINGLE_DAY,TH,30120,TH
1497853,12357820,32530,529,11231,755,633,636,3.0,TZ,550405.0,...,13-JUN-2024 06:36,2024-06-13 06:00:00,[TH],,,,SINGLE_DAY,TH,31521,TH
1551608,12357820,32530,529,11231,755,633,636,3.0,TZ,550405.0,...,13-JUN-2024 06:36,2024-06-13 06:00:00,[TH],,,,SINGLE_DAY,TH,31620,TH
1845236,12357820,32530,1829,11231,2053,1831,1837,6.0,RC,552298.0,...,13-JUN-2024 18:37,2024-06-13 18:26:00,[TH],,,,SINGLE_DAY,TH,32530,TH
2030479,12357820,32530,529,11231,755,552,619,27.0,TZ,550405.0,...,13-JUN-2024 06:19,2024-06-13 06:00:00,[TH],,,,SINGLE_DAY,TH,33087,TH
2030898,12357820,32530,729,11231,953,754,803,9.0,TH,551532.0,...,13-JUN-2024 08:03,2024-06-13 14:08:00,[TH],,,,SINGLE_DAY,TH,33087,TH
2080609,12357820,32530,1229,11231,1505,1339,1341,2.0,JL,550229.0,...,13-JUN-2024 13:41,2024-06-13 03:18:00,[TH],,,,SINGLE_DAY,TH,35136,TH


 Train journeys between 11231 and 32530: 11933 records found.
Unique train service codes: ['12357820']
 6 incident(s) found for OD pair 11231_32530 on 13-JUN-2024:


Unnamed: 0,TRAIN_SERVICE_CODE,PLANNED_ORIGIN_LOCATION_CODE,PLANNED_ORIGIN_GBTT_DATETIME,PLANNED_DEST_LOCATION_CODE,PLANNED_DEST_GBTT_DATETIME,PLANNED_CALLS,ACTUAL_CALLS,PFPI_MINUTES,INCIDENT_REASON,INCIDENT_NUMBER,...,EVENT_DATETIME,INCIDENT_START_DATETIME,ENGLISH_DAY_TYPE,STATION_ROLE,DFT_CATEGORY,PLATFORM_COUNT,DATASET_TYPE,WEEKDAY,STANOX,DAY
1498005,12357820,11231,458,32530,715,651,653,1.5,TZ,550356.0,...,13-JUN-2024 06:53,2024-06-13 05:45:00,[TH],,,,SINGLE_DAY,TH,31521,TH
1498006,12357820,11231,458,32530,715,651,653,1.5,Q1,550356.0,...,13-JUN-2024 06:53,2024-06-13 05:45:00,[TH],,,,SINGLE_DAY,TH,31521,TH
1500128,12357820,11231,848,32530,1115,1057,1103,6.0,TZ,550405.0,...,13-JUN-2024 11:03,2024-06-13 06:00:00,[TH],,,,SINGLE_DAY,TH,31521,TH
1501319,12357820,11231,1151,32530,1415,1400,1403,3.0,TA,551212.0,...,13-JUN-2024 14:03,2024-06-13 12:04:00,[TH],,,,SINGLE_DAY,TH,31521,TH
1501341,12357820,11231,1151,32530,1415,1405,1408,3.0,R7,551525.0,...,13-JUN-2024 14:08,2024-06-13 14:02:00,[TH],,,,SINGLE_DAY,TH,31521,TH
1555381,12357820,11231,1151,32530,1415,1400,1403,3.0,TA,551212.0,...,13-JUN-2024 14:03,2024-06-13 12:04:00,[TH],,,,SINGLE_DAY,TH,31620,TH


In [13]:
def get_stanox_for_service(all_data, train_service_code, origin_code, destination_code):
    """
    Return the ordered STANOX codes visited by the earliest-departing train of a service on an OD pair.

    The rows for the given train service and OD pair are selected, the train with the
    earliest PLANNED_ORIGIN_GBTT_DATETIME (parsed as HHMM) is picked, and its unique
    STANOX codes are ordered by time:
    - PLANNED_ORIGIN_GBTT_DATETIME for the origin,
    - PLANNED_DEST_GBTT_DATETIME for the destination,
    - PLANNED_CALLS (only when it is a 4-digit string) for all other stations.
    STANOX codes without a usable time sort last. The origin is always the first
    element of the result, even if it does not appear in the data.
    `all_data` itself is not modified.

    Parameters
    ----------
    all_data : pd.DataFrame
        Must contain 'PLANNED_ORIGIN_LOCATION_CODE', 'PLANNED_DEST_LOCATION_CODE',
        'TRAIN_SERVICE_CODE', 'PLANNED_ORIGIN_GBTT_DATETIME', 'PLANNED_DEST_GBTT_DATETIME',
        'PLANNED_CALLS' and 'STANOX'. An existing 'OD_PAIR' column is used when present.
    train_service_code : str or int
        Train service code to look up (compared as a string).
    origin_code : str or int
        Origin location code.
    destination_code : str or int
        Destination location code.

    Returns
    -------
    list of str or str
        Ordered STANOX codes, or a message string when no rows match.
    """
    origin_str = str(origin_code)
    destination_str = str(destination_code)
    od_pair = f"{origin_code}_{destination_code}"

    # --- Select the OD pair with a boolean mask so the caller's DataFrame is left untouched ---
    if 'OD_PAIR' in all_data.columns:
        od_mask = all_data['OD_PAIR'] == od_pair
    else:
        od_mask = (
            (all_data['PLANNED_ORIGIN_LOCATION_CODE'].astype(str) == origin_str)
            & (all_data['PLANNED_DEST_LOCATION_CODE'].astype(str) == destination_str)
        )

    # --- Filter dataset for this service and OD pair ---
    subset = all_data[
        od_mask & (all_data['TRAIN_SERVICE_CODE'].astype(str) == str(train_service_code))
    ].copy()

    if subset.empty:
        message = f"🚫 No records found for train service {train_service_code} on OD pair {od_pair}."
        print(message)
        return message

    # --- Find the first train of the day (earliest PLANNED_ORIGIN_GBTT_DATETIME) ---
    subset['PLANNED_ORIGIN_GBTT_DATETIME'] = pd.to_datetime(
        subset['PLANNED_ORIGIN_GBTT_DATETIME'], format='%H%M', errors='coerce'
    )
    first_origin_time = subset['PLANNED_ORIGIN_GBTT_DATETIME'].min()
    first_train = subset[subset['PLANNED_ORIGIN_GBTT_DATETIME'] == first_origin_time].copy()

    # --- For this train, work out the time used to order each row's STANOX ---
    def get_time(row):
        stanox = str(row['STANOX'])
        if stanox == origin_str and pd.notna(row.get('PLANNED_ORIGIN_GBTT_DATETIME', None)):
            # Already parsed to a datetime above; to_datetime passes it through unchanged
            return pd.to_datetime(row['PLANNED_ORIGIN_GBTT_DATETIME'], format='%H%M', errors='coerce')
        if stanox == destination_str and pd.notna(row.get('PLANNED_DEST_GBTT_DATETIME', None)):
            return pd.to_datetime(row['PLANNED_DEST_GBTT_DATETIME'], format='%H%M', errors='coerce')
        planned_call = row.get('PLANNED_CALLS', None)
        if pd.notna(planned_call) and str(planned_call).isdigit() and len(str(planned_call)) == 4:
            # Time-only values are parsed onto a common default date, which is enough for sorting
            return pd.to_datetime(str(planned_call), format='%H%M', errors='coerce')
        return pd.NaT

    # Keep only unique STANOX for this train, keeping the earliest time for each
    first_train['ORDER_TIME'] = first_train.apply(get_time, axis=1)
    ordered = first_train.groupby('STANOX', as_index=False)['ORDER_TIME'].min()
    ordered = ordered.sort_values('ORDER_TIME')
    stanox_list = ordered['STANOX'].astype(str).tolist()

    # Always include origin_code as the first element, even if not present in the data
    if origin_str in stanox_list:
        stanox_list.remove(origin_str)
    stanox_list = [origin_str] + stanox_list

    print(f"✅ Ordered STANOX for first train of the day for service {train_service_code} on OD pair {od_pair} (origin always first):")
    print(stanox_list)
    return stanox_list


In [14]:
# Ordered STANOX sequence for service 12357820 on the OD pair 11231 -> 32530
service_stanox = get_stanox_for_service(all_data, '12357820', '11231', '32530')

‚úÖ Ordered STANOX for first train of the day for service 12357820 on OD pair 11231_32530 (origin always first):
['11231', '11271', '11720', '30120', '31620', '33087', '32000', '31521', '32530']


In [23]:
def _normalize_incident_number(value):
    """Return an incident number as a display string (551271.0 -> '551271'), or None when it is missing."""
    try:
        if pd.isna(value):
            return None
        as_float = float(value)
        return str(int(as_float)) if as_float.is_integer() else str(value)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric identifiers are shown as they are
        return str(value)


def _station_lat_lon(station_lookup, code):
    """Return (latitude, longitude) as floats for a STANOX code, or None when no usable coordinates exist."""
    station = station_lookup.get(code)
    if not station or 'latitude' not in station or 'longitude' not in station:
        return None
    try:
        return float(station['latitude']), float(station['longitude'])
    except (TypeError, ValueError):
        return None


def _delay_color(delay):
    """Map a total delay in minutes to the marker colour shown in the map legend."""
    try:
        minutes = float(delay)
    except (TypeError, ValueError):
        minutes = 0
    if minutes == 0:
        return "blue"
    bands = (
        (5, '#32CD32'),     # Minor (1-5 min) - Lime Green
        (15, '#FFD700'),    # Moderate (6-15 min) - Gold
        (30, '#FF8C00'),    # Significant (16-30 min) - Dark Orange
        (60, '#FF0000'),    # Major (31-60 min) - Red
        (120, '#8B0000'),   # Severe (61-120 min) - Dark Red
    )
    for upper_bound, color in bands:
        if minutes <= upper_bound:
            return color
    return '#8A2BE2'        # Critical (120+ min) - Blue Violet


def _incident_durations(records):
    """Return {incident id: duration} from the earliest incident start to the latest event, clamped at zero."""
    starts = pd.to_datetime(records['INCIDENT_START_DATETIME'], errors='coerce')
    if 'EVENT_DATETIME' in records.columns:
        # Parse before taking the maximum: the raw 'DD-MMM-YYYY HH:MM' strings do not sort chronologically
        ends = pd.to_datetime(records['EVENT_DATETIME'], format='%d-%b-%Y %H:%M', errors='coerce')
    else:
        ends = pd.Series(pd.NaT, index=records.index)
    timing = pd.DataFrame({
        'incident': records['INCIDENT_NUMBER_str'],
        'start': starts,
        'end': ends,
    })
    durations = {}
    for incident, group in timing.groupby('incident'):
        duration = group['end'].max() - group['start'].min()
        if pd.notna(duration):
            # Ensure duration is not negative (handle data inconsistencies)
            durations[incident] = max(duration, pd.Timedelta(0))
    return durations


def map_train_journey_with_incidents(
    all_data, service_stanox, incident_results=None,
    stations_ref_path=r"C:\Users\39342\University of Glasgow\Ji-Eun Byun - MZ-JB\MSci (Research) 2024-25\reference data\stations_ref_with_dft.json",
    incident_color="purple", service_code=None, date_str=None
    ):
    """
    Build a folium map of a train journey with its delays and incidents.

    1. Map each unique incident (INCIDENT_NUMBER, SECTION_CODE) as a numbered marker at the
       station whose STANOX equals the SECTION_CODE. The number is the incident's chronological
       index (1 = earliest INCIDENT_START_DATETIME); the popup shows incident number, start
       datetime, reason and duration.
    2. Map service_stanox as the journey sequence, connecting stations with a line.
    3. For each STANOX, sum all PFPI_MINUTES from the provided incident result DataFrames and use
       color-grading for the marker. Station popups list the chronological incident indices
       (1,2,3...) that affected them.

    Parameters
    ----------
    all_data : pd.DataFrame
        Not used by this function; kept so existing calls keep working.
    service_stanox : iterable
        STANOX codes in journey order (e.g. the list returned by get_stanox_for_service).
    incident_results : list of pd.DataFrame, pd.DataFrame or None
        Incident tables (e.g. the results of train_view). Items that are not DataFrames,
        such as the message strings train_view returns, are ignored.
    stations_ref_path : str
        Path to a JSON list of station records with 'stanox', 'latitude' and 'longitude'.
        NOTE(review): the default is a machine-specific absolute path; pass the path
        explicitly when running elsewhere.
    incident_color : str
        Background colour of the numbered incident markers.
    service_code, date_str : str or None
        When both are given, a title box with these values is added to the map.

    Returns
    -------
    folium.Map or None
        The map, or None when none of the STANOX codes has coordinates in the reference file.
    """
    # Load station reference (JSON is read as UTF-8 rather than the platform default encoding)
    with open(stations_ref_path, "r", encoding="utf-8") as f:
        station_ref = json.load(f)

    # Index the reference once by STANOX; setdefault keeps the first record for a duplicated code
    station_lookup = {}
    for item in station_ref:
        station_lookup.setdefault(str(item.get("stanox", "")), item)

    # Build station coordinates for the service_stanox sequence only
    stanox_coords = []
    for s in service_stanox:
        s_str = str(int(float(s))) if isinstance(s, (int, float)) else str(s)
        lat_lon = _station_lat_lon(station_lookup, s_str)
        if lat_lon is not None:
            stanox_coords.append((s_str, lat_lon[0], lat_lon[1]))

    if not stanox_coords:
        print("⚠️ No coordinates found for STANOX in this service.")
        return None

    # --- Folium map visualization, centred on the mean station position ---
    mid_lat = sum(lat for _, lat, _ in stanox_coords) / len(stanox_coords)
    mid_lon = sum(lon for _, _, lon in stanox_coords) / len(stanox_coords)
    m = folium.Map(location=[mid_lat, mid_lon], zoom_start=8, tiles="CartoDB positron")

    # Add title if provided
    if service_code and date_str:
        title_html = f"<div style='position: fixed; top: 10px; left: 50%; transform: translateX(-50%); z-index:9999; font-size:18px; background: white; border:2px solid grey; border-radius:8px; padding: 10px;'><b>Train Service: {service_code}</b><br><b>Date: {date_str}</b></div>"
        m.get_root().html.add_child(folium.Element(title_html))

    # Draw the journey as one line through the stations, in sequence order
    journey_path = [(lat, lon) for _, lat, lon in stanox_coords]
    if len(journey_path) > 1:
        folium.PolyLine(journey_path, color="blue", weight=4, opacity=0.8).add_to(m)

    # Combine the incident tables; anything that is not a DataFrame is skipped
    if isinstance(incident_results, pd.DataFrame):
        incident_results = [incident_results]
    frames = [res for res in (incident_results or []) if isinstance(res, pd.DataFrame)]
    records = pd.concat(frames, ignore_index=True) if frames else None

    stanox_delay = {}       # maps stanox -> summed PFPI minutes
    stanox_incidents = {}   # maps stanox -> sorted list of incident id strings
    incident_rank = {}      # maps incident id string -> chronological rank (1 = earliest start)
    if records is not None:
        # Normalize INCIDENT_NUMBER to string once, for consistent display and matching
        if 'INCIDENT_NUMBER' in records.columns:
            records['INCIDENT_NUMBER_str'] = records['INCIDENT_NUMBER'].apply(_normalize_incident_number)
        else:
            records['INCIDENT_NUMBER_str'] = None

        # Aggregate delays for each STANOX (sum of PFPI_MINUTES for all incidents and all records)
        if 'STANOX' in records.columns and 'PFPI_MINUTES' in records.columns:
            records['PFPI_MINUTES_num'] = pd.to_numeric(records['PFPI_MINUTES'], errors='coerce').fillna(0)
            for stanox, group in records.groupby('STANOX'):
                stanox_delay[str(stanox)] = group['PFPI_MINUTES_num'].sum()
                stanox_incidents[str(stanox)] = sorted(
                    str(v) for v in pd.unique(group['INCIDENT_NUMBER_str'].dropna())
                )

        # Build chronological ranking for unique incidents (1 = earliest start time)
        if 'INCIDENT_START_DATETIME' in records.columns:
            first_seen = (
                records[['INCIDENT_NUMBER_str', 'INCIDENT_START_DATETIME']]
                .dropna(subset=['INCIDENT_NUMBER_str', 'INCIDENT_START_DATETIME'])
                .drop_duplicates(subset=['INCIDENT_NUMBER_str'])
                .copy()
            )
            first_seen['INCIDENT_START_dt'] = pd.to_datetime(
                first_seen['INCIDENT_START_DATETIME'], errors='coerce'
            )
            # Stable sort so incidents with equal start times keep a deterministic order
            first_seen = first_seen.sort_values('INCIDENT_START_dt', kind='stable')
            incident_rank = {
                inc_id: position
                for position, inc_id in enumerate(first_seen['INCIDENT_NUMBER_str'].astype(str), start=1)
            }

    # Map station markers with color-grading (no delay=blue, then severity colours)
    for stanox, lat, lon in stanox_coords:
        delay_val = stanox_delay.get(stanox, 0)
        color = _delay_color(delay_val)
        # Prepare ranked incident list for popup, truncate if long
        inc_list = stanox_incidents.get(stanox, [])
        if inc_list:
            # Map incident ids to ranks, use id fallback if rank not available
            inc_ranks = [str(incident_rank.get(str(i), i)) for i in inc_list]
            if len(inc_ranks) > 10:
                inc_display = ', '.join(inc_ranks[:10]) + f', ... (+{len(inc_ranks)-10} more)'
            else:
                inc_display = ', '.join(inc_ranks)
            incidents_html = f"<br><b>Incidents (by index):</b> {inc_display}"
        else:
            incidents_html = ''

        popup_html = f"<b>STANOX {stanox}</b><br>Total delay: {delay_val:.1f} min{incidents_html}"
        folium.CircleMarker(
            location=(lat, lon),
            radius=6,
            color=color,
            fill=True,
            fill_opacity=0.8,
            popup=folium.Popup(popup_html, max_width=400)
        ).add_to(m)

    # Map unique incidents aggregated by SECTION_CODE: create one numbered marker per location with all incident ranks
    required_columns = {'INCIDENT_NUMBER', 'INCIDENT_START_DATETIME', 'SECTION_CODE'}
    if records is not None and required_columns <= set(records.columns):
        # Rows without an incident number cannot be attributed to an incident
        incident_records = records.dropna(subset=['INCIDENT_NUMBER_str'])
        incident_durations = _incident_durations(incident_records)
        incident_unique = incident_records.drop_duplicates(subset=['INCIDENT_NUMBER_str', 'SECTION_CODE'])

        # For each SECTION_CODE, collect all incidents that occurred there
        section_map = {}
        for _, row in incident_unique.iterrows():
            inc_id = str(row['INCIDENT_NUMBER_str'])
            rank = incident_rank.get(inc_id)
            section_map.setdefault(str(row['SECTION_CODE']), []).append({
                'inc_id': inc_id,
                'inc_num': row.get('INCIDENT_NUMBER'),
                'inc_time': row.get('INCIDENT_START_DATETIME'),
                'inc_reason': row.get('INCIDENT_REASON'),
                'rank': rank if rank is not None else ''
            })

        # For each section, sort entries by rank (if available) then plot a single numbered marker showing combined ranks
        for section_code, entries in section_map.items():
            # Entries without a rank sort after all ranked ones
            entries_sorted = sorted(entries, key=lambda e: (e['rank'] if isinstance(e['rank'], int) else 999999))
            ranks = [str(e['rank']) if e['rank'] != '' else e['inc_id'] for e in entries_sorted]
            ranks_display = ','.join(ranks)
            # popup lists each incident number, datetime, reason and duration
            popup_lines = []
            for e in entries_sorted:
                reason_text = e['inc_reason'] if e.get('inc_reason') else 'N/A'
                dur = incident_durations.get(e['inc_id'], 'N/A')
                popup_lines.append(f"Incident: {e['inc_num']} — {e['inc_time']} — Reason: {reason_text} — Duration: {dur}")
            popup_html = '<br>'.join(popup_lines)

            # find coords for this section_code; sections that match no station are not drawn
            lat_lon = _station_lat_lon(station_lookup, section_code)
            if lat_lon is None:
                continue
            lat = lat_lon[0] + 0.0005  # Small offset to avoid overlap with station markers
            lon = lat_lon[1] + 0.0005  # Small offset to avoid overlap with station markers
            # adjust DivIcon size based on text length
            size_px = max(28, min(80, 12 * len(ranks_display)))
            number_html = f"<div style='background:{incident_color};color:#fff;border-radius:50%;min-width:{size_px}px;height:{size_px}px;display:inline-flex;align-items:center;justify-content:center;font-weight:bold;border:2px solid #ffffff;padding:4px'>{ranks_display}</div>"
            folium.Marker(
                location=(lat, lon),
                icon=folium.DivIcon(html=number_html),
                popup=folium.Popup(popup_html, max_width=400)
            ).add_to(m)

    # --- Add legend/key for color grading matching incident_view_heatmap_html ---
    legend_html = '''
     <div style="position: fixed; bottom: 100px; left: 50px; width: 260px; height: 170px; z-index:9999; font-size:14px; background: white; border:2px solid grey; border-radius:8px; padding: 10px;">
     <b>Delay Intensity Key</b><br>
     <i class="fa fa-circle" style="color:blue"></i> 0 min (No delay)<br>
     <i class="fa fa-circle" style="color:#32CD32"></i> 1-5 min (Minor)<br>
     <i class="fa fa-circle" style="color:#FFD700"></i> 6-15 min (Moderate)<br>
     <i class="fa fa-circle" style="color:#FF8C00"></i> 16-30 min (Significant)<br>
     <i class="fa fa-circle" style="color:#FF0000"></i> 31-60 min (Major)<br>
     <i class="fa fa-circle" style="color:#8B0000"></i> 61-120 min (Severe)<br>
     <i class="fa fa-circle" style="color:#8A2BE2"></i> 120+ min (Critical)<br>
     <br><b>Incident markers:</b> numbered by chronological order (1 = earliest). Multiple ranks at same location shown as comma-separated list.<br>
     </div>
     '''
    m.get_root().html.add_child(folium.Element(legend_html))

    folium.LayerControl().add_to(m)
    print("Map created for service journey and incidents with color-graded station markers and aggregated numbered incident markers.")
    return m

In [24]:
# Uses result_1 / result_2 (from train_view) and service_stanox (from get_stanox_for_service),
# all produced by the cells above.
m = map_train_journey_with_incidents(
    all_data,
    service_stanox,
    incident_results=[result_1, result_2],
    service_code='12357820',
    date_str='13-JUN-2024'
)

# map_train_journey_with_incidents returns None when no station coordinates are found,
# so only write the HTML file when a map was actually built.
if m is not None:
    m.save("journey_map.html")

Map created for service journey and incidents with color-graded station markers and aggregated numbered incident markers.
