In [1]:
import pandas as pd
import datetime 
# Load the match list that every later cell works on.
dfMatches = pd.read_csv("matchInfos.csv")
# Convert the *column* in place of the raw strings. The previous
# `dfMatches.loc["date"] = ...` addressed a row label, which appended a bogus
# row named "date" and left the column itself unparsed.
dfMatches["date"] = pd.to_datetime(dfMatches["date"])


In [46]:
from rapidfuzz import process

def fuzzy_match_station(city_name, stop_names, limit=5, threshold=80):
    """
    Fuzzy-match ``city_name`` against ``stop_names`` and return the hits.

    The query is forwarded to ``process.extract`` with ``limit`` as the maximum
    number of results and ``threshold`` as ``score_cutoff``. Only the first
    element of every result entry is returned, in the order ``extract``
    produced them.
    """
    candidates = process.extract(city_name, stop_names, limit=limit, score_cutoff=threshold)

    matched = []
    for hit in candidates:
        # hit[0] is the matched entry; the remaining fields are discarded
        matched.append(hit[0])
    return matched


In [137]:
from geopy.geocoders import Nominatim
import time
import numpy as np

# --- Hilfsfunktion: Haversine-Formel ---
def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance in kilometres between two points given in degrees.

    Built purely from numpy ufuncs and arithmetic operators. Uses an Earth
    radius of 6371 km.
    """
    earth_radius_km = 6371

    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    delta_lat = np.radians(lat2 - lat1)
    delta_lon = np.radians(lon2 - lon1)

    # Haversine term: squared half-chord length between the two points
    lat_term = np.sin(delta_lat/2.0)**2
    lon_term = np.cos(lat1_rad)*np.cos(lat2_rad)*np.sin(delta_lon/2.0)**2
    half_chord_sq = lat_term + lon_term

    central_angle = 2 * np.arcsin(np.sqrt(half_chord_sq))
    return earth_radius_km * central_angle

# --- Nächstgelegenen Hauptbahnhof finden ---
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time

def find_nearest_station(city, stops, geolocator, retries=3, delay=1):
    """
    Find the stop that lies closest to a city.

    The city name is geocoded with ``geolocator.geocode`` and the great-circle
    distance (``haversine``) to every row of ``stops`` is computed. All rows of
    ``stops`` are considered; no filtering by station type happens here.

    Parameters
    ----------
    city : str
        Name passed to the geocoder.
    stops : pandas.DataFrame
        Must provide the columns ``stop_lat``, ``stop_lon`` and ``stop_name``.
        The frame is NOT modified (earlier versions added a ``distance``
        column to the caller's frame as a side effect).
    geolocator : object
        Anything exposing ``geocode(query, timeout=...)`` whose result has
        ``latitude`` and ``longitude`` attributes.
    retries : int
        Number of geocoding attempts when the request times out.
    delay : float
        Seconds to wait between two attempts.

    Returns
    -------
    The ``stop_name`` of the nearest stop, or ``None`` when the city could not
    be geocoded, every attempt timed out, or ``stops`` has no usable
    coordinates.
    """
    for attempt in range(retries):
        try:
            city_loc = geolocator.geocode(city, timeout=10)  # generous timeout for the remote service
        except GeocoderTimedOut:
            print(f"Timeout bei {city}, Versuch {attempt+1}/{retries}")
            # No point in waiting after the final attempt
            if attempt < retries - 1:
                time.sleep(delay)
            continue

        if not city_loc:
            print(f"⚠️ Keine Geolocation gefunden für {city}")
            return None

        # haversine is built from numpy ufuncs, so it accepts whole columns:
        # one vectorised call replaces the former row-wise apply.
        distances = haversine(
            city_loc.latitude, city_loc.longitude, stops['stop_lat'], stops['stop_lon']
        ).dropna()
        if distances.empty:
            # Empty stops table or no row with coordinates: idxmin would raise
            print(f"⚠️ Keine Haltestellen mit Koordinaten vorhanden für {city}")
            return None

        return stops.loc[distances.idxmin(), 'stop_name']

    print(f"⚠️ Kein Bahnhof gefunden für {city} nach {retries} Versuchen")
    return None



In [138]:
def _route_display_name(routes, route_id):
    """Return a readable route label: long name, else short name, else the id as text."""
    route_info = routes[routes['route_id'] == route_id]
    for column in ('route_long_name', 'route_short_name'):
        if column in routes.columns:
            names = route_info[column].dropna()
            if not names.empty:
                return names.iloc[0]
    return str(route_id)


def get_train_connections_for_row(row, gtfs_folder, window_hours=3, trip_type='departure', buffer_hours=2):
    """
    Collect the direct train connections for one match row.

    The GTFS tables ``stops``, ``trips``, ``stop_times``, ``routes`` and
    ``calendar`` are read from ``gtfs_folder``. The two cities of the match are
    mapped to their nearest stops via ``find_nearest_station``; every trip that
    is active on the match's weekday and serves the start stop before the end
    stop is kept if it falls into the time window.

    Parameters
    ----------
    row : mapping / pandas.Series
        Needs ``home_station``, ``away_station`` and ``date``; ``match_id`` is
        copied to the output when present.
    gtfs_folder : str
        Directory containing the GTFS ``*.txt`` files.
    window_hours : float
        Length of the search window in hours.
    trip_type : {'departure', 'arrival'}
        'departure': home -> away, filtered on the departure at the start stop
        within [kickoff + buffer, kickoff + buffer + window].
        'arrival': away -> home, filtered on the arrival at the end stop
        within [kickoff - buffer - window, kickoff + buffer].
        Compared case-insensitively.
        NOTE(review): the 'arrival' window ends *after* kickoff; this is kept
        from the original implementation. Confirm it is intended.
    buffer_hours : float
        Offset around kickoff in hours (previously hard-coded to 2).

    Returns
    -------
    pandas.DataFrame
        One row per stop between start and end (inclusive) of each matching
        trip; an empty frame when no station or no connection was found.

    Raises
    ------
    ValueError
        If ``trip_type`` is neither 'departure' nor 'arrival'.

    Notes
    -----
    Only the weekday flags of ``calendar.txt`` are evaluated; its
    ``start_date``/``end_date`` range and ``calendar_dates.txt`` are not.
    """
    # Normalise once so every comparison below agrees (the station selection
    # used to be case-sensitive while the window logic was not).
    trip_type = str(trip_type).lower()
    if trip_type not in ('departure', 'arrival'):
        raise ValueError(f"trip_type must be 'departure' or 'arrival', got {trip_type!r}")

    stops = pd.read_csv(f"{gtfs_folder}/stops.txt")
    stops["stop_name"] = stops["stop_name"].str.replace(r"\bHbf\b", "Hauptbahnhof", case=False, regex=True)
    trips = pd.read_csv(f"{gtfs_folder}/trips.txt")
    stop_times = pd.read_csv(f"{gtfs_folder}/stop_times.txt")
    routes = pd.read_csv(f"{gtfs_folder}/routes.txt")
    calendar = pd.read_csv(f"{gtfs_folder}/calendar.txt")

    # Start / destination depending on travel direction
    if trip_type == 'departure':
        city_start, city_end = row["home_station"], row["away_station"]
    else:
        city_start, city_end = row["away_station"], row["home_station"]

    # Date and time of the match. Everything is handled with pandas
    # Timestamp/Timedelta, so no names from the datetime module are needed.
    dt_obj = pd.to_datetime(row["date"])
    date_str = dt_obj.strftime("%Y-%m-%d")
    # Work on naive wall-clock time so it can be compared with GTFS times
    kickoff = dt_obj.tz_localize(None) if dt_obj.tzinfo is not None else dt_obj
    service_day = kickoff.normalize()

    # Nearest stops for both cities
    geolocator = Nominatim(user_agent="bundesliga-bahn-mapper")
    station_start = find_nearest_station(city_start, stops, geolocator)
    station_end = find_nearest_station(city_end, stops, geolocator)
    if station_start is None or station_end is None:
        print(f"⚠️ Kein Bahnhof gefunden für {city_start} oder {city_end}")
        return pd.DataFrame()  # no station found

    start_ids = set(stops.loc[stops['stop_name'] == station_start, 'stop_id'])
    end_ids = set(stops.loc[stops['stop_name'] == station_end, 'stop_id'])

    # Trips whose service runs on this weekday. day_name() is always English,
    # unlike strftime("%A"), which follows the process locale and would not
    # match the calendar.txt column names on e.g. a German system.
    weekday = dt_obj.day_name().lower()
    active_service_ids = calendar.loc[calendar[weekday] == 1, 'service_id']
    active_trip_ids = trips.loc[trips['service_id'].isin(active_service_ids), 'trip_id']
    stop_times_active = stop_times[stop_times['trip_id'].isin(active_trip_ids)]

    # Trips that serve both the start and the end stop
    serves_start = stop_times_active.loc[stop_times_active['stop_id'].isin(start_ids), 'trip_id']
    serves_end = stop_times_active.loc[stop_times_active['stop_id'].isin(end_ids), 'trip_id']
    common_trips = set(serves_start) & set(serves_end)
    # Narrow down once instead of scanning all stop_times again for every trip
    candidate_times = stop_times_active[stop_times_active['trip_id'].isin(common_trips)]

    # Time window
    buffer = pd.Timedelta(hours=buffer_hours)
    span = pd.Timedelta(hours=window_hours)
    if trip_type == 'arrival':
        window_start = kickoff - (buffer + span)
        window_end = kickoff + buffer
    else:  # departure
        window_start = kickoff + buffer
        window_end = kickoff + (buffer + span)

    route_by_trip = trips.drop_duplicates('trip_id').set_index('trip_id')['route_id']
    stops_by_id = stops.drop_duplicates('stop_id').set_index('stop_id')

    rows = []
    for trip_id, trip_times in candidate_times.groupby('trip_id'):
        # Positional index in stop_sequence order, so "before/after" is decided
        # by the order of travel and not by the row labels of the CSV.
        trip_times = trip_times.sort_values('stop_sequence').reset_index(drop=True)
        is_start = trip_times['stop_id'].isin(start_ids).to_numpy()
        is_end = trip_times['stop_id'].isin(end_ids).to_numpy()

        start_pos = int(np.flatnonzero(is_start)[0])
        later_ends = np.flatnonzero(is_end[start_pos + 1:])
        if later_ends.size == 0:
            continue  # trip runs in the opposite direction
        end_pos = start_pos + 1 + int(later_ends[0])
        stops_between = trip_times.iloc[start_pos:end_pos + 1]

        # GTFS times are offsets from the start of the service day and may be
        # 24:00:00 or later; adding them as timedeltas handles that, whereas
        # strptime("%H:%M:%S") would raise. Missing times become NaT and fail
        # the window check below.
        start_dt = service_day + pd.to_timedelta(stops_between.iloc[0]['departure_time'])
        end_dt = service_day + pd.to_timedelta(stops_between.iloc[-1]['arrival_time'])

        reference = start_dt if trip_type == 'departure' else end_dt
        if not (window_start <= reference <= window_end):
            continue

        route_name = _route_display_name(routes, route_by_trip[trip_id])

        # Collect the stops of this connection
        for _, stop_row in stops_between.iterrows():
            stop_info = stops_by_id.loc[stop_row['stop_id']]
            rows.append({
                'match_id': row.get('match_id', None),
                'trip_id': trip_id,
                'route': route_name,
                'stop_sequence': stop_row['stop_sequence'],
                'stop_name': stop_info['stop_name'],
                'stop_lon': stop_info['stop_lon'],
                'stop_lat': stop_info['stop_lat'],
                'arrival': stop_row.get('arrival_time', ''),
                'departure': stop_row.get('departure_time', ''),
                'date': date_str
            })

    return pd.DataFrame(rows)




In [150]:
# Test subset: keep only the rows of matchday "14. Spieltag" in league "bl1"
df_try = dfMatches.loc[
    dfMatches["matchday"].eq("14. Spieltag") & dfMatches["league"].eq("bl1")
]

#df = get_train_connections_with_stops("gtfs", df_test, window_hours=3, trip_type = 'departure')
#print(df.head(20))

In [151]:
# --- Apply to every match ---
# df_try: DataFrame holding the matches to process.
# An explicit loop is used instead of DataFrame.apply: apply tries to combine
# the per-row results itself, which is fragile when each result is a DataFrame
# and breaks outright (no .tolist()) when df_try is empty.
dfs = [
    get_train_connections_for_row(match_row, "gtfs", window_hours=3, trip_type='departure')
    for _, match_row in df_try.iterrows()
]
# pd.concat raises on an empty list, so fall back to an empty frame
df_all = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

In [152]:
# Preview only: the full frame can hold every stop of every connection
df_all.head(20)

Unnamed: 0,match_id,trip_id,route,stop_sequence,stop_name,stop_lon,stop_lat,arrival,departure,date
0,77375,1068568,ICE 41,8,Frankfurt(Main)Hauptbahnhof,8.662828,50.10668,18:48:00,18:54:00,2025-12-13
1,77375,1068568,ICE 41,9,Aschaffenburg Hauptbahnhof,9.143697,49.980556,19:22:00,19:23:00,2025-12-13
2,77375,1068568,ICE 41,10,Würzburg Hauptbahnhof,9.93578,49.801796,20:01:00,20:03:00,2025-12-13
3,77375,1068568,ICE 41,11,Nürnberg Hauptbahnhof,11.082989,49.445618,21:00:00,21:16:00,2025-12-13
4,77375,1068568,ICE 41,12,Augsburg Hauptbahnhof,10.88557,48.36544,22:31:00,22:33:00,2025-12-13
5,77375,1507258,ICE 11,8,Frankfurt(Main)Hauptbahnhof,8.662828,50.10668,17:44:00,17:50:00,2025-12-13
6,77375,1507258,ICE 11,9,Mannheim Hauptbahnhof,8.468921,49.479355,18:27:00,18:30:00,2025-12-13
7,77375,1507258,ICE 11,10,Stuttgart Hauptbahnhof,9.181635,48.784084,19:08:00,19:15:00,2025-12-13
8,77375,1507258,ICE 11,11,Ulm Hauptbahnhof,9.982608,48.3994,19:58:00,20:01:00,2025-12-13
9,77375,1507258,ICE 11,12,Augsburg Hauptbahnhof,10.88557,48.36544,20:41:00,20:43:00,2025-12-13


In [134]:
import plotly.express as px

def plot_train_routes_plotly(df):
    """
    Draw the train connections on a map with Plotly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'match_id', 'trip_id', 'stop_lat',
        'stop_lon', 'stop_name', 'arrival', 'departure' and 'stop_sequence'.
        The frame is not modified.

    Returns
    -------
    The Plotly figure, or ``None`` (after printing a notice) when ``df`` is
    empty.
    """
    if df.empty:
        print("Keine Daten zum Plotten")
        return None

    # Order the stops along each trip so the line is drawn in travel order
    ordered = df.sort_values(['trip_id', 'stop_sequence'])

    # Hover text for every stop
    popup = (
        ordered['stop_name'].astype(str)
        + "<br>" + ordered['arrival'].astype(str)
        + " → " + ordered['departure'].astype(str)
    )

    # One line per train, coloured by match
    fig = px.line_map(
        ordered,
        lat="stop_lat",
        lon="stop_lon",
        color="match_id",
        line_group="trip_id",  # one line per train
        hover_name="stop_name",
        hover_data={"arrival": True, "departure": True, "stop_lat": False, "stop_lon": False},
        zoom=6,
        height=600,
        # line_map draws on the MapLibre "map" subplot; its style is set here
        # because layout.mapbox_style does not apply to that subplot.
        map_style="open-street-map"
    )

    # Marker for every stop. Must be a Scattermap trace so it shares the
    # "map" subplot with the lines; a Scattermapbox trace would be placed on a
    # separate Mapbox subplot.
    fig.add_scattermap(
        lat=ordered['stop_lat'],
        lon=ordered['stop_lon'],
        mode='markers+text',
        marker=dict(size=8, color='blue'),
        text=ordered['stop_name'],
        textposition="top right",
        hovertext=popup,
        hoverinfo="text"
    )

    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

    return fig


In [136]:
fig = plot_train_routes_plotly(df_all)
# The plot function returns None when there is nothing to draw
if fig is not None:
    fig.show()
