In [50]:
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path

def get_eva_to_name_dict():
    """Load the EVA-number -> station-name lookup from ``eva_name_list.txt``.

    The file is opened relative to the current working directory and is read
    as one comma-separated record per line. The first field becomes the key
    and the second field becomes the value; any further fields are ignored.
    When a key occurs more than once, the last occurrence wins.

    Blank lines - including the empty string that a trailing newline at the
    end of the file produces - are skipped rather than raising ``IndexError``.

    Returns:
        dict[str, str]: first field -> second field for every non-blank line.

    Raises:
        FileNotFoundError: if ``eva_name_list.txt`` does not exist.
        IndexError: if a non-blank line contains no comma.
    """
    eva_to_name = {}
    with Path("eva_name_list.txt").open("r") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                # Nothing to map on an empty / whitespace-only line.
                continue
            fields = line.split(",")
            eva_to_name[fields[0]] = fields[1]
    return eva_to_name

def get_plan_xml_rows(xml_path, eva_to_name):
    """Parse one "plan" XML file into a list of row dicts (one per ``<s>`` element).

    Args:
        xml_path: ``pathlib.Path`` of the XML file. The part of the file name
            before the first underscore is used as the key into ``eva_to_name``.
        eva_to_name: mapping from that key to the station name.

    Returns:
        list[dict]: one dict per ``<s>`` child of the XML root with the keys
        ``id``, ``station``, ``train_name``, ``destination_station``,
        ``train_number``, ``arrival_planned_time``, ``departure_planned_time``,
        ``planned_platform``, ``train_line_id`` and ``train_line_station_num``.
        Time values are returned as the raw attribute strings. ``train_number``
        is ``None`` when the stop carries no train number (previously this
        raised ``TypeError`` from ``int(None)``).

    Raises:
        KeyError: if the file-name prefix is not a key of ``eva_to_name``.
        ValueError: if a present train number, or the last ``-``-separated
            segment of the stop id, is not an integer.
        AttributeError: if an ``<s>`` element has no ``id`` attribute.
    """
    def attr(element, name):
        # Attribute lookup that tolerates a missing child element.
        return element.get(name) if element is not None else None

    eva = xml_path.name.split("_")[0]
    station = eva_to_name[eva]

    root = ET.parse(xml_path).getroot()
    rows = []
    for s in root.findall('s'):
        s_id = s.get('id')

        # Look every child up once. The helper compares against None because
        # an Element without children of its own is falsy.
        tl = s.find('tl')
        ar = s.find('ar')
        dp = s.find('dp')

        train_type = attr(tl, 'c')
        train_number = attr(tl, 'n')
        ar_train_line_number = attr(ar, 'l')
        dp_train_line_number = attr(dp, 'l')

        if train_type in ['IC', 'ICE', 'EC']:
            train_name = f"{train_type} {train_number}"
        else:
            # Prefer the arrival line number, fall back to the departure one.
            if ar_train_line_number is not None:
                line_number = ar_train_line_number
            else:
                line_number = dp_train_line_number

            if line_number is None:
                train_name = train_type
            elif train_type is None:
                # Avoid leaking the literal text "None" into the name.
                train_name = line_number
            else:
                train_name = f"{train_type} {line_number}"

        planned_platform = attr(ar, 'pp') or attr(dp, 'pp')  # `or` picks the first non-empty value

        dp_ppth = attr(dp, 'ppth')  # departure planned path, "|"-separated
        if dp_ppth is None:
            # No departure path: the current station is used as destination.
            destination_station = station
        else:
            destination_station = dp_ppth.split("|")[-1]

        s_id_split = s_id.split('-')

        rows.append({
            'id': s_id,
            'station': station,
            'train_name': train_name,
            'destination_station': destination_station,
            'train_number': int(train_number) if train_number is not None else None,
            'arrival_planned_time': attr(ar, 'pt'),
            'departure_planned_time': attr(dp, 'pt'),
            'planned_platform': planned_platform,
            # Everything before the last "-" identifies the run; the last
            # segment is parsed as an integer.
            'train_line_id': '-'.join(s_id_split[:-1]),
            'train_line_station_num': int(s_id_split[-1]),
        })
    return rows

def get_plan_db():
    """Collect all "plan" XML files below ``data/`` into one DataFrame.

    Every sub-directory of ``data/`` (relative to the current working
    directory) is scanned, and each contained file whose name contains
    ``"plan"`` is parsed with ``get_plan_xml_rows``. The two planned-time
    columns are converted with the format ``%y%m%d%H%M``; values that do not
    parse become ``NaT``. Exact duplicate rows are dropped.

    Returns:
        pandas.DataFrame: one row per distinct planned stop. When no rows were
        found, an empty frame that still has the expected columns is returned
        instead of raising ``KeyError``.
    """
    eva_to_name = get_eva_to_name_dict()
    rows = []
    # Path.iterdir() yields entries in arbitrary order; sorting makes the
    # resulting row order reproducible between runs and machines.
    for date_folder_path in sorted(Path("data").iterdir()):
        if not date_folder_path.is_dir():
            # Stray files directly under data/ would make iterdir() raise.
            continue
        for xml_path in sorted(date_folder_path.iterdir()):
            if "plan" in xml_path.name:
                rows.extend(get_plan_xml_rows(xml_path, eva_to_name))

    if rows:
        out_df = pd.DataFrame(rows)
    else:
        # Keep the schema stable so the column accesses below still work.
        out_df = pd.DataFrame(columns=[
            'id', 'station', 'train_name', 'destination_station', 'train_number',
            'arrival_planned_time', 'departure_planned_time', 'planned_platform',
            'train_line_id', 'train_line_station_num',
        ])

    for time_column in ('arrival_planned_time', 'departure_planned_time'):
        out_df[time_column] = pd.to_datetime(out_df[time_column], format='%y%m%d%H%M', errors='coerce')
    out_df = out_df.drop_duplicates()
    return out_df

def get_fchg_xml_rows(xml_path, id_to_data):
    """Fold the stop changes found in one "fchg" XML file into ``id_to_data``.

    For every ``<s>`` child of the XML root, the ``ct`` (changed time),
    ``clt`` (cancellation time) and ``cp`` (changed platform) attributes of
    its ``<ar>`` and ``<dp>`` children are read. Stops that carry none of
    these are ignored. Otherwise the entry stored under the stop id is
    replaced, so a later call overwrites what an earlier call stored.

    Args:
        xml_path: path of the XML file to parse.
        id_to_data: dict mapping stop id -> change record; mutated in place.

    Returns:
        None.
    """
    def attr(element, name):
        # Attribute of a child element, or None when the child is absent.
        return None if element is None else element.get(name)

    for stop in ET.parse(xml_path).getroot().findall('s'):
        arrival = stop.find('ar')
        departure = stop.find('dp')

        # A cancellation time on either side marks the whole stop as canceled;
        # changed times are then discarded.
        stop_canceled = (
            attr(arrival, 'clt') is not None
            or attr(departure, 'clt') is not None
        )
        if stop_canceled:
            arrival_change = None
            departure_change = None
        else:
            arrival_change = attr(arrival, 'ct')
            departure_change = attr(departure, 'ct')

        # Arrival platform change wins over the departure one.
        changed_platform = attr(arrival, 'cp') or attr(departure, 'cp')

        carries_change = (
            stop_canceled
            or arrival_change is not None
            or departure_change is not None
            or changed_platform is not None
        )
        if not carries_change:
            continue

        stop_id = stop.get('id')
        # Replace whatever an earlier file stored for this stop.
        id_to_data[stop_id] = dict(
            id=stop_id,
            arrival_change_time=arrival_change,
            departure_change_time=departure_change,
            stop_canceled=stop_canceled,
            changed_platform=changed_platform,
        )

def get_fchg_db():
    """Collect the latest change record per stop from all "fchg" XML files.

    Every sub-directory of ``data/`` (relative to the current working
    directory) is scanned, and each contained file whose name contains
    ``"fchg"`` is folded into one dict via ``get_fchg_xml_rows``, where a
    later file overwrites the record an earlier file stored for the same stop.
    The two change-time columns are converted with the format ``%y%m%d%H%M``;
    values that do not parse become ``NaT``.

    Returns:
        pandas.DataFrame: one row per stop id with the columns ``id``,
        ``arrival_change_time``, ``departure_change_time``, ``stop_canceled``
        and ``changed_platform``. When no change was found, an empty frame
        with these columns is returned instead of raising ``KeyError``.
    """
    id_to_data = {}
    # "Newer overwrites older" only holds if the folders are visited in a
    # defined order as well: Path.iterdir() yields entries in arbitrary order,
    # so the date folders are sorted just like the files inside them.
    for date_folder_path in sorted(Path("data").iterdir()):
        if not date_folder_path.is_dir():
            # Stray files directly under data/ would make iterdir() raise.
            continue
        for xml_path in sorted(date_folder_path.iterdir()):  # get the oldest data first
            if "fchg" in xml_path.name:
                get_fchg_xml_rows(xml_path, id_to_data)

    records = list(id_to_data.values())
    if records:
        out_df = pd.DataFrame(records)
    else:
        # Keep the schema stable so the column accesses below still work.
        out_df = pd.DataFrame(columns=[
            'id', 'arrival_change_time', 'departure_change_time',
            'stop_canceled', 'changed_platform',
        ])

    for time_column in ('arrival_change_time', 'departure_change_time'):
        out_df[time_column] = pd.to_datetime(out_df[time_column], format='%y%m%d%H%M', errors='coerce')
    out_df = out_df.drop_duplicates()
    return out_df

# Build both tables from the files under data/ and print their row counts
# (planned stops first, then change records), separated by a space.
plan_df = get_plan_db()
fchg_df = get_fchg_db()
print(*(len(frame) for frame in (plan_df, fchg_df)))

25670 38525


In [35]:
#eva_to_list

In [53]:
# Attach the change record (if any) to every planned stop; stops without a
# change record keep missing values in the change columns.
df = plan_df.merge(fchg_df, how='left', on='id')

# A "changed" time equal to the planned time is no change at all: blank it.
df.loc[df["arrival_change_time"].eq(df["arrival_planned_time"]), "arrival_change_time"] = None
df.loc[df["departure_change_time"].eq(df["departure_planned_time"]), "departure_change_time"] = None

# Stops without a change record were never canceled.
df.loc[df["stop_canceled"].isna(), "stop_canceled"] = False

# The id was only needed as join key; export everything else.
df = df.drop(columns="id")
df.to_csv("data.csv", index=False)

In [54]:
#df[df["train_name"] == "RE 1"]

In [23]:
#df[df["train_name"] == "RE 1"]["station_name"].unique()#.head(40)

array(['Aachen Hbf', 'Bochum Hbf', 'Bremen Hbf', 'Dortmund Hbf',
       'Düsseldorf Flughafen', 'Düsseldorf Hbf', 'Duisburg Hbf',
       'Essen Hbf', 'Göttingen', 'Trier Hbf', 'Hamm (Westf) Hbf',
       'Hannover Hbf', 'Ingolstadt Hbf', 'Kaiserslautern Hbf',
       'Koblenz Hbf', 'Köln Hbf', 'Ludwigshafen (Rhein) Hbf',
       'Mannheim Hbf', 'München Hbf', 'Neustadt (Weinstr) Hbf',
       'Nürnberg Hbf', 'Oldenburg (Oldb) Hbf', 'Saarbrücken Hbf',
       'Hamburg Hbf', 'Köln Messe/Deutz', 'Erfurt Hbf',
       'Frankfurt (Oder)', 'Magdeburg Hbf', 'Berlin Ostbahnhof',
       'Rostock Hbf', 'Berlin Wannsee', 'Berlin Zoologischer Garten',
       'Berlin Hbf', 'Berlin Friedrichstraße', 'Potsdam Hbf'],
      dtype=object)

In [38]:

# Earlier attempts at locating a single train run, kept for reference:
#print(["station_name"] for i in range(16)])
#df["7448894008162231514" in df["id"]]
#is_value_present = df['id'].isin(["7448894008162231514"])#.any()
#df[is_value_present]
# Show every row whose id, cast to str, contains the given id fragment.
# NOTE(review): the merge cell drops the 'id' column from df before exporting,
# so this lookup needs a df that still has 'id' - confirm which frame was
# loaded in the kernel when this ran.
df[df['id'].astype(str).str.contains("3845145637786814987")]

Unnamed: 0,id,station,train_name,destination_station,train_number,arrival_planned_time,departure_planned_time,planned_platform,arrival_change_time,departure_change_time,stop_canceled,changed_platform
9,-3845145637786814987-2405091051-1,Aachen Hbf,RE 1,Hamm(Westf)Hbf,26817,NaT,2024-05-09 10:51:00,2,NaT,2024-05-09 10:54:00,False,
1405,-3845145637786814987-2405091051-20,Bochum Hbf,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 13:02:00,2024-05-09 13:03:00,5,2024-05-09 13:10:00,2024-05-09 13:11:00,False,
3020,-3845145637786814987-2405091051-21,Dortmund Hbf,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 13:14:00,2024-05-09 13:16:00,8,2024-05-09 13:25:00,2024-05-09 13:35:00,False,
3412,-3845145637786814987-2405091051-15,Düsseldorf Flughafen,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 12:27:00,2024-05-09 12:28:00,1,2024-05-09 12:31:00,2024-05-09 12:34:00,False,
3628,-3845145637786814987-2405091051-14,Düsseldorf Hbf,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 12:20:00,2024-05-09 12:22:00,17,2024-05-09 12:24:00,2024-05-09 12:26:00,False,
4143,-3845145637786814987-2405091051-16,Duisburg Hbf,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 12:36:00,2024-05-09 12:38:00,13,2024-05-09 12:42:00,2024-05-09 12:44:00,False,
5260,-3845145637786814987-2405091051-18,Essen Hbf,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 12:50:00,2024-05-09 12:52:00,6,2024-05-09 12:56:00,2024-05-09 13:00:00,False,
8286,-3845145637786814987-2405091051-26,Hamm (Westf) Hbf,RE 1,Hamm (Westf) Hbf,26817,2024-05-09 13:40:00,NaT,6,2024-05-09 13:58:00,NaT,False,1.0
11734,-3845145637786814987-2405091051-9,Köln Hbf,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 11:44:00,2024-05-09 11:49:00,4 A-C,2024-05-09 11:46:00,2024-05-09 11:50:00,False,
20325,-3845145637786814987-2405091051-10,Köln Messe/Deutz,RE 1,Hamm(Westf)Hbf,26817,2024-05-09 11:52:00,2024-05-09 11:53:00,2,2024-05-09 11:53:00,2024-05-09 11:55:00,False,


In [33]:
# NOTE(review): get_plan_xml_rows stores train_number via int(...), so comparing
# it with the string "10320" would presumably match nothing - confirm before
# re-enabling the line below.
#df[df['train_number'] == "10320"]
# Show all rows whose train_name is exactly "RB 33".
df[df['train_name'] == "RB 33"]

Unnamed: 0,id,station_name,train_name,train_number,arrival_planned_time,departure_planned_time,arrival_change_time,departure_change_time,stop_canceled
1,7448894008162231514-2405091037-1,Aachen Hbf,RB 33,10320,NaT,2024-05-09 10:37:00,NaT,NaT,False
6,612445710103187398-2405090816-31,Aachen Hbf,RB 33,10319,2024-05-09 10:27:00,NaT,NaT,NaT,False
27,-6797166876779044113-2405091135-1,Aachen Hbf,RB 33,10322,NaT,2024-05-09 11:35:00,NaT,NaT,False
28,-3022599497773762713-2405090916-31,Aachen Hbf,RB 33,10321,2024-05-09 11:27:00,NaT,NaT,NaT,False
43,-4191241432018282090-2405091237-1,Aachen Hbf,RB 33,10324,NaT,2024-05-09 12:37:00,NaT,NaT,False
...,...,...,...,...,...,...,...,...,...
25645,2880506925895775780-2405091841-1,Potsdam Hbf,RB 33,62148,NaT,2024-05-09 18:41:00,NaT,NaT,False
25649,30604161985673120-2405091941-1,Potsdam Hbf,RB 33,62150,NaT,2024-05-09 19:41:00,NaT,NaT,False
25650,2017104854702856007-2405091832-10,Potsdam Hbf,RB 33,62151,2024-05-09 19:18:00,NaT,2024-05-09 19:21:00,NaT,False
25667,3024886621630039813-2405092041-1,Potsdam Hbf,RB 33,62152,NaT,2024-05-09 20:41:00,NaT,NaT,False


In [16]:
# Keep only the rows whose stop_canceled value is True.
df[df["stop_canceled"]]

774

In [13]:
# Number of rows in df.
len(df)

18691