## Train Time tables retrieval

---

In [7]:
import pandas as pd
import networkx as nx

from tqdm import tqdm
import pickle

In [2]:
# Load the data
df = pd.read_csv('../data/ist-daten-sbb.csv', sep=';')

# Select and rename columns
cols = {
    "Journey identifier": "journey_id",
    "Arrival time": "arrival",
    "Departure time": "departure",
    "Stop name": "station",
    "OPUIC": "opuic",
    "Geopos": "pos"
}

df = df[cols.keys()].rename(columns=cols)

# Convert time columns to datetime
df['arrival'] = pd.to_datetime(df['arrival'])
df['departure'] = pd.to_datetime(df['departure'])

# Show the result
df.head()

Unnamed: 0,journey_id,arrival,departure,station,opuic,pos
0,85:11:1094:001,2023-12-02 00:06:00,2023-12-02 00:07:00,Thun,8507100,"46.75485273059273, 7.6296058286694795"
1,85:11:1096:001,2023-12-02 00:37:00,NaT,Basel SBB,8500010,"47.5474120550501, 7.589562790156525"
2,85:11:1251:001,NaT,2023-12-01 06:06:00,Basel SBB,8500010,"47.5474120550501, 7.589562790156525"
3,85:11:1258:001,NaT,2023-12-01 20:08:00,Chur,8509000,"46.853084162764006, 9.52893773304132"
4,85:11:1411:001,NaT,2023-12-01 07:10:00,Bern,8507000,"46.948832290498416, 7.439130889923935"


In [3]:
edges = []

for journey_id in tqdm(df.journey_id.unique()):
    trip = df[df.journey_id == journey_id].sort_values('departure', inplace=False)
    trip_name = f"{trip.iloc[0].station} -> {trip.iloc[-1].station}"
    
    for i in range(len(trip) - 1):
        edges.append((
            trip.iloc[i].station, 
            trip.iloc[i + 1].station,
            {
                'departure': trip.iloc[i].departure,
                'arrival': trip.iloc[i + 1].arrival,
                'duration': trip.iloc[i + 1].arrival - trip.iloc[i].departure,
                'journey_id': journey_id,
                'trip_name': trip_name,
                'type': 'train'
            })
        )

100%|██████████| 5726/5726 [01:01<00:00, 93.20it/s] 


In [4]:
# Construct the graph from edges
G = nx.MultiDiGraph(edges)

# Add attributes to the nodes
for node in tqdm(G.nodes):
    # Get the station attributes from the dataframe
    station = df[df.station == node].iloc[0]
    station_attributes = {
        'pos': station.pos,
        'opuic': station.opuic
    }

    # Add the attributes to the node
    G.nodes[node].update(station_attributes)

100%|██████████| 603/603 [00:02<00:00, 241.22it/s]


In [5]:
# Show a sample of the edges
list(G.edges(data=True))[0:2]

[('Interlaken Ost',
  'Interlaken West',
  {'departure': Timestamp('2023-12-01 23:33:00'),
   'arrival': Timestamp('2023-12-01 23:36:00'),
   'duration': Timedelta('0 days 00:03:00'),
   'journey_id': '85:11:1094:001',
   'trip_name': 'Interlaken Ost -> Bern',
   'type': 'train'}),
 ('Interlaken Ost',
  'Interlaken West',
  {'departure': Timestamp('2023-12-01 07:00:00'),
   'arrival': Timestamp('2023-12-01 07:04:00'),
   'duration': Timedelta('0 days 00:04:00'),
   'journey_id': '85:11:809:001',
   'trip_name': 'Interlaken Ost -> Romanshorn',
   'type': 'train'})]

In [6]:
# Show a sample of the nodes
list(G.nodes(data=True))[0:2]

[('Interlaken Ost',
  {'pos': '46.690499996187924, 7.869000004346448', 'opuic': 8507492}),
 ('Interlaken West',
  {'pos': '46.682627980356514, 7.851453137595281', 'opuic': 8507493})]

In [8]:
with open("../data/graph.pickle", 'wb') as file:
    pickle.dump(G, file)