In [14]:
import geopandas as gpd
import pandas as pd
import urllib.request, json

In [15]:
def api_request(stations, subscription_key):
    '''This function makes an API request from the NS server and returns
     the trajectories of the Dutch railway map.

     Args:
        stations (str): Two stations of which you want the trajectory in the form 'Station1,Station2'
        subscription_key (str): An subscribtion key 

    Returns:
        gdf (GeoDataFrame): GeoDataFrame with lines between stations
    '''
    
    url = f"https://gateway.apiportal.ns.nl/Spoorkaart-API/api/v1/traject.geojson?stations={stations}"
    headers = {
        'Cache-Control': 'no-cache',
        'Ocp-Apim-Subscription-Key': subscription_key,
    }

    # Make the API request
    try:
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        response_content = response.read().decode('utf-8')

        # Load the response content into JSON
        geojson_data = json.loads(response_content)

        # Convert the GeoJSON data to a GeoDataFrame
        gdf = gpd.GeoDataFrame.from_features(geojson_data['features'])

        return gdf
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

In [16]:
def travel_data(operators, seats_per_vehicle_type):
    '''Taking a list of operators and a dictionary with the amount of seats per vehicle type.
    Then by way of this list of operators importing their train schedules and using the dictionary
    to assign the amount of seats each service in the schedule has.
    
    Args:
        operators (List): List with operators you want to have in the final DataFrame
        seats_per_vehicle_type (Dict): Dictionary with all train types with corresponding seat amounts per coach
    
    Returns:
        travel_data (DataFrame): Dataframe with number of seats per train per trajectory
    '''
    # Selecting which operators you want in the final DataFrame
    # First, the csv-document corresponding to the operator is made into a DataFrame
    # Then, this DataFrame is concatenated to the other DataFrames

    # Each DataFrame consist of multiple interesting columns:
    # 1. 'OperatingDay' = Which day the train has driven
    # 2. 'JourneyNumber' = This number corresponds to the route of the train (see Trainservices.csv)
    # 3. 'UserStopCodeBegin' = This is the begin station of the trajectory
    # 4. 'UserStopCodeEnd' = This is the end station of the trajectory
    # 5. 'VehicleType' = Which train type is used on this trajectory
    # 6. 'TotalNumberOfCoaches' = Number of coaches of this train on this trajectory
    travel_data = pd.DataFrame()
    for i in operators:
        if i == 'Keolis':
            keolis = pd.read_csv('Keolis.csv', delimiter= ';')
            travel_data = pd.concat([travel_data, keolis], ignore_index=True)
        elif i == 'Arriva':
            arriva = pd.read_csv('Arriva.csv')
            # For train type WINK, the cells are empty, so these are filled with 1's
            arriva['TotalNumberOfCoaches'] = arriva['TotalNumberOfCoaches'].fillna(1)
            travel_data = pd.concat([travel_data, arriva], ignore_index=True)
        elif i == 'Qbuzz':
            qbuzz = pd.read_csv('Qbuzz.csv')
            travel_data = pd.concat([travel_data, qbuzz], ignore_index=True)
        elif i == 'NS':
            ns = pd.read_csv('OC_NS_20241007.csv')
            travel_data = pd.concat([travel_data, ns], ignore_index=True)
        else:
            return print("Operator not known, known operators: [Keolis, Arriva, NS, Qbuzz]")
    
    # Make an extra column where the seats will be stored
    travel_data['Seats'] = 0

    # Calculate the amount of seats by multiplying the number of coaches with the amount of seats per coach
    # (source: Wikipedia) corresponding to the train type in the column 'VehicleType'
    for q, i in enumerate(travel_data['VehicleType']):
        travel_data.loc[q, 'Seats'] = travel_data.loc[q, 'TotalNumberOfCoaches'] * seats_per_vehicle_type[i]
    
    # Some DataFrames consist of 10 days, as we want to see only one week, some days will get removed from the
    # DataFrame. This code will show trains in week 41 (October 7th to 13th)
    travel_data['OperatingDay'] = pd.to_datetime(travel_data['OperatingDay'])
    dates_to_exclude = pd.to_datetime(['2024-10-14', '2024-10-15', '2024-10-16'])
    df_filtered = travel_data[~travel_data['OperatingDay'].isin(dates_to_exclude)]
    travel_data = df_filtered.reset_index(drop=True)

    # During construction works, some train services get alternative numbers, adding 20 or 70 in front of the
    # original number (7903 -> 207903). As these are the same trajectories as the original route, the 20/70 
    # will be removed
    for i in range(len(travel_data)):
        if travel_data.loc[i, 'JourneyNumber'] > 700000:
            travel_data.loc[i, 'JourneyNumber'] -= 700000
        if 200000 < travel_data.loc[i, 'JourneyNumber'] < 700000:
            travel_data.loc[i, 'JourneyNumber'] -= 200000

      
    return travel_data

In [17]:
def interpolate_color(lower_limit, upper_limit, lower_color, upper_color, number):
    '''Assign a color to a number based on where in the range of numbers it lies
    
    Args:
        lower_limit (int): Minimum value of the range
        upper_limit (int): Maximum value of the range
        lower_color (tuple): RGB representation of the color for the minimum value
        upper_color (tuple): RGB representation of the color for the maximum value
        number (int): This is the number you want to interpolate

    Output:
        color (list): RGB representation of the color corresponding to the number
    
    '''
    # Make sure the number is between the lower and upper limit
    number = max(min(number, upper_limit), lower_limit)
    
    # Calculate the interpolation factor
    factor = (number - lower_limit) / (upper_limit - lower_limit)
    
    # Interpolate each RGB component
    interpolated_color = tuple(
        int(lower_component + (upper_component - lower_component) * factor)
        for lower_component, upper_component in zip(lower_color, upper_color)
    )
    
    # Normalize the color for usage
    color = [interpolated_color / 255.0 for interpolated_color in interpolated_color] 

    return color

In [18]:
def seat_sorter(train_services_data, train_travel_data):
    ''' Make a dictionary with the seats per train service

    Args:
        train_services_data (DataFrame): DataFrame with all train services in The Netherlands
        train_travel_data (DataFrame): DataFrame with information about every journey including number of seats

    Output:
        dataframes_dict (dictionary): Dictionary with DataFrames per train service (in both directions)
                                      containing the amount of seats on that train service    
    '''
    # Loading the CSV containing all train services with their stations in The Netherlands
    train_services = pd.read_csv(train_services_data, delimiter = ';')
    
    # Creating an empty dictionary where the DataFrames will be stored
    dataframes_dict = {}

    # Creating the DataFrames for every train service in The Netherlands. This is done by splitting the
    # strings with station abbreviations, and making every row in the DataFrame a trajectory between two 
    # stations. The seats column will remain empty.
    for i in range(len(train_services)):
        stations_per_service = train_services.loc[i,'String'].split(',')
        df = pd.DataFrame({'From':[],'To':[],'Seats':[]})
        for j in range(len(stations_per_service) - 1):
            new_row = pd.DataFrame({'From': [stations_per_service[j]], 'To': [stations_per_service[j + 1]], 'Seats':[None]})
            df = pd.concat([df, new_row], ignore_index=True)
        dataframes_dict[train_services.loc[i,'Code']] = df

        # This code makes DataFrames for the return journey, these DataFrames get a 1 at the end (900 -> 901)
        stations_per_service = train_services.loc[i,'String'].split(',')
        stations_per_service = stations_per_service[::-1]
        df = pd.DataFrame({'From':[],'To':[],'Seats':[]})
        for j in range(len(stations_per_service) - 1):
            new_row = pd.DataFrame({'From': [stations_per_service[j]], 'To': [stations_per_service[j + 1]], 'Seats':[None]})
            df = pd.concat([df, new_row], ignore_index=True)
        dataframes_dict[train_services.loc[i,'Code'] + 1] = df

    # Filling the seats column by checking the journey number in the train_travel_data. This journey number will be devided by 100
    # and rounded down, to be multiplied by 100 again (946 -> 900). This number must correspond with a train service. If this number 
    # is even, it has to fill the even DataFrame of that train service (e.g. 900), if the number is odd, the odd DataFrame has to be
    # filled (e.g. 901). If the row is still empty, the seats value (from train_travel_data) from a row will be stored in the
    # corresponding DataFrame. If the row already has a value, the new amount of seats will be added on top of the old value.
    for i in range(len(train_travel_data)):
        # Checking if the number is odd, so the odd DataFrames will be filled
        if train_travel_data.loc[i, 'JourneyNumber'] % 2 != 0:
            # Obtaining the first 2/3 digits corresponding to the train service
            service_begin = train_travel_data.loc[i, 'JourneyNumber']//100
            start_value = 0
            # Looping through the dictionary corresponding to the odd DataFrame corresponding to the train service
            for j in range(len(dataframes_dict[service_begin*100+ 1])):
                # Start with adding seats to the DataFrame if the begin station from the train_travel_data corresponds to the
                # from-station in the newly made dictionaries.
                if train_travel_data.loc[i, 'UserStopCodeBegin'].upper() == dataframes_dict[service_begin*100+ 1].loc[j, 'From'].upper():
                    start_value = 1
                
                # If this value is true, seats get added to every row in that dictionary
                if start_value == 1:
                    if dataframes_dict[service_begin*100 + 1].loc[j, 'Seats'] == None:
                        dataframes_dict[service_begin*100+ 1].loc[j, 'Seats'] = train_travel_data.loc[i, 'Seats']
                    else:
                        dataframes_dict[service_begin*100+ 1].loc[j, 'Seats'] += train_travel_data.loc[i, 'Seats']
                
                # If the end station has been reached (UserStopCodeEnd in the train_travel_data DataFrame) we stop adding the seats
                # to the dictionaries
                if train_travel_data.loc[i, 'UserStopCodeEnd'].upper() == dataframes_dict[service_begin*100+ 1].loc[j, 'To'].upper():
                    break

        else:
            # Checking if the number is even, so the even DataFrames will be filled
            service_begin = train_travel_data.loc[i, 'JourneyNumber']//100
            start_value = 0
            for j in range(len(dataframes_dict[service_begin*100])):
                if train_travel_data.loc[i, 'UserStopCodeBegin'].upper() == dataframes_dict[service_begin*100].loc[j, 'From'].upper():
                    start_value = 1
                
                if start_value == 1:
                    if dataframes_dict[service_begin*100].loc[j, 'Seats'] == None:
                        dataframes_dict[service_begin*100].loc[j, 'Seats'] = train_travel_data.loc[i, 'Seats']
                    else:
                        dataframes_dict[service_begin*100].loc[j, 'Seats'] += train_travel_data.loc[i, 'Seats']
                
                if train_travel_data.loc[i, 'UserStopCodeEnd'].upper() == dataframes_dict[service_begin*100].loc[j, 'To'].upper():
                    break

    return dataframes_dict

In [19]:
def seats_per_trajectory(seats_dictionary):
    ''' Multiple services use the same tracks, this function adds seats on the same trajectory from different services.

    Args:
        seats_dictionary (dict): Dictionary with DataFrames per train service (in both directions)
                                 containint the amount of seats on that train service
    
    Output:
        seats_per_trajectory (DataFrame): DataFrame with all trajectories and the amount of seats
    '''
    
    # Creating a new DataFrame for storing the data of the seats
    seats_per_trajectory = pd.DataFrame({'From':[],'To':[],'Seats':[]})

    # Looping through the DataFrames in the dictionary. For each row in each DataFrame in the dictionary a check will 
    # be done if the combination of stations (begin/end) already exists in the newly made DataFrame. If not, this combination
    # will be added. When the row already exists, the number of seats will be added to the corresponding trajectory
    for i in seats_dictionary.keys():
        train_service_df = seats_dictionary[i]
        row_value = 0
        existence_checker = 0
        for j in range(len(train_service_df)):
            # If there are no seats in the row that is being checked, this iteration will be skipped
            if train_service_df.loc[j,'Seats'] == None:
                continue

            # Otherwise, a loop will be started through all DataFrames in the dictionary
            for k in range(len(seats_per_trajectory)):
                # If the tractory is already existing, the existence_checker will be set to 1 and the
                # row_value is equal to the iteration in this loop
                if train_service_df.loc[j,'From'].upper() == seats_per_trajectory.loc[k, 'From'].upper() \
                   and train_service_df.loc[j,'To'].upper() == seats_per_trajectory.loc[k, 'To'].upper():
                    existence_checker = 1
                    row_value = k
                    break
                # Else, if the trajectory is driven in the other direction, the same statements will be True
                elif train_service_df.loc[j,'To'].upper() == seats_per_trajectory.loc[k, 'From'].upper() \
                   and train_service_df.loc[j,'From'].upper() == seats_per_trajectory.loc[k, 'To'].upper():
                    existence_checker = 1
                    row_value = k
                    break
            
            # If the row already exists, the number of seats from the row_value will be added to the already found seats
            # and the existence_checker will be reset to zero
            if existence_checker == 1:
                seats_per_trajectory.loc[row_value, 'Seats'] += train_service_df.loc[j,'Seats']
                existence_checker = 0
            # When the row does not exist yet, a new row with trajectory has to be made including its seats
            else:
                new_row = pd.DataFrame({'From': [train_service_df.loc[j,'From'].upper()], 
                                        'To': [train_service_df.loc[j,'To'].upper()], 'Seats':[train_service_df.loc[j,'Seats']]})
                seats_per_trajectory = pd.concat([seats_per_trajectory, new_row], ignore_index=True)
    return seats_per_trajectory

In [20]:
def geometry_maker(trajectory_list):
    ''' This function retrieves geometry data from the NS API for each trajectory.
        Please note: To run this function, you need an subscription key from the NS API.
                     For more information, see: https://www.ns.nl/reisinformatie/ns-api

    Args:
        trajectory_list (DataFrame): A DataFrame with at least the columns "From" and "To" from which the geometry will be retrieved

    Output:
        trajectory_and_geometry (GeoDataFrame): A DataFrame with the geometries per trajactory added to the existing input DataFrame
    '''    
    # Input for the subscription key before starting the loop
    subscription_key = input('Please enter your subscription key: ')

    geometry_data = []

    for i in range(len(trajectory_list)):
        while True:
            try:
                # Make the API request and retrieve the geodata
                geodata = api_request(f'{trajectory_list["From"][i]},{trajectory_list["To"][i]}', subscription_key)
                
                # If the API request is successful, add geometry data and break out of the loop
                if geodata is not None:
                    geometry_data.append(geodata['geometry'].iloc[0])
                    print(f'{i+1}/{len(trajectory_list)}', end="\r")
                    break
                else:
                    # If the API fails, prompt for a new subscription key
                    subscription_key = input('Invalid subscription key. Please enter a new subscription key: ')
            # If another error occurs, ask again for the subscription key
            except Exception as e:
                print(f"An error occurred: {e}")
                subscription_key = input('Error occurred. Please enter a new subscription key: ')

    trajectory_list['geometry'] = geometry_data
    trajectory_and_geometry = gpd.GeoDataFrame(trajectory_list, geometry='geometry')
    return trajectory_and_geometry

In [21]:
# All input data for the map types that will be made
# Map type 1: Per day/Complete week of all operators per train type
# Map type 2: Per operator/All operators of the complete week per train type

# Train series seperates by train type: intercity/sprinter
intercities = [1000, 1100, 11400, 11600, 11700, 12600, 1400, 1500, 600, 1700, 17900, 1800, 
               200, 2000, 2100, 21400, 21500, 2200, 22200, 22400, 2300, 23400, 240, 2400, 24400, 
               2600, 2800, 2900, 3000, 3100, 3200, 3500, 3600, 32790, 3700, 37300, 37900, 3900, 
               4500, 500, 600, 700, 800, 900, 9200]
sprinters = [13300, 13800, 14300, 14900, 15400, 16400, 17800, 18900, 20100, 20200, 25400, 30400, 
             30700, 30800, 30900, 31000, 31100, 31200, 31300, 31400, 32000, 32200, 32300, 32400, 
             32500, 32700, 3300, 36900, 37000, 37100, 37200, 37400, 37500, 37600, 37700, 37800, 
             3800, 38000, 4000, 4300, 4400, 4600, 4800, 4900, 5000, 5100, 5200, 5400, 5500, 5600, 
             5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6600, 6700, 6800, 6900, 7000, 7100, 
             7200, 7300, 7400, 7500, 7600, 7900, 8000, 8100, 8500, 8600, 8700, 8800, 8900, 9000]
train_options = ['intercities', 'sprinters', 'all']

# A dictionary for every train type with the corresponding number of seats
seats_per_vehicle_type = {"VIRM": 100, "DDZ": 100, "FLIRT FFF": 53, "ICM": 75,
                      "ICNG25": 52, "SLT": 54, "SNG": 50, "SW7-25KV": 48,
                      "SW9-25KV": 48, "GTW": 45, "Flirt": 57, "FLIRT": 57,
                      "Lint": 65, "WINK": 153}

# Dates used in this model (week 41)
dates = [['2024-10-07', '2024-10-08', '2024-10-09', '2024-10-10', '2024-10-11', '2024-10-12', '2024-10-13']]

# Since we made also odd train services in the function seat_sorter, we also need the odd 
# values in the above made lists
intercities_plus_one = [x + 1 for x in intercities] 
intercities = intercities + intercities_plus_one       
sprinters_plus_one = [x + 1 for x in sprinters] 
sprinters = sprinters + sprinters_plus_one  

# Different operators found in The Netherlands
operators = ['Keolis', 'Arriva', 'NS', 'Qbuzz']

In [22]:
# First, the new DataFrames will be made with the extra column containing the seats for every operator
for operator in operators:
    operator_csv = travel_data([operator], seats_per_vehicle_type)
    operator_csv.to_csv(f"TravelData{operator}.csv")

# and also for all operators combined
operators_csv = travel_data(operators, seats_per_vehicle_type)
operators_csv.to_csv(f"TravelData.csv")

In [24]:
# Making of map type 1 (24 Plot-data files)
#   - All operators
#   - 7 of every seperate day and 1 of the complete week
#   - Both train types, only sprinters, only intercities
# All maps get the same color scale

dates = [['2024-10-07', '2024-10-08', '2024-10-09', '2024-10-10', '2024-10-11', '2024-10-12', '2024-10-13'],['2024-10-07'], ['2024-10-08'], ['2024-10-09'], ['2024-10-10'], ['2024-10-11'], ['2024-10-12'], ['2024-10-13']]

for i in range(len(dates)):
    print(f'{i+1}/{len(dates)}', end="\r")
    travel_data = pd.read_csv('TravelData.csv')
    dates_to_include = dates[i]
    df_filtered = travel_data[travel_data['OperatingDay'].isin(dates_to_include)]
    travel_data = df_filtered.reset_index(drop=True)
    SortedSeats = seat_sorter('TrainServices.csv',travel_data)
    for q, j in enumerate(train_options):
        print(f'{q+1}/{len(train_options)}', end="\r")
        if j == 'intercities':
            SortedSeats1 = {k: v for k, v in SortedSeats.items() if k in intercities}
        if j == 'sprinters':
            SortedSeats1 = {k: v for k, v in SortedSeats.items() if k in sprinters}
        elif j == 'all':
            SortedSeats1 = SortedSeats
        SeatsPerTrajectory = seats_per_trajectory(SortedSeats1)
        df = geometry_maker(SeatsPerTrajectory)
        #volgorde zo aanpassen dat de hogere waardes altijd later worden geplot en dus over de lagere waardes heen plotten
        df = df.sort_values(by='Seats', ascending=True).reset_index()
        # kleuren assignen en toevoegen aan de geopanda
        df['color'] = '0'
        for h in range(len(df)):
            df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))
        if len(dates[i]) > 2:
            df.to_csv(f'PlotDataWeek{train_options[q]}.csv')
        else:
            df.to_csv(f'PlotData{dates[i][0]}{train_options[q]}.csv')   

HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized
HTTPError: 401 - Unauthorized


KeyboardInterrupt: Interrupted by user

In [33]:
# Making of map type 2 (15 Plot-data files)
# - Only complete week
# - 4 of every seperate operators, 1 for all operators
# - Both train types, only sprinters, only intercities
# All maps get another color scale

for i in range(len(operators)):
    print(f'{i+1}/{len(operators)}', end="\r")
    travel_data = pd.read_csv(f'TravelData{operators[i]}.csv')
    datestoinclude = ['2024-10-07', '2024-10-08', '2024-10-09', '2024-10-10', '2024-10-11', '2024-10-12', '2024-10-13']
    df_filtered = travel_data[travel_data['OperatingDay'].isin(datestoinclude)]
    travel_data = df_filtered.reset_index(drop=True)
    SortedSeats = seat_sorter('TrainServices.csv',travel_data)
    for q, j in enumerate(train_options):
        print(f'{q+1}/{len(train_options)}', end="\r")
        if j == 'intercities':
            SortedSeats1 = {k: v for k, v in SortedSeats.items() if k in intercities}
        if j == 'sprinters':
            SortedSeats1 = {k: v for k, v in SortedSeats.items() if k in sprinters}
        elif j == 'all':
            SortedSeats1 = SortedSeats
        SeatsPerTrajectory = seats_per_trajectory(SortedSeats1)
        df = geometry_maker(SeatsPerTrajectory)
        #volgorde zo aanpassen dat de hogere waardes altijd later worden geplot en dus over de lagere waardes heen plotten
        df = df.sort_values(by='Seats', ascending=True).reset_index()
        # kleuren assignen en toevoegen aan de geopanda
        df['color'] = '0'
        for h in range(len(df)):
            df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))
        df.to_csv(f'PlotDataWeek{operators[i]}{train_options[q]}.csv')

9/10

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


19/19

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


2/419

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


2/315

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


143/143

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


143/143

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


256/256

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


3/3/240

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


4/4/282

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


7/11

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))


11/11

  df.loc[h, 'color'] = str(interpolate_color(df['Seats'].min(), df['Seats'].max(), (255,255,0), (255,0,0), df.loc[h, 'Seats']))
