#### Table of Contents
1. [Imports](#imports)
2. [Data Loading](#data-loading)
3. [Data Cleaning](#data-cleaning)
4. [Data Transformation](#data-transformation)
5. [Adding Extra Datasets](#adding-extra-datasets)
6. [Create CSV files](#create-csv-files)

#### Imports <a name="imports"></a>

In [20]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from fuzzywuzzy import process



#### Data Loading <a name="data-loading"></a>

In [2]:
# Load data from path
folder_path1 = '/Users/merelkamper/Documents/MSc Data Science/Thesis/MSc-Thesis-main/adapted/data/train_rides'
files1 = os.listdir(folder_path1)
dfs1 = []

for file1 in files1:
    if file1.endswith('.csv'):
        # Extract the year from the filename
        year_service = file1.split('-')[1].split('.')[0] 
        file_path1 = os.path.join(folder_path1, file1)
        df1 = pd.read_csv(file_path1)
        # Add a new column for the year
        df1['Year'] = year_service
        dfs1.append(df1)

# Combine different files                
combined_df_service = pd.concat(dfs1, ignore_index=True)

In [4]:
combined_df_service

Unnamed: 0,Service:RDT-ID,Service:Date,Service:Type,Service:Company,Service:Train number,Service:Completely cancelled,Service:Partly cancelled,Service:Maximum delay,Stop:RDT-ID,Stop:Station code,Stop:Station name,Stop:Arrival time,Stop:Arrival delay,Stop:Arrival cancelled,Stop:Departure time,Stop:Departure delay,Stop:Departure cancelled,Year
0,738804,2019-01-01,Intercity,NS,1410,False,False,1,6220112,RTD,Rotterdam Centraal,,,,2019-01-01T02:00:00+01:00,1.0,False,2019
1,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220116,DT,Delft,2019-01-01T02:12:00+01:00,0.0,False,2019-01-01T02:12:00+01:00,0.0,False,2019
2,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220120,GV,Den Haag HS,2019-01-01T02:20:00+01:00,1.0,False,2019-01-01T02:21:00+01:00,1.0,False,2019
3,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220124,LEDN,Leiden Centraal,2019-01-01T02:35:00+01:00,0.0,False,2019-01-01T02:45:00+01:00,0.0,False,2019
4,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220128,SHL,Schiphol Airport,2019-01-01T03:00:00+01:00,0.0,False,2019-01-01T03:02:00+01:00,0.0,False,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108092094,5575436,2020-12-31,Extra trein,NS,29062,False,True,0,49365664,AH,Arnhem Centraal,2020-12-31T21:22:00+01:00,0.0,True,,,,2020
108092095,5575436,2020-12-31,Extra trein,NS,29062,False,True,0,49367183,UTVR,Utrecht Vaartsche Rijn,2020-12-31T21:52:46+01:00,23.0,False,2020-12-31T21:52:46+01:00,23.0,False,2020
108092096,5575436,2020-12-31,Extra trein,NS,29062,False,True,0,49366869,UT,Utrecht Centraal,2020-12-31T21:55:35+01:00,25.0,False,,,,2020
108092097,5575457,2020-12-31,Extra trein,NS,29063,False,False,7,49365850,DB,Driebergen-Zeist,,,,2020-12-31T21:10:00+01:00,7.0,False,2020


#### Data Cleaning <a name="data-cleaning"></a>

In [3]:
# Filter all NS data and skip the rest from the DataFrame
ns_data = combined_df_service[combined_df_service['Service:Company'].str.lower() == 'ns']

# Remove completely cancelled trajectories for NS data
cancelled_trajectories = ns_data.groupby('Service:RDT-ID')['Service:Completely cancelled'].all().sum()
cancelled_trajectory_indices = ns_data.groupby('Service:RDT-ID')['Service:Completely cancelled'].all()
cancelled_trajectory_indices = cancelled_trajectory_indices[cancelled_trajectory_indices].index
ns_data = ns_data[~ns_data['Service:RDT-ID'].isin(cancelled_trajectory_indices)]

# Filter out rare trajectories (< 50 occurrences)
first_stop = ns_data.groupby('Service:RDT-ID')['Stop:Station name'].first()
last_stop = ns_data.groupby('Service:RDT-ID')['Stop:Station name'].last()
trajectories_df = pd.DataFrame({'Trajectory': first_stop + ' - ' + last_stop})
trajectories_df['Count'] = trajectories_df.groupby('Trajectory')['Trajectory'].transform('count')
trajectories_df = trajectories_df.drop_duplicates()
rare_trajectories = trajectories_df[trajectories_df['Count'] < 50]['Trajectory']
rare_trajectory_rdt_ids = ns_data[ns_data['Service:RDT-ID'].isin(rare_trajectories.index)]['Service:RDT-ID'].unique()
ns_data_definite = ns_data[~ns_data['Service:RDT-ID'].isin(rare_trajectory_rdt_ids)]

In [5]:
ns_data_definite

Unnamed: 0,Service:RDT-ID,Service:Date,Service:Type,Service:Company,Service:Train number,Service:Completely cancelled,Service:Partly cancelled,Service:Maximum delay,Stop:RDT-ID,Stop:Station code,Stop:Station name,Stop:Arrival time,Stop:Arrival delay,Stop:Arrival cancelled,Stop:Departure time,Stop:Departure delay,Stop:Departure cancelled,Year
0,738804,2019-01-01,Intercity,NS,1410,False,False,1,6220112,RTD,Rotterdam Centraal,,,,2019-01-01T02:00:00+01:00,1.0,False,2019
1,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220116,DT,Delft,2019-01-01T02:12:00+01:00,0.0,False,2019-01-01T02:12:00+01:00,0.0,False,2019
2,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220120,GV,Den Haag HS,2019-01-01T02:20:00+01:00,1.0,False,2019-01-01T02:21:00+01:00,1.0,False,2019
3,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220124,LEDN,Leiden Centraal,2019-01-01T02:35:00+01:00,0.0,False,2019-01-01T02:45:00+01:00,0.0,False,2019
4,738804,2019-01-01,Intercity,NS,1410,False,False,0,6220128,SHL,Schiphol Airport,2019-01-01T03:00:00+01:00,0.0,False,2019-01-01T03:02:00+01:00,0.0,False,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108092094,5575436,2020-12-31,Extra trein,NS,29062,False,True,0,49365664,AH,Arnhem Centraal,2020-12-31T21:22:00+01:00,0.0,True,,,,2020
108092095,5575436,2020-12-31,Extra trein,NS,29062,False,True,0,49367183,UTVR,Utrecht Vaartsche Rijn,2020-12-31T21:52:46+01:00,23.0,False,2020-12-31T21:52:46+01:00,23.0,False,2020
108092096,5575436,2020-12-31,Extra trein,NS,29062,False,True,0,49366869,UT,Utrecht Centraal,2020-12-31T21:55:35+01:00,25.0,False,,,,2020
108092097,5575457,2020-12-31,Extra trein,NS,29063,False,False,7,49365850,DB,Driebergen-Zeist,,,,2020-12-31T21:10:00+01:00,7.0,False,2020


#### Data Tranformation <a name="data-transformation"></a>

First we will group the data by unique RDT-ID to aggregate information for each trajectory.

In [6]:
grouped = ns_data_definite.groupby('Service:RDT-ID')

rdt_ids = grouped['Service:RDT-ID'].unique()
trajectories = ns_data_definite.groupby('Service:RDT-ID')['Stop:Station name'].agg(['first', 'last']).agg(' - '.join, axis=1)
dates = grouped['Service:Date'].first()
days_of_week = pd.to_datetime(dates).dt.day_name()
max_delays = grouped['Service:Maximum delay'].first()
arrival_delays_last_stop = grouped['Stop:Arrival delay'].last()
planned_stops = grouped.size() - 2  # Subtract 2 for departure and arrival stops
cancelled_arrivals = grouped['Stop:Arrival cancelled'].sum()
cancelled_departures = grouped['Stop:Departure cancelled'].sum()
delayed_arrivals = (ns_data_definite['Stop:Arrival delay'] > 0).groupby(ns_data_definite['Service:RDT-ID']).sum()
delayed_departures = (ns_data_definite['Stop:Departure delay'] > 0).groupby(ns_data_definite['Service:RDT-ID']).sum()
partly_cancelled = grouped['Service:Partly cancelled'].any()

definite_df = pd.DataFrame({
    'RDT-ID': rdt_ids,
    'Trajectory': trajectories,
    'Date': dates,
    'Day of the Week': days_of_week,
    'Maximum Delay': max_delays,
    'Arrival Delay of Last Stop': arrival_delays_last_stop,
    'Nr. of Planned Stops': planned_stops,
    'Nr. of Cancelled Arrivals': cancelled_arrivals,
    'Nr. of Cancelled Departures': cancelled_departures,
    'Nr. of Delayed Arrivals': delayed_arrivals,
    'Nr. of Delayed Departures': delayed_departures,
    'Partly Cancelled': partly_cancelled
})

definite_df['Date'] = pd.to_datetime(definite_df['Date'])

In [10]:
definite_df

Unnamed: 0_level_0,RDT-ID,Trajectory,Date,Day of the Week,Maximum Delay,Arrival Delay of Last Stop,Nr. of Planned Stops,Nr. of Cancelled Arrivals,Nr. of Cancelled Departures,Nr. of Delayed Arrivals,Nr. of Delayed Departures,Partly Cancelled,YEAR,MONTH,DAY,DoW,ORIGIN,DESTINATION
Service:RDT-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
738804,[738804],Rotterdam Centraal - Utrecht Centraal,2019-01-01,Tuesday,1,0.0,5,0,0,2,2,False,2019,1,1,Tuesday,Rotterdam Centraal,Utrecht Centraal
738805,[738805],Utrecht Centraal - Rotterdam Centraal,2019-01-01,Tuesday,2,0.0,6,0,0,2,1,False,2019,1,1,Tuesday,Utrecht Centraal,Rotterdam Centraal
738806,[738806],Rotterdam Centraal - Utrecht Centraal,2019-01-01,Tuesday,2,0.0,5,0,0,2,2,False,2019,1,1,Tuesday,Rotterdam Centraal,Utrecht Centraal
738807,[738807],Utrecht Centraal - Rotterdam Centraal,2019-01-01,Tuesday,2,2.0,5,0,0,2,0,False,2019,1,1,Tuesday,Utrecht Centraal,Rotterdam Centraal
738808,[738808],Rotterdam Centraal - Utrecht Centraal,2019-01-01,Tuesday,1,0.0,5,0,0,0,2,False,2019,1,1,Tuesday,Rotterdam Centraal,Utrecht Centraal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12703954,[12703954],Tiel - Leiden Centraal,2023-12-31,Sunday,10,4.0,12,2,2,10,11,True,2023,12,31,Sunday,Tiel,Leiden Centraal
12704321,[12704321],Uitgeest - Rotterdam Centraal,2023-12-31,Sunday,4,0.0,21,11,11,6,6,True,2023,12,31,Sunday,Uitgeest,Rotterdam Centraal
12704322,[12704322],Rotterdam Centraal - Uitgeest,2023-12-31,Sunday,1,0.0,21,15,15,1,1,True,2023,12,31,Sunday,Rotterdam Centraal,Uitgeest
12704635,[12704635],Nijmegen - Den Helder,2023-12-31,Sunday,1,0.0,16,6,6,2,3,True,2023,12,31,Sunday,Nijmegen,Den Helder


In [11]:
definite_df['YEAR'] = definite_df['Date'].dt.year
definite_df['MONTH'] = definite_df['Date'].dt.month
definite_df['DAY'] = definite_df['Date'].dt.day
definite_df['DoW'] = definite_df['Date'].dt.day_name()
definite_df[['ORIGIN', 'DESTINATION']] = definite_df['Trajectory'].str.split(' - ', expand=True)
definite_df['ORIGIN'] = definite_df['ORIGIN'].replace({'Amersfoort': 'Amersfoort Centraal', 'Eindhoven': 'Eindhoven Centraal'})
definite_df['DESTINATION'] = definite_df['DESTINATION'].replace({'Amersfoort': 'Amersfoort Centraal', 'Eindhoven': 'Eindhoven Centraal'})

rides_performed = definite_df.dropna(subset=['Arrival Delay of Last Stop']).groupby(['YEAR', 'MONTH', 'DAY', 'DoW', 'ORIGIN', 'DESTINATION']).size().reset_index(name='RIDES PERFORMED')
delayed_arrivals = definite_df[definite_df['Arrival Delay of Last Stop'] > 0.0].groupby(['YEAR', 'MONTH', 'DAY', 'DoW', 'ORIGIN', 'DESTINATION']).size().reset_index(name='ARRIVALS DELAYED')
arrival_canceled = definite_df[definite_df['Arrival Delay of Last Stop'].isna()].groupby(['YEAR', 'MONTH', 'DAY', 'DoW', 'ORIGIN', 'DESTINATION']).size().reset_index(name='ARRIVAL CANCELED')
trajectories_per_day = rides_performed.merge(delayed_arrivals, on=['YEAR', 'MONTH', 'DAY', 'DoW', 'ORIGIN', 'DESTINATION'], how='left').merge(arrival_canceled, on=['YEAR', 'MONTH', 'DAY', 'DoW', 'ORIGIN', 'DESTINATION'], how='left')

trajectories_per_day['ARRIVALS DELAYED'].fillna(0, inplace=True)
trajectories_per_day['ARRIVAL CANCELED'].fillna(0, inplace=True)

In [9]:
trajectories_per_day

Unnamed: 0,YEAR,MONTH,DAY,DoW,ORIGIN,DESTINATION,RIDES PERFORMED,ARRIVALS DELAYED,ARRIVAL CANCELED
0,2019,1,1,Tuesday,'s-Hertogenbosch,Arnhem Centraal,1,0.0,0.0
1,2019,1,1,Tuesday,'s-Hertogenbosch,Den Haag Centraal,33,4.0,0.0
2,2019,1,1,Tuesday,'s-Hertogenbosch,Deurne,17,4.0,0.0
3,2019,1,1,Tuesday,'s-Hertogenbosch,Dordrecht,11,0.0,0.0
4,2019,1,1,Tuesday,'s-Hertogenbosch,Eindhoven Centraal,18,2.0,0.0
...,...,...,...,...,...,...,...,...,...
507076,2023,12,31,Sunday,Zwolle,Nijmegen,1,0.0,0.0
507077,2023,12,31,Sunday,Zwolle,Roosendaal,22,6.0,0.0
507078,2023,12,31,Sunday,Zwolle,Tilburg,1,1.0,0.0
507079,2023,12,31,Sunday,Zwolle,Utrecht Centraal,26,4.0,0.0


#### Adding Extra Datasets <a name="adding-extra-datasets"></a>

##### Station distances 

First we will create a **distance overview** from every station to every station based on the matrix from: https://www.rijdendetreinen.nl/en/open-data/station-distances.

In [24]:
# STEP 1: Get distances for codes
matrix_df = pd.read_csv('tariff-distances-2022-01.csv', index_col=0, delimiter=';')

origin_codes = []
destination_codes = []
distances = []

for origin_code in matrix_df.index:
    for destination_code in matrix_df.columns:
        distance = matrix_df.loc[origin_code, destination_code]
        if distance != 'XXX' and origin_code != destination_code:
            origin_codes.append(origin_code)
            destination_codes.append(destination_code)
            distances.append(distance)

new_df = pd.DataFrame({
    'origin_code': origin_codes,
    'destination_code': destination_codes,
    'distance': distances
})

# STEP 2: Let's change the codes to names
station_df = pd.read_csv('stations-2023-09.csv')
code_to_name = dict(zip(station_df['code'], station_df['name_long']))

newer_df = new_df.copy()  # Create a copy of new_df to avoid modifying the original DataFrame
newer_df['origin_name'] = new_df['origin_code'].map(code_to_name)
newer_df['destination_name'] = new_df['destination_code'].map(code_to_name)

newer_df.drop(['origin_code', 'destination_code'], axis=1, inplace=True)
newer_df = newer_df[['origin_name', 'destination_name', 'distance']]

# STEP 3: Additional Modifications
distances = newer_df.copy() 
distances['origin_name'] = distances['origin_name'].replace({'Amersfoort': 'Amersfoort Centraal', 'Eindhoven': 'Eindhoven Centraal'})
distances['destination_name'] = distances['destination_name'].replace({'Amersfoort': 'Amersfoort Centraal', 'Eindhoven': 'Eindhoven Centraal'})

In [25]:
distances

Unnamed: 0,origin_name,destination_name,distance
0,Abcoude,Arnhem Centraal,82
1,Abcoude,Arnhem Velperpoort,83
2,Abcoude,Arnhem Presikhaaf,85
3,Abcoude,Arnhem Zuid,90
4,Abcoude,Arkel,71
...,...,...,...
158797,Zaandijk Zaanse Schans,Zoetermeer Oost,81
158798,Zaandijk Zaanse Schans,Zevenaar,123
158799,Zaandijk Zaanse Schans,Zevenbergen,134
158800,Zaandijk Zaanse Schans,Zandvoort aan Zee,32


##### Station population dataset

Furthermore, we will create a **population overview** of every station based on a list from: https://www.cbs.nl/nl-nl/visualisaties/dashboard-bevolking/regionaal/inwoners.

In [22]:
# STEP 1: Get unique values from the 'ORIGIN' and 'DESTINATION' columns separately
unique_origin = trajectories_per_day['ORIGIN'].unique()
unique_destination = trajectories_per_day['DESTINATION'].unique()
unique_stations = np.unique(np.concatenate([unique_origin, unique_destination]))
unique_stations_normalized = [station.lower().replace("'", "") for station in unique_stations]

population_regions = pd.read_csv('population-nl-2023.csv', index_col=0, delimiter=';')
population_regions['Normalized Region'] = population_regions.index.str.lower().str.replace("'", "")
population_regions.index = population_regions.index.str.lower().str.replace("'", "")

population = []

for station in unique_stations_normalized:
    match = process.extractOne(station, population_regions.index)
    
    if match[1] > 70:
        population_value = population_regions.loc[match[0], 'Inwonersaantal']
        population.append(population_value)
        #print(f"Station: {station}, Population: {population_value}")
    else:
        population.append(None)
        #print(f"No match found for station: {station}")

population_stations_nl = pd.DataFrame({'Station': unique_stations, 'Population': population})

In [23]:
population_stations_nl

Unnamed: 0,Station,Population
0,'s-Hertogenbosch,160740.0
1,'s-Hertogenbosch Oost,160740.0
2,'t Harde,
3,Aachen Hbf,
4,Abcoude,
...,...,...
338,Zoetermeer Oost,128424.0
339,Zuidhorn,
340,Zutphen,48746.0
341,Zwijndrecht,44870.0


#### Create CSV files <a name="create-csv-files"></a>

In [28]:
path_trajectories = '/Users/merelkamper/Documents/MSc Data Science/Thesis/MSc-Thesis-main/Files from Data Processing/data_per_trajectory.csv'
path_days = '/Users/merelkamper/Documents/MSc Data Science/Thesis/MSc-Thesis-main/Files from Data Processing/data_per_day.csv'
path_distances = '/Users/merelkamper/Documents/MSc Data Science/Thesis/MSc-Thesis-main/Files from Data Processing/station_distances.csv'
path_population = '/Users/merelkamper/Documents/MSc Data Science/Thesis/MSc-Thesis-main/Files from Data Processing/station_population.csv'

definite_df.to_csv(path_trajectories)
trajectories_per_day.to_csv(path_days)
distances.to_csv(path_distances)
population_stations_nl.to_csv(path_population)