In [1]:
import requests
import xlrd
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import osmnx as ox
import networkx as nx
import os
from math import radians, cos, sin, sqrt, atan2
from dotenv import load_dotenv
from shapely import wkt
from shapely.geometry import Point
from haversine import haversine


# Display every row and column when a frame is rendered (no truncation)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# NOTE(review): presumably tells GDAL/OGR not to accept unclosed polygon rings - confirm
os.environ['OGR_GEOMETRY_ACCEPT_UNCLOSED_RING'] = 'NO'

# Retrieving api key
# (read from ../key.env; api_key is None when API_KEY is not defined there or in the environment)
load_dotenv("../key.env")
api_key = os.getenv("API_KEY")

In [2]:
# Execute the helper notebooks in this kernel.
# NOTE(review): get_bus_info is not defined in this notebook; presumably it comes from
# get_bus_info_function.ipynb - confirm
%run get_bus_info_function.ipynb
%run get_geospatial_function.ipynb
# Fetch bus services, routes and stops from the LTA DataMall endpoints using the API key loaded above
bus_services_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusServices", api_key)
bus_routes_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusRoutes", api_key)
bus_stops_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusStops", api_key)
# Station lookup table (code, name, line) and the station polygon layer
geospatial_train_path = "../datasets/geospatial_layer/TrainStation_Jul2024/RapidTransitSystemStation.shp"
train_stations = pd.read_excel("../datasets/Train_Stations.xls")
geospatial_train_gdf = gpd.read_file(geospatial_train_path)

## Reading in data

In [8]:
# Column names, dtypes and non-null counts of the station lookup table
train_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Station_Code  211 non-null    object
 1   MRT_Station   211 non-null    object
 2   MRT_Line      211 non-null    object
dtypes: object(3)
memory usage: 5.1+ KB


In [9]:
# Preview the first rows of the station lookup table
train_stations.head()

Unnamed: 0,Station_Code,MRT_Station,MRT_Line
0,NS1,Jurong East,North-South Line
1,NS2,Bukit Batok,North-South Line
2,NS3,Bukit Gombak,North-South Line
3,NS4,Choa Chu Kang,North-South Line
4,NS5,Yew Tee,North-South Line


In [10]:
# Column names, dtypes and non-null counts of the bus route rows (one row per service/direction/stop)
bus_routes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25544 entries, 0 to 25543
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ServiceNo     25544 non-null  object 
 1   Operator      25544 non-null  object 
 2   Direction     25544 non-null  int64  
 3   StopSequence  25544 non-null  int64  
 4   BusStopCode   25544 non-null  object 
 5   Distance      25544 non-null  float64
 6   WD_FirstBus   25544 non-null  object 
 7   WD_LastBus    25544 non-null  object 
 8   SAT_FirstBus  25544 non-null  object 
 9   SAT_LastBus   25544 non-null  object 
 10  SUN_FirstBus  25544 non-null  object 
 11  SUN_LastBus   25544 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


In [11]:
# Number of distinct bus stops served by service 100 (both directions combined)
bus_routes_df.loc[bus_routes_df['ServiceNo'] == '100', 'BusStopCode'].unique().shape

(109,)

In [12]:
# Column names, dtypes and non-null counts of the station polygon layer
geospatial_train_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   TYP_CD      230 non-null    int64   
 1   STN_NAM     0 non-null      object  
 2   ATTACHEMEN  51 non-null     object  
 3   TYP_CD_DES  230 non-null    object  
 4   STN_NAM_DE  230 non-null    object  
 5   geometry    230 non-null    geometry
dtypes: geometry(1), int64(1), object(4)
memory usage: 10.9+ KB


In [13]:
# Preview the first rows of the station polygon layer
geospatial_train_gdf.head()

Unnamed: 0,TYP_CD,STN_NAM,ATTACHEMEN,TYP_CD_DES,STN_NAM_DE,geometry
0,0,,,MRT,GALI BATU DEPOT,"POLYGON ((19210.615 41858.041, 19223.517 41756..."
1,0,,,MRT,HILLVIEW MRT STATION,"POLYGON ((20650.333 38282.331, 20654.770 38298..."
2,0,,,MRT,BEAUTY WORLD MRT STATION,"POLYGON ((21594.717 35882.935, 21584.857 35880..."
3,0,,,MRT,HUME MRT STATION,"POLYGON ((20807.997 37457.716, 20815.376 37460..."
4,0,,,MRT,BUKIT PANJANG MRT STATION,"POLYGON ((19996.270 40187.205, 20028.770 40127..."


In [14]:
# Column names, dtypes and non-null counts of the bus stop table (code, road, description, lat/long)
bus_stops_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5138 entries, 0 to 5137
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   BusStopCode  5138 non-null   object 
 1   RoadName     5138 non-null   object 
 2   Description  5138 non-null   object 
 3   Latitude     5138 non-null   float64
 4   Longitude    5138 non-null   float64
dtypes: float64(2), object(3)
memory usage: 200.8+ KB


## Data Pre-Processing

### Bus Stop 

In [3]:
# Attach the service-level attributes to every route row (matched on service number and direction),
# then discard the duplicated operator column that the merge suffixes with "_y"
columns_to_drop = ["Operator_y"]
bus_routes_services = (
    bus_routes_df
    .merge(bus_services_df, on=["ServiceNo", "Direction"], how="left")
    .drop(columns=columns_to_drop)
)

In [4]:
# Merge the route/service rows with bus_stops_df to attach each stop's road name, description,
# latitude and longitude (left join: stops missing from bus_stops_df keep NaN coordinates)
merged_bus_routes = pd.merge(bus_routes_services, bus_stops_df, on='BusStopCode', how='left')
# Preview only. NOTE(review): nothing in this cell verifies that Latitude/Longitude are non-null;
# check merged_bus_routes[['Latitude', 'Longitude']].isna().sum() if that guarantee is needed
print(merged_bus_routes.head())

  ServiceNo Operator_x  Direction  StopSequence BusStopCode  Distance  \
0        10       SBST          1             1       75009       0.0   
1        10       SBST          1             2       76059       0.6   
2        10       SBST          1             3       76069       1.1   
3        10       SBST          1             4       96289       2.3   
4        10       SBST          1             5       96109       2.7   

  WD_FirstBus WD_LastBus SAT_FirstBus SAT_LastBus SUN_FirstBus SUN_LastBus  \
0        0500       2300         0500        2300         0500        2300   
1        0502       2302         0502        2302         0502        2302   
2        0504       2304         0504        2304         0503        2304   
3        0508       2308         0508        2309         0507        2308   
4        0509       2310         0509        2311         0508        2309   

  Category OriginCode DestinationCode AM_Peak_Freq AM_Offpeak_Freq  \
0    TRUNK      75009 

In [17]:
# Column names, dtypes and non-null counts after joining routes, services and stops
merged_bus_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25544 entries, 0 to 25543
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ServiceNo        25544 non-null  object 
 1   Operator_x       25544 non-null  object 
 2   Direction        25544 non-null  int64  
 3   StopSequence     25544 non-null  int64  
 4   BusStopCode      25544 non-null  object 
 5   Distance         25544 non-null  float64
 6   WD_FirstBus      25544 non-null  object 
 7   WD_LastBus       25544 non-null  object 
 8   SAT_FirstBus     25544 non-null  object 
 9   SAT_LastBus      25544 non-null  object 
 10  SUN_FirstBus     25544 non-null  object 
 11  SUN_LastBus      25544 non-null  object 
 12  Category         25544 non-null  object 
 13  OriginCode       25544 non-null  object 
 14  DestinationCode  25544 non-null  object 
 15  AM_Peak_Freq     25544 non-null  object 
 16  AM_Offpeak_Freq  25544 non-null  object 
 17  PM_Peak_Freq

### Train Stations

In [5]:
# Step 1: Union the geometries for the same station
# (rows sharing STN_NAM_DE are merged into one; aggfunc='first' keeps the first value of the other columns)
unioned_gdf = geospatial_train_gdf.dissolve(by='STN_NAM_DE',aggfunc='first')

# Step 2: Calculate the centroid of the unioned polygon
unioned_gdf['centroid'] = unioned_gdf.centroid

# Optional Step: Replace geometry with centroid point
# (from here on every station is represented by a single point in the layer's own coordinates)
unioned_gdf['geometry'] = unioned_gdf['centroid']

# Reset index to clean up
# (dissolve moved STN_NAM_DE into the index; this turns it back into a regular column)
unioned_gdf.reset_index(inplace=True)

  merged_geom = block.unary_union


In [19]:
# Preview the per-station centroid points
unioned_gdf.head()

Unnamed: 0,STN_NAM_DE,geometry,TYP_CD,STN_NAM,ATTACHEMEN,TYP_CD_DES,centroid
0,ADMIRALTY MRT STATION,POINT (24400.883 46918.344),0,,,MRT,POINT (24400.883 46918.344)
1,ALJUNIED MRT STATION,POINT (33518.605 33189.987),0,,,MRT,POINT (33518.605 33189.987)
2,ANG MO KIO MRT STATION,POINT (29813.745 39107.484),0,,,MRT,POINT (29813.745 39107.484)
3,BAKAU LRT STATION,POINT (36035.791 41115.238),0,,,LRT,POINT (36035.791 41115.238)
4,BANGKIT LRT STATION,POINT (21249.598 40220.704),0,,,LRT,POINT (21249.598 40220.704)


In [6]:
# Canonical form of a station name used as the join key between the two station sources
def normalize_station_name(name):
    """Return ``name`` without leading/trailing whitespace and in upper case."""
    trimmed = name.strip()
    return trimmed.upper()

# Upper-cased, trimmed copy of every station name in the lookup table
train_stations['Normalized_Station'] = train_stations['MRT_Station'].apply(normalize_station_name)

# Build the label used by the polygon layer: "<NAME> LRT STATION" when the line name
# mentions LRT, otherwise "<NAME> MRT STATION"
train_stations['Station_MRT_LRT'] = train_stations.apply(
    lambda row: f"{row['Normalized_Station']} {'LRT' if 'LRT' in row['MRT_Line'] else 'MRT'} STATION",
    axis=1
)

# Trim and upper-case the polygon-layer labels. The " MRT STATION" / " LRT STATION"
# suffix is kept, which is why the lookup table had the suffix appended above.
unioned_gdf['Normalized_Station'] = unioned_gdf['STN_NAM_DE'].str.strip().str.upper()

# Left join: every lookup-table station is kept; stations without a matching label
# in the polygon layer end up with a null geometry
merged_train_stations = train_stations.merge(
    unioned_gdf, how='left', left_on='Station_MRT_LRT', right_on='Normalized_Station'
)

# Retain only the identifying columns plus the centroid geometry
columns_to_keep = ['Station_Code', 'MRT_Station', 'MRT_Line', 'TYP_CD_DES', 'geometry']
merged_train_stations = merged_train_stations[columns_to_keep]

# Quick look at the joined result
print(merged_train_stations.head())


  Station_Code    MRT_Station           MRT_Line TYP_CD_DES  \
0          NS1    Jurong East  North-South Line         MRT   
1          NS2    Bukit Batok  North-South Line         MRT   
2          NS3   Bukit Gombak  North-South Line         MRT   
3          NS4  Choa Chu Kang  North-South Line         MRT   
4          NS5        Yew Tee  North-South Line         MRT   

                      geometry  
0  POINT (17866.487 35045.184)  
1  POINT (18676.448 36790.872)  
2  POINT (18940.178 37860.706)  
3  POINT (18101.056 40790.989)  
4  POINT (18438.643 42159.628)  


In [8]:
# Wrap the merged station table in a GeoDataFrame so it can be reprojected.
# NOTE(review): no crs is passed here, so to_crs below relies on the 'geometry' column
# still carrying the CRS of the polygon layer - confirm
gdf = gpd.GeoDataFrame(merged_train_stations, geometry='geometry')

# Reproject the GeoDataFrame to EPSG:4326 (WGS 84 - latitude/longitude)
gdf_4326 = gdf.to_crs(epsg=4326)

# Extract Longitude and Latitude from the reprojected geometries (x = longitude, y = latitude)
gdf_4326['Longitude'] = gdf_4326.geometry.x
gdf_4326['Latitude'] = gdf_4326.geometry.y

# Convert back to a Pandas DataFrame (if you don't need the geometry anymore)
# NOTE(review): this overwrites merged_train_stations with a frame that no longer has a
# 'geometry' column, so this cell cannot be re-run without re-running the merge cell first
merged_train_stations = pd.DataFrame(gdf_4326)

# Removing redundant columns
columns_to_keep = ['Station_Code', 'MRT_Station', 'MRT_Line', 'Longitude', 'Latitude']
merged_train_stations = merged_train_stations[columns_to_keep]
print(merged_train_stations.head())


  Station_Code    MRT_Station           MRT_Line   Longitude  Latitude
0          NS1    Jurong East  North-South Line   103.742263  1.333209
1          NS2    Bukit Batok  North-South Line   103.749541  1.348997
2          NS3   Bukit Gombak  North-South Line   103.751910  1.358672
3          NS4  Choa Chu Kang  North-South Line   103.744369  1.385172
4          NS5        Yew Tee  North-South Line   103.747402  1.397550


## Overlapping calculation

### Method 1: Haversine Distance

#### Version 1: Without buffer

In [9]:
def calculate_overlap_percentage_no_buffer(df_bus_stops, df_train_stations, max_distance=350):
    """Per bus service, compute the share of its stops that lie close to a train station.

    A bus stop counts as overlapping when the haversine (great-circle) distance to at
    least one train station is ``<= max_distance`` metres. Every stop is compared with
    every station; there is no spatial pre-filter.

    Parameters
    ----------
    df_bus_stops : pandas.DataFrame
        One row per (service, stop). Columns read: ``ServiceNo``, ``BusStopCode``,
        ``Category``, ``Latitude``, ``Longitude``.
    df_train_stations : pandas.DataFrame
        Columns read: ``Station_Code``, ``MRT_Line``, ``Latitude``, ``Longitude``.
    max_distance : float, default 350
        Overlap threshold in metres.

    Returns
    -------
    pandas.DataFrame
        One row per service with columns ``ServiceNo``, ``Category`` (taken from the
        last unique stop of the service), ``TotalBusStops`` (unique stop codes),
        ``OverlappingStops``, ``OverlapPercentage`` and ``OverlappedBusStops``
        (``{BusStopCode: [[distance_m, Station_Code, MRT_Line], ...]}``).

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.
    """
    # Materialise the station table once as plain tuples; iterating a DataFrame
    # row by row inside the stop loop is what made the full scan impractically slow.
    stations = list(zip(
        df_train_stations['Latitude'],
        df_train_stations['Longitude'],
        df_train_stations['Station_Code'],
        df_train_stations['MRT_Line'],
    ))

    results = []
    # BusStopCode -> [[distance_m, Station_Code, MRT_Line], ...] for stops already scanned.
    # Many stops are shared between services, so each one is scanned only once.
    processed_stops = {}

    # Only latitude/longitude are needed for haversine, so no projection is required.
    for service_no, bus_stops_service in df_bus_stops.groupby('ServiceNo'):
        # A stop can appear in both directions of a service; count it once
        bus_stops_service_unique = bus_stops_service.drop_duplicates(subset=['BusStopCode'])

        total_bus_stops = len(bus_stops_service_unique)
        overlapping_stops = 0
        bus_stop_overlaps = {}  # overlapping stops of this service only

        for bus_stop_code, stop_lat, stop_lon in zip(
            bus_stops_service_unique['BusStopCode'],
            bus_stops_service_unique['Latitude'],
            bus_stops_service_unique['Longitude'],
        ):
            if bus_stop_code not in processed_stops:
                stations_within_range = []
                for mrt_lat, mrt_lon, mrt_code, mrt_line in stations:
                    # haversine() returns kilometres; convert to metres
                    distance = haversine((stop_lat, stop_lon), (mrt_lat, mrt_lon)) * 1000
                    if distance <= max_distance:
                        stations_within_range.append([distance, mrt_code, mrt_line])
                processed_stops[bus_stop_code] = stations_within_range

            stations_within_range = processed_stops[bus_stop_code]
            if stations_within_range:
                # Copy so the per-service result does not alias the shared cache
                bus_stop_overlaps[bus_stop_code] = [list(match) for match in stations_within_range]
                overlapping_stops += 1

        # groupby never yields an empty group, so there is always a last stop
        category = bus_stops_service_unique['Category'].iloc[-1]

        overlap_percentage = (overlapping_stops / total_bus_stops) * 100 if total_bus_stops > 0 else 0
        results.append({
            'ServiceNo': service_no,
            'Category': category,
            'TotalBusStops': total_bus_stops,
            'OverlappingStops': overlapping_stops,
            'OverlapPercentage': overlap_percentage,
            'OverlappedBusStops': bus_stop_overlaps
        })

    return pd.DataFrame(results)

# Run the unbuffered haversine comparison on all services with a 350 m threshold.
# NOTE(review): the recorded run of this cell was stopped with a KeyboardInterrupt
overlap_results_no_buffer = calculate_overlap_percentage_no_buffer(merged_bus_routes, merged_train_stations, max_distance=350)
print(overlap_results_no_buffer)


KeyboardInterrupt: 

#### Version 2 : With Buffer

In [23]:
def calculate_overlap_percentage_with_buffer(df_bus_stops, df_train_stations, max_distance=500, buffer_distance=1000):
    """Per bus service, compute the share of its stops that lie close to a train station.

    Each bus stop is first reduced to the stations that fall inside a
    ``buffer_distance``-metre buffer around it (computed in EPSG:3414); only those
    candidates are then measured with the haversine distance and kept when the
    distance is ``<= max_distance`` metres.

    Parameters
    ----------
    df_bus_stops : pandas.DataFrame
        One row per (service, stop). Columns read: ``ServiceNo``, ``BusStopCode``,
        ``Category``, ``Latitude``, ``Longitude``.
    df_train_stations : pandas.DataFrame
        Columns read: ``Station_Code``, ``MRT_Line``, ``Latitude``, ``Longitude``.
    max_distance : float, default 500
        Overlap threshold in metres.
    buffer_distance : float, default 1000
        Radius in metres of the candidate pre-filter. It should be larger than
        ``max_distance``; stations outside the buffer are never measured, so a
        smaller buffer silently caps the effective threshold.

    Returns
    -------
    pandas.DataFrame
        One row per service with columns ``ServiceNo``, ``Category`` (taken from the
        last unique stop of the service), ``TotalBusStops`` (unique stop codes),
        ``OverlappingStops``, ``OverlapPercentage`` and ``OverlappedBusStops``
        (``{BusStopCode: [[distance_m, Station_Code, MRT_Line], ...]}``).
    """
    # Build point layers from the lat/long columns and project them to EPSG:3414
    # so that buffer_distance is expressed in metres
    gdf_bus_stops = gpd.GeoDataFrame(
        df_bus_stops, geometry=gpd.points_from_xy(df_bus_stops['Longitude'], df_bus_stops['Latitude']), crs="EPSG:4326"
    ).to_crs(epsg=3414)
    gdf_train_stations = gpd.GeoDataFrame(
        df_train_stations, geometry=gpd.points_from_xy(df_train_stations['Longitude'], df_train_stations['Latitude']), crs="EPSG:4326"
    ).to_crs(epsg=3414)

    results = []
    # BusStopCode -> [[distance_m, Station_Code, MRT_Line], ...] for stops already scanned.
    # Many stops are shared between services, so each one is buffered and measured once.
    processed_stops = {}

    for service_no, bus_stops_service in gdf_bus_stops.groupby('ServiceNo'):
        # A stop can appear in both directions of a service; count it once
        bus_stops_service_unique = bus_stops_service.drop_duplicates(subset=['BusStopCode'])

        total_bus_stops = len(bus_stops_service_unique)
        overlapping_stops = 0
        bus_stop_overlaps = {}  # overlapping stops of this service only

        for _, bus_stop in bus_stops_service_unique.iterrows():
            bus_stop_code = bus_stop['BusStopCode']
            category = bus_stop['Category']

            if bus_stop_code not in processed_stops:
                # Candidate stations: those intersecting the buffer around the stop
                bus_stop_buffer = bus_stop.geometry.buffer(buffer_distance)
                train_stations_in_buffer = gdf_train_stations[gdf_train_stations.intersects(bus_stop_buffer)]

                stop_coords = (bus_stop['Latitude'], bus_stop['Longitude'])
                stations_within_range = []
                for mrt_lat, mrt_lon, mrt_code, mrt_line in zip(
                    train_stations_in_buffer['Latitude'],
                    train_stations_in_buffer['Longitude'],
                    train_stations_in_buffer['Station_Code'],
                    train_stations_in_buffer['MRT_Line'],
                ):
                    # haversine() returns kilometres; convert to metres
                    distance = haversine(stop_coords, (mrt_lat, mrt_lon)) * 1000
                    if distance <= max_distance:
                        stations_within_range.append([distance, mrt_code, mrt_line])
                processed_stops[bus_stop_code] = stations_within_range

            stations_within_range = processed_stops[bus_stop_code]
            if stations_within_range:
                # Copy so the per-service result does not alias the shared cache
                bus_stop_overlaps[bus_stop_code] = [list(match) for match in stations_within_range]
                overlapping_stops += 1

        overlap_percentage = (overlapping_stops / total_bus_stops) * 100 if total_bus_stops > 0 else 0
        results.append({
            'ServiceNo': service_no,
            'Category': category,
            'TotalBusStops': total_bus_stops,
            'OverlappingStops': overlapping_stops,
            'OverlapPercentage': overlap_percentage,
            'OverlappedBusStops': bus_stop_overlaps
        })

    return pd.DataFrame(results)

# Run the buffered haversine comparison with a 350 m threshold
# (buffer_distance keeps its 1000 m default)
overlap_results_naive_buffer = calculate_overlap_percentage_with_buffer(merged_bus_routes, merged_train_stations, max_distance=350)
print(overlap_results_naive_buffer)


    ServiceNo    Category  TotalBusStops  OverlappingStops  OverlapPercentage  \
0          10       TRUNK            146                46          31.506849   
1         100       TRUNK            109                49          44.954128   
2        100A       TRUNK             13                 6          46.153846   
3         101       TRUNK             45                11          24.444444   
4         102       TRUNK             59                24          40.677966   
5        102A       TRUNK             11                 6          54.545455   
6        102B       TRUNK             11                 6          54.545455   
7         103       TRUNK             83                 9          10.843373   
8         105       TRUNK            112                43          38.392857   
9        105B       TRUNK             13                 3          23.076923   
10        106       TRUNK             96                54          56.250000   
11       106A       TRUNK   

#### Visualisations 

In [24]:
# def visualize_overlap_percentage(overlap_df):
#     # Sort the data by overlap percentage
#     overlap_df_sorted = overlap_df.sort_values(by='OverlapPercentage', ascending=False)

#     # Set up the plot
#     plt.figure(figsize=(12, 6))
    
#     # Create a bar plot
#     sns.barplot(x='ServiceNo', y='OverlapPercentage', data=overlap_df_sorted, palette='Blues_d')

#     # Customize the plot
#     plt.xticks(rotation=90)  # Rotate service numbers for better visibility
#     plt.title('Overlap Percentage of Bus Services with MRT Stations (500m)', fontsize=16)
#     plt.xlabel('Bus Service Number', fontsize=12)
#     plt.ylabel('Overlap Percentage (%)', fontsize=12)

#     # Display the plot
#     plt.tight_layout()
#     plt.show()

# # Visualize the overlap percentage
# visualize_overlap_percentage(overlap_results)


### Method 2: Road Distance

In [25]:
# Independent copies of the inputs for the road-distance method, so they can be
# subset for test runs without touching the original frames
merged_bus_routes_rd = merged_bus_routes.copy()
merged_train_stations_rd = merged_train_stations.copy()

def calculate_overlap_percentage_with_road_distance(df_bus_stops, df_train_stations, max_distance=500, buffer_distance=1000):
    """Per bus service, compute the share of its stops within walking distance of a train station.

    Each bus stop is first reduced to the stations that fall inside a
    ``buffer_distance``-metre buffer around it (computed in EPSG:3414). For those
    candidates the shortest path length on the OSM walking network of Singapore is
    computed, and a station is kept when that length is ``<= max_distance`` metres.

    The walking network is downloaded with OSMnx on every call, so the function
    needs network access and is slow to start.

    Parameters
    ----------
    df_bus_stops : pandas.DataFrame
        One row per (service, stop). Columns read: ``ServiceNo``, ``BusStopCode``,
        ``Category``, ``Latitude``, ``Longitude``.
    df_train_stations : pandas.DataFrame
        Columns read: ``Station_Code``, ``MRT_Line``, ``Latitude``, ``Longitude``.
    max_distance : float, default 500
        Overlap threshold in metres of walking distance.
    buffer_distance : float, default 1000
        Radius in metres of the straight-line candidate pre-filter.

    Returns
    -------
    pandas.DataFrame
        One row per service with columns ``ServiceNo``, ``Category`` (taken from the
        last unique stop of the service), ``TotalBusStops`` (unique stop codes),
        ``OverlappingStops``, ``OverlapPercentage`` and ``OverlappedBusStops``
        (``{BusStopCode: [[road_distance_m, Station_Code, MRT_Line], ...]}``).
    """
    # Build point layers from the lat/long columns
    bus_stops_gdf = gpd.GeoDataFrame(
        df_bus_stops, geometry=gpd.points_from_xy(df_bus_stops['Longitude'], df_bus_stops['Latitude']), crs="EPSG:4326"
    )
    train_stations_gdf = gpd.GeoDataFrame(
        df_train_stations, geometry=gpd.points_from_xy(df_train_stations['Longitude'], df_train_stations['Latitude']), crs="EPSG:4326"
    )

    # Project to EPSG:3414 so that buffer_distance is expressed in metres
    bus_stops_gdf = bus_stops_gdf.to_crs(epsg=3414)
    train_stations_gdf = train_stations_gdf.to_crs(epsg=3414)

    # Walking network used for the shortest-path computation
    G = ox.graph_from_place('Singapore', network_type='walk')

    # (lat, lon) -> nearest graph node. The same stop and station coordinates are
    # queried many times, so each lookup is done only once.
    nearest_node_cache = {}

    def nearest_node(lat, lon):
        """Return the graph node closest to the given latitude/longitude."""
        key = (lat, lon)
        if key not in nearest_node_cache:
            nearest_node_cache[key] = ox.distance.nearest_nodes(G, lon, lat)  # expects X=lon, Y=lat
        return nearest_node_cache[key]

    def calculate_road_distance(bus_stop_coords, mrt_coords):
        """Walking distance in metres between two (lat, lon) pairs, or None if unreachable."""
        orig_node = nearest_node(*bus_stop_coords)
        dest_node = nearest_node(*mrt_coords)
        try:
            return nx.shortest_path_length(G, orig_node, dest_node, weight='length')
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # Only "no route" is treated as "not overlapping"; any other error
            # (missing dependency, interrupt, bad input) is allowed to surface
            return None

    results = []
    # BusStopCode -> [[road_distance_m, Station_Code, MRT_Line], ...] for stops already processed
    bus_stops_overlap_tracker = {}

    for service_no, bus_route_stops in bus_stops_gdf.groupby('ServiceNo'):
        # A stop can appear in both directions of a service; count it once
        bus_stops_service_unique = bus_route_stops.drop_duplicates(subset='BusStopCode')
        total_bus_stops = len(bus_stops_service_unique)
        overlapping_stops = 0
        bus_stop_overlaps = {}  # overlapping stops of this service only

        for _, bus_stop in bus_stops_service_unique.iterrows():
            bus_stop_code = bus_stop['BusStopCode']
            category = bus_stop['Category']
            bus_stop_coords = (bus_stop['Latitude'], bus_stop['Longitude'])

            # Reuse the result when another service already processed this stop
            if bus_stop_code in bus_stops_overlap_tracker:
                if bus_stops_overlap_tracker[bus_stop_code]:
                    bus_stop_overlaps[bus_stop_code] = bus_stops_overlap_tracker[bus_stop_code]
                    overlapping_stops += 1
                continue

            # Candidate stations: those intersecting the buffer around the stop
            bus_stop_buffer = bus_stop.geometry.buffer(buffer_distance)
            train_stations_in_buffer = train_stations_gdf[train_stations_gdf.intersects(bus_stop_buffer)]

            # Stations whose walking distance from this stop is within max_distance
            stations_within_range = []
            for _, train_station in train_stations_in_buffer.iterrows():
                train_station_coords = (train_station['Latitude'], train_station['Longitude'])
                road_distance = calculate_road_distance(bus_stop_coords, train_station_coords)

                if road_distance is not None and road_distance <= max_distance:
                    stations_within_range.append([road_distance, train_station['Station_Code'], train_station['MRT_Line']])

            if stations_within_range:
                bus_stop_overlaps[bus_stop_code] = stations_within_range
                overlapping_stops += 1

            # Remember the result (even an empty one) for later services
            bus_stops_overlap_tracker[bus_stop_code] = stations_within_range

        overlap_percentage = (overlapping_stops / total_bus_stops) * 100 if total_bus_stops > 0 else 0
        results.append({
            'ServiceNo': service_no,
            'Category': category,
            'TotalBusStops': total_bus_stops,
            'OverlappingStops': overlapping_stops,
            'OverlapPercentage': overlap_percentage,
            'OverlappedBusStops': bus_stop_overlaps
        })

    return pd.DataFrame(results)

In [26]:
# overlap_results_with_road_distance = calculate_overlap_percentage_with_road_distance(merged_bus_routes_rd, merged_train_stations, max_distance=500)
# print(overlap_results_with_road_distance)

In [27]:
# overlap_results_with_road_distance[overlap_results_with_road_distance['OverlapPercentage'] >= 50]

In [28]:
# # Saving the output as pickle file to preserve the python objects
# output_path = "../datasets/overlap_results_with_road_distance2.pkl"
# overlap_results_with_road_distance.to_pickle(output_path)


In [10]:
# Load the previously saved road-distance overlap results instead of recomputing them.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
df = pd.read_pickle("../datasets/overlap_results_with_road_distance2.pkl")
# NOTE(review): presumably already a DataFrame (the commented-out cell above saves one with to_pickle) - confirm
df = pd.DataFrame(df)
print(df.head())

  ServiceNo Category  TotalBusStops  OverlappingStops  OverlapPercentage  \
0        10    TRUNK            146                47          32.191781   
1       100    TRUNK            109                48          44.036697   
2      100A    TRUNK             13                 4          30.769231   
3       101    TRUNK             45                16          35.555556   
4       102    TRUNK             59                20          33.898305   

                                  OverlappedBusStops  
0  {'75009': [[118.10799999999999, 'EW2', 'East-W...  
1  {'62129': [[345.86199999999997, 'NE12', 'North...  
2  {'62129': [[345.86199999999997, 'NE12', 'North...  
3  {'62131': [[92.426, 'NE12', 'North East Line']...  
4  {'64009': [[175.40699999999998, 'NE14', 'North...  


#### Normalising distances and penalising higher mean normalised distances with a softmax function

In [11]:
def calculate_normalized_softmax_distances(df, total_bus_stops_col='TotalBusStops', default_distance=500):
    """Score each bus service by how close its stops are to train stations.

    Steps:
      1. Min-max normalise every recorded stop-to-station distance over the whole
         frame (``default_distance`` is included in the range).
      2. For every stop of a service that has no recorded station, add a placeholder
         entry at ``default_distance`` so services are compared over all their stops.
      3. Average the normalised distances per service.
      4. Apply ``softmax(-mean)`` across services, so a smaller mean distance
         (i.e. more overlap) gets a larger weight.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns read: ``ServiceNo``, ``Category``, ``OverlappedBusStops``
        (``{BusStopCode: [[distance, station_code, line], ...]}``; any non-dict
        value is treated as "no overlapping stops") and ``total_bus_stops_col``.
    total_bus_stops_col : str, default 'TotalBusStops'
        Column holding the number of unique stops of each service.
    default_distance : float, default 500
        Distance assigned to stops without a recorded station, in the same unit as
        the recorded distances.

    Returns
    -------
    pandas.DataFrame
        Columns ``ServiceNo``, ``Category``, ``TotalBusStops``,
        ``AvgNormalizedDistance``, ``SoftmaxAvgDistance``, ``NormalizedDistances``.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    df = df.copy()

    # Entries that are not dicts (e.g. None/NaN) are handled as "no overlaps"
    overlaps = [entry if isinstance(entry, dict) else {} for entry in df['OverlappedBusStops']]

    # Step 1: collect every recorded distance to get the global normalisation range
    all_distances = [
        stop_info[0]
        for entry in overlaps
        for stops in entry.values()
        for stop_info in stops
        if isinstance(stop_info, list) and len(stop_info) > 0
    ]
    # The default distance is part of the range, so the list is never empty
    all_distances.append(default_distance)
    min_distance, max_distance = min(all_distances), max(all_distances)

    def normalize_distance(dist):
        # The epsilon avoids a division by zero when every distance is identical
        return (dist - min_distance) / (max_distance - min_distance + 1e-6)

    default_entry_distance = normalize_distance(default_distance)

    normalized_per_service = []
    mean_per_service = []
    for stops, total_stops_needed in zip(overlaps, df[total_bus_stops_col]):
        # Step 2: normalise the recorded distances of the overlapping stops
        normalized = {
            code: [[normalize_distance(dist), train_code, mrt_line] for dist, train_code, mrt_line in details]
            for code, details in stops.items()
        }

        # Step 3: pad with placeholder stops up to the service's total stop count
        for i in range(total_stops_needed - len(normalized)):
            normalized[f"default_stop_{i+1}"] = [[default_entry_distance, 'NA', 'NA']]

        # Step 4: mean over every (stop, station) entry of the service
        distances = [item[0] for details in normalized.values() for item in details if len(item) > 0]
        mean_per_service.append(float(np.mean(distances)) if distances else np.nan)
        normalized_per_service.append(normalized)

    df['NormalizedDistances'] = normalized_per_service
    df['AvgNormalizedDistance'] = pd.Series(mean_per_service, index=df.index, dtype='float64')

    # Step 5: softmax over the negated means
    def softmax(values):
        exps = np.exp(-np.array(values))  # smaller distance -> larger weight
        return exps / exps.sum() if exps.sum() != 0 else exps

    # Services without any entry (NaN mean) are scored as distance 0, as before
    normalized_means = df['AvgNormalizedDistance'].fillna(0).values
    df['SoftmaxAvgDistance'] = softmax(normalized_means)

    return df[['ServiceNo', 'Category', 'TotalBusStops', 'AvgNormalizedDistance', 'SoftmaxAvgDistance', 'NormalizedDistances']]

# Score every service from the cached road-distance results
result_df = calculate_normalized_softmax_distances(df)

# Select the 20 services with the LARGEST SoftmaxAvgDistance. The softmax is taken over
# exp(-mean distance), so the largest values belong to the smallest mean distances,
# i.e. the services with the highest overlap.
top_20_softmax = result_df.nlargest(20, 'SoftmaxAvgDistance')

# Display the result
print(top_20_softmax)


    ServiceNo   Category  TotalBusStops  AvgNormalizedDistance  \
348       654  CITY_LINK             26               0.606960   
17         11      TRUNK             22               0.617529   
281       384     FEEDER             13               0.626136   
272       371     FEEDER             11               0.647718   
364       673  CITY_LINK             41               0.653580   
533       976      TRUNK             47               0.654438   
410        83      TRUNK             30               0.656498   
526      973A      TRUNK              4               0.663039   
268        36      TRUNK             60               0.668416   
270       36B      TRUNK             51               0.670657   
411       83T      TRUNK             16               0.671358   
357       666  CITY_LINK             33               0.672480   
77        146      TRUNK             16               0.674381   
269       36A      TRUNK             33               0.677236   
290       

In [None]:
### why use softmax ? 