## Import Libraries

In [None]:
!pip install xlrd geopy

In [None]:
# !pip install python-dotenv
import requests
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from dotenv import load_dotenv
import os
os.environ['OGR_GEOMETRY_ACCEPT_UNCLOSED_RING'] = 'NO'
import zipfile
import shutil
import numpy as np
import pandas as pd
import folium
import branca.colormap as cm
from shapely.geometry import Point, LineString

# Never truncate DataFrame output: show every row and every column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Load the API key from the local env file so it is not hardcoded in the notebook.
load_dotenv("../key.env")
api_key = os.environ.get("API_KEY")

## Import Datasets

In [None]:
# Execute the helper notebook; presumably this is what defines get_bus_info (it is not defined in this file).
%run get_bus_info_function.ipynb
# Fetch the bus services, bus routes and bus stops datasets from LTA DataMall using the API key.
# NOTE(review): the return type of get_bus_info is not visible here; later cells use the results as DataFrames.
bus_services_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusServices", api_key)
bus_routes_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusRoutes", api_key)
bus_stops_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusStops", api_key)
# Train station list read from a local Excel workbook.
train_stations = pd.read_excel("../datasets/Train_Stations.xls")

In [None]:
# Execute the geospatial helper notebook (nothing it defines is referenced in this cell).
%run get_geospatial_function.ipynb
# Read the train station shapefile from the local geospatial layer folder.
geospatial_train_path = "../datasets/geospatial_layer/TrainStation_Jul2024/RapidTransitSystemStation.shp"
geospatial_train_gdf = gpd.read_file(geospatial_train_path)
# Rebuild every geometry with a zero-width buffer.
# NOTE(review): presumably this is meant to repair invalid polygons — confirm.
geospatial_train_gdf['geometry'] = geospatial_train_gdf['geometry'].buffer(0)

## Data Preprocessing

In [None]:
# Service numbers of the trunk services only (kept as a Series named 'ServiceNo').
is_trunk = bus_services_df['Category'] == "TRUNK"
trunk_buses_df = bus_services_df.loc[is_trunk, 'ServiceNo']

# Route rows of those services, reduced to the columns that order the stops.
route_columns = ['ServiceNo', 'Direction', 'StopSequence', 'BusStopCode']
trunk_bus_routes_df = pd.merge(trunk_buses_df, bus_routes_df, how='inner', on='ServiceNo')
trunk_bus_routes_df = trunk_bus_routes_df[route_columns]

# Attach the coordinates of every stop through its bus stop code.
geospatial_bus_route = trunk_bus_routes_df.merge(bus_stops_df, how='inner', on='BusStopCode')
geospatial_bus_route = geospatial_bus_route[route_columns + ['Latitude', 'Longitude']]
geospatial_bus_route.head()

## Bus Lines

In [None]:
# Build one point per route stop (x = Longitude, y = Latitude) and wrap the stop
# table in a GeoDataFrame declared as EPSG:4326.
stop_points = gpd.points_from_xy(geospatial_bus_route['Longitude'], geospatial_bus_route['Latitude'])
geospatial_bus_route_gdf = gpd.GeoDataFrame(geospatial_bus_route, geometry=stop_points, crs="EPSG:4326")

# Chain all stops of one service into a single line (directions are not split)
def create_line(group):
    """Return one LineString running through every stop in ``group``.

    The rows are ordered by Direction first and StopSequence second, so the
    stops of all directions present in the group end up on the same line,
    one direction after the other.
    """
    ordered_stops = group.sort_values(by=['Direction', 'StopSequence'])
    stop_points = list(ordered_stops.geometry)
    return LineString(stop_points)

# One row per service: build its line with create_line, then name the two columns.
bus_routes_lines = (
    geospatial_bus_route_gdf
    .groupby('ServiceNo')
    .apply(create_line)
    .reset_index()
    .set_axis(['ServiceNo', 'geometry'], axis=1)
)

# Promote the plain table to a GeoDataFrame in the same CRS as the stop points.
bus_routes_lines_gdf = gpd.GeoDataFrame(bus_routes_lines, geometry='geometry', crs="EPSG:4326")

# Preview the resulting lines
print(bus_routes_lines_gdf.head())


In [None]:
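# NOTE: this whole cell is a disabled alternative that builds one LineString per (ServiceNo, Direction).
# # Ensure bus stops have the same CRS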
# geospatial_bus_route_gdf = gpd.GeoDataFrame(
#     geospatial_bus_route,
#     geometry=gpd.points_from_xy(geospatial_bus_route.Longitude, geospatial_bus_route.Latitude),
#     crs="EPSG:4326"
# )

# # Group by each service and direction, and create LineStrings based on sorted StopSequence
# def create_line(group):
#     # Convert each stop in the group to a Point and create a LineString
#     return LineString(group.sort_values('StopSequence').geometry.tolist())

# # Apply the function to create a GeoDataFrame with LineStrings for each bus route
# bus_routes_lines = geospatial_bus_route_gdf.groupby(['ServiceNo', 'Direction']).apply(create_line).reset_index()
# bus_routes_lines.columns = ['ServiceNo', 'Direction', 'geometry']

# # Convert to a GeoDataFrame
# bus_routes_lines_gdf = gpd.GeoDataFrame(bus_routes_lines, geometry='geometry', crs="EPSG:4326")

# # Check your final bus_routes_lines_gdf
# print(bus_routes_lines_gdf.head())

In [None]:
# Select the line of bus service 10 (service numbers are compared as strings).
service_10_routes = bus_routes_lines_gdf[bus_routes_lines_gdf['ServiceNo'] == "10"]
print(service_10_routes['geometry'].head())

# Plot the route
fig, ax = plt.subplots(figsize=(10, 6))

# bus_routes_lines_gdf holds one LineString per service (all directions chained),
# so this loop normally draws a single line.
for _, row in service_10_routes.iterrows():
    x, y = row['geometry'].xy  # coordinate sequences of the LineString
    # Every line gets a label; without one the legend below has nothing to show.
    ax.plot(x, y, linewidth=2, label=f"Service {row['ServiceNo']}")

# Collapse repeated labels, and skip the legend when nothing was drawn so that
# Matplotlib does not warn about a legend without labelled artists.
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))
if by_label:
    ax.legend(by_label.values(), by_label.keys())

# Add labels and title
ax.set_title("Bus Service No. 10 Route")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

plt.show()

## Train Lines

In [None]:
# Merge all shapes that share a station name into one geometry per station;
# the remaining attributes take the value of the first row of each station.
unioned_gdf = geospatial_train_gdf.dissolve(by='STN_NAM_DE', aggfunc='first')

# Represent each station by the centroid of its merged shape: keep it in a
# 'centroid' column and also make it the active geometry.
station_centroids = unioned_gdf.centroid
unioned_gdf['centroid'] = station_centroids
unioned_gdf['geometry'] = station_centroids

# Move the station name from the index back into a regular column.
unioned_gdf = unioned_gdf.reset_index()

# Normalise a station name for name-based merging
def normalize_station_name(name):
    """Return ``name`` without leading/trailing whitespace, in upper case."""
    trimmed = name.strip()
    return trimmed.upper()

# Trimmed, upper-cased station names of the station list.
train_stations['Normalized_Station'] = train_stations['MRT_Station'].apply(normalize_station_name)

# Join key "<NAME> MRT STATION" / "<NAME> LRT STATION": rows whose MRT_Line contains
# "LRT" get the LRT suffix, every other row gets the MRT suffix.
train_stations['Station_MRT_LRT'] = train_stations.apply(
    lambda row: f"{row['Normalized_Station']} MRT STATION" if "LRT" not in row['MRT_Line'] else f"{row['Normalized_Station']} LRT STATION",
    axis=1
)

# Shapefile side of the join: STN_NAM_DE trimmed and upper-cased (no suffix is removed).
unioned_gdf['Normalized_Station'] = unioned_gdf['STN_NAM_DE'].str.strip().str.upper()

# Left merge: every row of train_stations is kept, whether or not a geometry was found.
merged_train_stations = train_stations.merge(
    unioned_gdf,
    how='left',
    left_on='Station_MRT_LRT',
    right_on='Normalized_Station'
)

merged_train_stations = merged_train_stations[['Station_Code', 'MRT_Station', 'MRT_Line', 'TYP_CD_DES', 'geometry']]

# Stations without a match keep a missing geometry and therefore end up with NaN
# Longitude/Latitude below; report them instead of letting that pass silently.
unmatched_stations = merged_train_stations.loc[merged_train_stations['geometry'].isna(), 'MRT_Station']
if not unmatched_stations.empty:
    print(f"Warning: {len(unmatched_stations)} station row(s) have no matching geometry: {unmatched_stations.unique().tolist()}")

# Convert the merged table to a GeoDataFrame.
# NOTE(review): no crs is passed here; to_crs below relies on the geometry column
# still carrying the CRS of the shapefile — confirm.
gdf = gpd.GeoDataFrame(merged_train_stations, geometry='geometry')

# Reproject to EPSG:4326 (WGS 84 - latitude/longitude)
gdf_4326 = gdf.to_crs(epsg=4326)

# Longitude is the x coordinate and Latitude the y coordinate of each point
gdf_4326['Longitude'] = gdf_4326.geometry.x
gdf_4326['Latitude'] = gdf_4326.geometry.y

# Back to a plain DataFrame with only the columns needed downstream; copy so the
# columns added below go to an independent frame.
geospatial_train_station = pd.DataFrame(gdf_4326)
geospatial_train_station = geospatial_train_station[['Station_Code', 'MRT_Station', 'MRT_Line', 'Longitude', 'Latitude']].copy()

# Split the station code into its leading letters (Train_Line) and its first run of
# digits (Station_No); codes without digits get station number 1.
geospatial_train_station['Train_Line'] = geospatial_train_station['Station_Code'].str.extract(r'([A-Za-z]+)', expand=False)
geospatial_train_station['Station_No'] = geospatial_train_station['Station_Code'].str.extract(r'(\d+)', expand=False).fillna(1).astype(int)
geospatial_train_station.head()

In [None]:
# Point geometry for every station from its longitude/latitude (EPSG:4326).
station_points = gpd.points_from_xy(geospatial_train_station['Longitude'], geospatial_train_station['Latitude'])
train_stations_gdf = gpd.GeoDataFrame(geospatial_train_station, geometry=station_points, crs="EPSG:4326")

# Order the stations within each line by their station number so that consecutive
# stations become consecutive vertices.
train_stations_gdf = train_stations_gdf.sort_values(by=['Train_Line', 'Station_No'])

# One LineString per train line; a line with fewer than two stations yields None.
train_lines_gdf = (
    train_stations_gdf
    .groupby('Train_Line')
    .apply(lambda stations: None if len(stations) < 2 else LineString(list(stations.geometry)))
    .reset_index(name='geometry')
)

# Discard the lines for which no LineString could be built.
train_lines_gdf = train_lines_gdf.dropna(subset=['geometry'])

# Promote to a GeoDataFrame in the same CRS as the station points.
train_lines_gdf = gpd.GeoDataFrame(train_lines_gdf, geometry='geometry', crs="EPSG:4326")

# Preview the resulting lines
print(train_lines_gdf.head())

In [None]:
# Distinct train line codes present in train_lines_gdf
unique_lines = train_lines_gdf['Train_Line'].unique()

# One colour per line. plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed
# in 3.9, so the colormap is looked up through pyplot.get_cmap instead. 'tab10' has
# only ten colours, so switch to 'tab20' when there are more lines than that.
palette_name = 'tab10' if len(unique_lines) <= 10 else 'tab20'
colors = plt.get_cmap(palette_name, len(unique_lines))

fig, ax = plt.subplots(figsize=(10, 10))

# Draw each train line in its own colour, labelled with its line code
for i, line in enumerate(unique_lines):
    line_data = train_lines_gdf[train_lines_gdf['Train_Line'] == line]
    line_data.plot(ax=ax, color=colors(i), linewidth=2, label=line)

ax.legend()
ax.set_title("Train Stations and Routes")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.grid(True)
plt.show()

## Overlapping with MRT Lines (Regardless of Line)
- The overlapping points of a bus route may lie along different train lines; they are all counted together.

In [None]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, LineString
from shapely.ops import unary_union

# Sample evenly spaced, de-duplicated points along a line
def sample_unique_points_along_line(line, distance=100):
    """Return the points at 0, distance, 2*distance, ... along ``line``.

    ``line`` must provide ``length`` and ``interpolate(offset)``. Offsets run
    up to the last multiple of ``distance`` that does not exceed the line's
    length. Repeated points are dropped; first-occurrence order is kept.
    """
    step_count = int(line.length // distance)
    seen = {}
    for step in range(step_count + 1):
        # dict keys keep insertion order, so this de-duplicates without reordering
        seen.setdefault(line.interpolate(step * distance), None)
    return list(seen)

# Sample points for each bus route, ensuring deduplication
def generate_unique_bus_route_points(bus_routes_gdf, distance=100):
    """Sample points along every route line and return them as a GeoDataFrame.

    Parameters:
        bus_routes_gdf: GeoDataFrame with a 'ServiceNo' column and one line
            geometry per row in its 'geometry' column.
        distance: spacing between samples, in the units of the route
            coordinates.

    Returns:
        GeoDataFrame with columns 'ServiceNo' and 'geometry', one row per
        sampled point. The points are tagged with the CRS of
        ``bus_routes_gdf``; EPSG:3857 is used only when the input has no CRS.
    """
    points_list = []
    for service_no, route_line in zip(bus_routes_gdf['ServiceNo'], bus_routes_gdf['geometry']):
        for pt in sample_unique_points_along_line(route_line, distance):
            points_list.append((service_no, pt))

    # The samples live in the coordinate space of the input lines, so they must
    # carry the input CRS rather than a fixed one.
    source_crs = getattr(bus_routes_gdf, 'crs', None)
    output_crs = source_crs if source_crs is not None else "EPSG:3857"
    return gpd.GeoDataFrame(points_list, columns=['ServiceNo', 'geometry'], geometry='geometry', crs=output_crs)

# Reproject both layers to EPSG:3857, the CRS in which the sampling step and the
# buffer distance below are applied.
bus_routes_gdf = bus_routes_lines_gdf.to_crs("EPSG:3857")
train_lines_gdf = train_lines_gdf.to_crs("EPSG:3857")

# Bus routes are turned into sampled points; the train lines are used as they are.
bus_route_points = generate_unique_bus_route_points(bus_routes_gdf)
train_line_points = train_lines_gdf

# Widen every train line by buffer_distance to get the area counted as "overlapping".
buffer_distance = 200
train_line_buffers = train_line_points.copy()
train_line_buffers['geometry'] = train_line_buffers.geometry.buffer(buffer_distance)

# Keep the sampled bus points that intersect any train line buffer.
overlapping_points = gpd.sjoin(bus_route_points, train_line_buffers, how="inner", predicate='intersects')

# Per bus service: number of sampled points, number of them inside a buffer,
# the resulting percentage, and the overlapping points as EPSG:4326 coordinates.
overlap_results = []
for service_no, group in bus_route_points.groupby('ServiceNo'):
    unique_total_stops = group.geometry.nunique()

    service_mask = overlapping_points['ServiceNo'] == service_no
    overlap_group = overlapping_points.loc[service_mask]
    unique_overlap_count = overlap_group.geometry.nunique()

    # Coordinates are reported in EPSG:4326 rather than in the working CRS.
    overlap_group = overlap_group.to_crs("EPSG:4326")
    if unique_overlap_count > 0:
        overlapping_coords = [pt.coords[0] for pt in overlap_group.geometry.unique()]
    else:
        overlapping_coords = []

    if unique_total_stops > 0:
        overlap_percentage = (unique_overlap_count / unique_total_stops) * 100
    else:
        overlap_percentage = 0

    overlap_results.append(dict(
        ServiceNo=service_no,
        OverlapCount=unique_overlap_count,
        TotalStops=unique_total_stops,
        OverlapPercentage=overlap_percentage,
        OverlappingStopCoordinates=overlapping_coords,
    ))

# Collect the per-service results in a table and preview it
overlap_results_df = pd.DataFrame(overlap_results)
print(overlap_results_df.head())


In [None]:
# Rebuild the results table and rank the services by overlap percentage, highest first.
overlap_results_df = pd.DataFrame(overlap_results).sort_values(by='OverlapPercentage', ascending=False)

print(overlap_results_df.head())

In [None]:
# Keep only the services whose number is purely numeric, then rank them by
# overlap percentage, highest first.
numeric_service_mask = overlap_results_df['ServiceNo'].str.isnumeric()
overlap_results_df = overlap_results_df.loc[numeric_service_mask]
overlap_results_df = overlap_results_df.sort_values(by='OverlapPercentage', ascending=False)

print(overlap_results_df.head(20))

## Overlapping with MRT Lines (Regardless of Line, with Line Count)
- Also counts the number of distinct train lines each bus service overlaps.

In [None]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, LineString
from shapely.ops import unary_union

# sample_unique_points_along_line and generate_unique_bus_route_points are defined
# in the earlier overlap cell of this notebook; they are reused here instead of
# being redefined with identical bodies.

# Reproject both layers to EPSG:3857, the CRS in which the sampling step and the
# buffer distance below are applied.
bus_routes_gdf = bus_routes_lines_gdf.to_crs("EPSG:3857")
train_lines_gdf = train_lines_gdf.to_crs("EPSG:3857")

# Bus routes are turned into sampled points; the train lines are used as they are.
bus_route_points = generate_unique_bus_route_points(bus_routes_gdf)
train_line_points = train_lines_gdf

# Widen every train line by buffer_distance to get the area counted as "overlapping".
buffer_distance = 200
train_line_buffers = train_line_points.copy()
train_line_buffers['geometry'] = train_line_buffers.geometry.buffer(buffer_distance)

# Keep the sampled bus points that intersect any train line buffer; the joined
# rows carry the Train_Line of the buffer they fell into.
overlapping_points = gpd.sjoin(bus_route_points, train_line_buffers, how="inner", predicate='intersects')

# Per bus service: sampled point count, overlapping point count, number of distinct
# train lines overlapped, percentage, and the overlapping points as EPSG:4326 coordinates.
overlap_results = []
for service_no, group in bus_route_points.groupby('ServiceNo'):
    unique_total_stops = group.geometry.nunique()  # distinct sampled points of this service

    # Overlapping points of this specific bus service
    overlap_group = overlapping_points[overlapping_points['ServiceNo'] == service_no]
    unique_overlap_count = overlap_group.geometry.nunique()

    # Distinct train lines whose buffer contains at least one of those points
    train_line_overlap_count = overlap_group['Train_Line'].nunique()

    # Coordinates are reported in EPSG:4326 rather than in the working CRS
    overlap_group = overlap_group.to_crs("EPSG:4326")
    overlapping_coords = [pt.coords[0] for pt in overlap_group.geometry.unique()] if unique_overlap_count > 0 else []

    overlap_percentage = (unique_overlap_count / unique_total_stops) * 100 if unique_total_stops > 0 else 0

    overlap_results.append({
        'ServiceNo': service_no,
        'OverlapCount': unique_overlap_count,
        'TrainLineOverlapCount': train_line_overlap_count,
        'TotalStops': unique_total_stops,
        'OverlapPercentage': overlap_percentage,
        'OverlappingStopCoordinates': overlapping_coords
    })

# Collect the per-service results in a table and preview it
overlap_results_df = pd.DataFrame(overlap_results)
print(overlap_results_df.head())


In [None]:
# Rebuild the results table and rank the services by overlap percentage, highest first.
overlap_results_df = pd.DataFrame(overlap_results).sort_values(by='OverlapPercentage', ascending=False)

print(overlap_results_df.head(20))

In [None]:
# Keep only the services whose number is purely numeric, then rank them by
# overlap percentage, highest first.
numeric_service_mask = overlap_results_df['ServiceNo'].str.isnumeric()
overlap_results_df = overlap_results_df.loc[numeric_service_mask]
overlap_results_df = overlap_results_df.sort_values(by='OverlapPercentage', ascending=False)

print(overlap_results_df.head(20))

## Overlapping with MRT Lines (Calculates overlap with each MRT Line)

In [None]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, LineString
from shapely.ops import unary_union

# sample_unique_points_along_line and generate_unique_bus_route_points are defined
# in the earlier overlap cell of this notebook; they are reused here instead of
# being redefined with identical bodies.

# Reproject both layers to EPSG:3857, the CRS in which the sampling step and the
# buffer distance below are applied.
bus_routes_gdf = bus_routes_lines_gdf.to_crs("EPSG:3857")
train_lines_gdf = train_lines_gdf.to_crs("EPSG:3857")

# Bus routes are turned into sampled points; the train lines are used as they are.
bus_route_points = generate_unique_bus_route_points(bus_routes_gdf)
train_line_points = train_lines_gdf

# Widen every train line by buffer_distance to get the area counted as "overlapping".
buffer_distance = 200
train_line_buffers = train_line_points.copy()
train_line_buffers['geometry'] = train_line_buffers.geometry.buffer(buffer_distance)

# Keep the sampled bus points that intersect a train line buffer; a point inside
# several buffers appears once per train line.
overlapping_points = gpd.sjoin(bus_route_points, train_line_buffers, how="inner", predicate='intersects')

# Distinct sampled points per service, computed once instead of re-filtering the
# whole point table for every (service, train line) pair.
total_points_by_service = {
    service_no: service_points.geometry.nunique()
    for service_no, service_points in bus_route_points.groupby('ServiceNo')
}

# One result row per (bus service, train line) pair that has at least one overlap
overlap_results = []
for (service_no, train_line_id), group in overlapping_points.groupby(['ServiceNo', 'Train_Line']):
    unique_total_stops = total_points_by_service[service_no]
    unique_overlap_count = group.geometry.nunique()  # distinct overlapping points for this train line

    overlap_percentage = (unique_overlap_count / unique_total_stops) * 100 if unique_total_stops > 0 else 0

    # Coordinates are reported in EPSG:4326 rather than in the working CRS
    group = group.to_crs("EPSG:4326")
    overlapping_coords = [pt.coords[0] for pt in group.geometry.unique()] if unique_overlap_count > 0 else []

    overlap_results.append({
        'ServiceNo': service_no,
        'TrainLineID': train_line_id,  # value of the 'Train_Line' column
        'OverlapCount': unique_overlap_count,
        'TotalStops': unique_total_stops,
        'OverlapPercentage': overlap_percentage,
        'OverlappingStopCoordinates': overlapping_coords
    })

# Collect the per-pair results in a table and preview it
overlap_results_df = pd.DataFrame(overlap_results)
print(overlap_results_df.head())


In [None]:
# Rebuild the results table and rank the rows by overlap percentage, highest first.
overlap_results_df = pd.DataFrame(overlap_results).sort_values(by='OverlapPercentage', ascending=False)

print(overlap_results_df.head(20))

In [None]:
# Keep only the services whose number is purely numeric, then rank the rows by
# overlap percentage, highest first.
numeric_service_mask = overlap_results_df['ServiceNo'].str.isnumeric()
overlap_results_df = overlap_results_df.loc[numeric_service_mask]
overlap_results_df = overlap_results_df.sort_values(by='OverlapPercentage', ascending=False)

print(overlap_results_df.head(20))