## Import Libraries

In [None]:
!pip install xlrd geopy

In [None]:
# !pip install python-dotenv
import requests
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from dotenv import load_dotenv
import os
os.environ['OGR_GEOMETRY_ACCEPT_UNCLOSED_RING'] = 'NO'
import zipfile
import shutil
import numpy as np
import pandas as pd
import folium
import branca.colormap as cm
from shapely.geometry import Point, LineString

# Display every row and column when a frame is rendered (no truncation)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the API key from the local env file (falls back to an already-exported variable)
load_dotenv("../key.env")
api_key = os.getenv("API_KEY")
if not api_key:
    # Fail here with a clear message instead of later inside the HTTP helper
    raise RuntimeError(
        "API_KEY is not set: add it to ../key.env or export it in the environment "
        "before running this notebook."
    )

## Import Datasets

In [None]:
%run get_bus_info_function.ipynb
bus_services_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusServices", api_key)
bus_routes_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusRoutes", api_key)
bus_stops_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusStops", api_key)
train_stations = pd.read_excel("../datasets/Train_Stations.xls")

In [None]:
%run get_geospatial_function.ipynb
geospatial_train_path = "../datasets/geospatial_layer/TrainStation_Jul2024/RapidTransitSystemStation.shp"
geospatial_train_gdf = gpd.read_file(geospatial_train_path)
geospatial_train_gdf['geometry'] = geospatial_train_gdf['geometry'].buffer(0)

## Data Preprocessing

In [None]:
# Service numbers of TRUNK services only.
# BusServices may list the same ServiceNo more than once (presumably one row per
# direction — confirm against the API payload). De-duplicate so the merge below
# does not repeat every route stop once per duplicate.
trunk_buses_df = bus_services_df.loc[
    bus_services_df['Category'] == "TRUNK", 'ServiceNo'
].drop_duplicates()

# Route stops that belong to trunk services
trunk_bus_routes_df = pd.merge(trunk_buses_df, bus_routes_df,
                               on='ServiceNo', how='inner')
trunk_bus_routes_df = trunk_bus_routes_df[['ServiceNo', 'Direction', 'StopSequence', 'BusStopCode']]

# Attach stop coordinates; stops missing from bus_stops_df are dropped by the inner join
geospatial_bus_route = pd.merge(trunk_bus_routes_df, bus_stops_df,
                                on='BusStopCode', how='inner')
geospatial_bus_route = geospatial_bus_route[['ServiceNo', 'Direction', 'StopSequence', 'BusStopCode', 'Latitude', 'Longitude']]
geospatial_bus_route.head()

## Bus Lines

In [None]:
# Build one point per route stop from its longitude/latitude and tag the frame as EPSG:4326
stop_points = gpd.points_from_xy(
    x=geospatial_bus_route['Longitude'],
    y=geospatial_bus_route['Latitude'],
)
geospatial_bus_route_gdf = gpd.GeoDataFrame(geospatial_bus_route, geometry=stop_points, crs="EPSG:4326")

# Group by each service and direction, and create LineStrings based on sorted StopSequence
def create_line(group):
    """Return a LineString through the group's geometries, ordered by StopSequence."""
    ordered_stops = group.sort_values(by='StopSequence')
    stop_points = ordered_stops.geometry.tolist()
    return LineString(stop_points)

# One LineString per (ServiceNo, Direction) pair
route_groups = geospatial_bus_route_gdf.groupby(['ServiceNo', 'Direction'])
bus_routes_lines = route_groups.apply(create_line).reset_index()
bus_routes_lines.columns = ['ServiceNo', 'Direction', 'geometry']

# Wrap the result as a GeoDataFrame in the same CRS as the stop points
bus_routes_lines_gdf = gpd.GeoDataFrame(
    bus_routes_lines,
    geometry='geometry',
    crs="EPSG:4326",
)

# Preview the first rows
print(bus_routes_lines_gdf.head())

In [None]:
# Keep only the route lines of service "10"
is_service_10 = bus_routes_lines_gdf['ServiceNo'] == "10"
service_10_routes = bus_routes_lines_gdf[is_service_10]
print(service_10_routes['geometry'].head())

fig, ax = plt.subplots(figsize=(10, 6))

# Draw one labelled line per row (one row per direction)
for direction, line in zip(service_10_routes['Direction'], service_10_routes['geometry']):
    x, y = line.xy  # coordinate sequences of the LineString
    ax.plot(x, y, linewidth=2, label=f"Direction {direction}")

# Collapse repeated legend labels; for a repeated label the last handle is kept
handles, labels = ax.get_legend_handles_labels()
by_label = {label: handle for label, handle in zip(labels, handles)}
ax.legend(by_label.values(), by_label.keys())

# Title and axis labels
ax.set(
    title="Bus Service No. 10 Route",
    xlabel="Longitude",
    ylabel="Latitude",
)

plt.show()

## Train Lines

In [None]:
# Merge all shapes that share a station name (STN_NAM_DE) into one geometry,
# keeping the first value of every other attribute
unioned_gdf = geospatial_train_gdf.dissolve(by='STN_NAM_DE', aggfunc='first')

# Represent each station by the centroid of its merged shape
station_centroids = unioned_gdf.centroid
unioned_gdf['centroid'] = station_centroids
unioned_gdf['geometry'] = unioned_gdf['centroid']

# Move STN_NAM_DE out of the index and back into a regular column
unioned_gdf = unioned_gdf.reset_index()

# Function to normalize station names in train_stations_df
def normalize_station_name(name):
    """Return `name` with surrounding whitespace removed and converted to upper case."""
    trimmed = name.strip()
    return trimmed.upper()

# Trimmed, upper-case station names for consistent matching
train_stations['Normalized_Station'] = train_stations['MRT_Station'].apply(normalize_station_name)

# Build the merge key: the normalized name plus " LRT STATION" when MRT_Line
# mentions "LRT", otherwise " MRT STATION"
train_stations['Station_MRT_LRT'] = train_stations.apply(
    lambda row: f"{row['Normalized_Station']} MRT STATION" if "LRT" not in row['MRT_Line'] else f"{row['Normalized_Station']} LRT STATION",
    axis=1
)

# Normalize the geospatial station names the same way (trim + upper case).
# The " MRT STATION" / " LRT STATION" suffix is deliberately kept, because the
# merge key built above includes it.
unioned_gdf['Normalized_Station'] = unioned_gdf['STN_NAM_DE'].str.strip().str.upper()

# Left merge: every row of train_stations is kept; stations with no matching
# name in unioned_gdf end up with a missing geometry
merged_train_stations = train_stations.merge(
    unioned_gdf,
    how='left',
    left_on='Station_MRT_LRT',
    right_on='Normalized_Station'
)

merged_train_stations = merged_train_stations[['Station_Code', 'MRT_Station', 'MRT_Line', 'TYP_CD_DES', 'geometry']]

# Rebuild a GeoDataFrame from the merged pandas frame. The CRS is passed
# explicitly so the reprojection below does not depend on the geometry column
# having carried its CRS through the pandas merge.
gdf = gpd.GeoDataFrame(merged_train_stations, geometry='geometry', crs=unioned_gdf.crs)

# Reproject to EPSG:4326 (WGS 84 - latitude/longitude)
gdf_4326 = gdf.to_crs(epsg=4326)

# Extract Longitude and Latitude from the reprojected points
gdf_4326['Longitude'] = gdf_4326.geometry.x
gdf_4326['Latitude'] = gdf_4326.geometry.y

# Back to a plain DataFrame; .copy() so the column assignments below write to
# an independent frame rather than a slice (avoids SettingWithCopyWarning)
geospatial_train_station = pd.DataFrame(gdf_4326)
geospatial_train_station = geospatial_train_station[['Station_Code', 'MRT_Station', 'MRT_Line', 'Longitude', 'Latitude']].copy()

# Split the station code into its leading letters and its first run of digits;
# codes without digits get station number 1
geospatial_train_station['Train_Line'] = geospatial_train_station['Station_Code'].str.extract(r'([A-Za-z]+)')
geospatial_train_station['Station_No'] = geospatial_train_station['Station_Code'].str.extract(r'(\d+)').fillna(1).astype(int)
geospatial_train_station.head()

In [None]:
# Stations that found no geometry in the earlier left merge have NaN coordinates.
# A line cannot be drawn through them, so they are left out before building points.
located_stations = geospatial_train_station.dropna(subset=['Longitude', 'Latitude'])

# Convert to GeoDataFrame
train_stations_gdf = gpd.GeoDataFrame(
    located_stations,
    geometry=gpd.points_from_xy(located_stations.Longitude, located_stations.Latitude),
    crs="EPSG:4326"
)

# Order stations along each line so consecutive points form a continuous path
train_stations_gdf = train_stations_gdf.sort_values(by=['Train_Line', 'Station_No'])

# One LineString per train line; a line with a single station cannot form a
# LineString and yields None
train_lines_gdf = train_stations_gdf.groupby('Train_Line').apply(
    lambda group: LineString(group.geometry.tolist()) if len(group) > 1 else None
).reset_index(name='geometry')

# Drop the lines that produced no geometry
train_lines_gdf = train_lines_gdf[train_lines_gdf['geometry'].notna()]

# Wrap as a GeoDataFrame: one row per train line
train_lines_gdf = gpd.GeoDataFrame(train_lines_gdf, geometry='geometry', crs="EPSG:4326")

# Display the first few rows to confirm
print(train_lines_gdf.head())

In [None]:
# Distinct train line identifiers to draw
unique_lines = train_lines_gdf['Train_Line'].unique()

# One colour per line. `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and
# removed in 3.9; `plt.get_cmap` takes the same (name, lut) arguments.
colors = plt.get_cmap('tab10', len(unique_lines))

fig, ax = plt.subplots(figsize=(10, 10))

# Plot each train line with its own colour and legend entry
for i, line in enumerate(unique_lines):
    line_data = train_lines_gdf[train_lines_gdf['Train_Line'] == line]
    line_data.plot(ax=ax, color=colors(i), linewidth=2, label=line)

ax.legend()
ax.set_title("Train Stations and Routes")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.grid(True)
plt.show()

## DBSCAN (first attempt — incorrect, kept commented out for reference)

In [None]:
# import pandas as pd
# from sklearn.cluster import DBSCAN
# from geopy.distance import geodesic

# # Select necessary columns
# bus_stops = geospatial_bus_route[['ServiceNo', 'StopSequence', 'BusStopCode', 'Latitude', 'Longitude']].copy()
# bus_stops['Type'] = 'Bus'
# train_stations = geospatial_train_station[['Station_Code', 'MRT_Line', 'Latitude', 'Longitude', 'Station_No']].copy()
# train_stations['Type'] = 'Train'

# # Combine bus stops and train stations
# combined_points = pd.concat([bus_stops, train_stations], ignore_index=True)

# # Step 1: Run DBSCAN to find clusters based on proximity
# coordinates = combined_points[['Latitude', 'Longitude']].to_numpy()
# db = DBSCAN(eps=0.005, min_samples=3, metric='haversine').fit(coordinates)  # eps in radians for haversine
# combined_points['Cluster'] = db.labels_

# # Step 2: Identify clusters with both bus and train points, grouped by bus route and train line
# parallel_segments = []
# overlap_counts = []
# for cluster_id in combined_points['Cluster'].unique():
#     cluster_points = combined_points[combined_points['Cluster'] == cluster_id]
    
#     # Check if cluster has both bus and train points
#     if set(cluster_points['Type']) == {'Bus', 'Train'}:
        
#         # Group by ServiceNo for buses and MRT_Line for trains to ensure route/line continuity
#         bus_groups = cluster_points[cluster_points['Type'] == 'Bus'].groupby('ServiceNo')
#         train_groups = cluster_points[cluster_points['Type'] == 'Train'].groupby('MRT_Line')
        
#         # Step 3: Process each ServiceNo and MRT_Line in the cluster
#         for bus_route, bus_cluster in bus_groups:
#             bus_cluster = bus_cluster.sort_values(by='StopSequence')
            
#             for train_line, train_cluster in train_groups:
#                 train_cluster = train_cluster.sort_values(by='Station_No')
                
#                 # Identify overlapping bus stops with train stations
#                 overlap_stations = []
#                 for _, bus_stop in bus_cluster.iterrows():
#                     for _, train_stop in train_cluster.iterrows():
#                         if geodesic((bus_stop['Latitude'], bus_stop['Longitude']),
#                                     (train_stop['Latitude'], train_stop['Longitude'])).meters <= 500:  # within 500 meters
#                             overlap_stations.append(bus_stop['BusStopCode'])
#                             break  # Stop further checking once an overlap is found for this bus stop
                
#                 # Check if there's any overlap, then proceed
#                 if overlap_stations:
#                     # Get the first and last overlapping bus stops
#                     start_bus_stop = overlap_stations[0]
#                     end_bus_stop = overlap_stations[-1]
                    
#                     # Get start and end train stations based on overlap proximity
#                     start_station = train_cluster.iloc[0]['Station_Code']
#                     end_station = train_cluster.iloc[-1]['Station_Code']
                    
#                     # Calculate the parallel distance along overlapping train stations
#                     total_distance = 0.0
#                     for i in range(len(train_cluster) - 1):
#                         start = (train_cluster.iloc[i]['Latitude'], train_cluster.iloc[i]['Longitude'])
#                         end = (train_cluster.iloc[i + 1]['Latitude'], train_cluster.iloc[i + 1]['Longitude'])
#                         total_distance += geodesic(start, end).km
                    
#                     # Append the result for each identified parallel segment
#                     parallel_segments.append({
#                         'Train_Line': train_line,
#                         'Bus_Route': bus_route,
#                         'Start_Bus_Stop': start_bus_stop,
#                         'End_Bus_Stop': end_bus_stop,
#                         'Start_Station': start_station,
#                         'End_Station': end_station,
#                         'Total_Parallel_Distance_km': total_distance,
#                         'Overlap_Bus_Stops': len(overlap_stations),
#                         'Cluster_ID': cluster_id
#                     })
                    
#                     # Collect overlap counts for overall grouping
#                     overlap_counts.append({
#                         'ServiceNo': bus_route,
#                         'Overlap_Count': len(overlap_stations)
#                     })

# # Step 4: Create DataFrame for parallel segments
# parallel_segments_df = pd.DataFrame(parallel_segments)
# print("Parallel Segments DataFrame:")
# print(parallel_segments_df.head())

## DBSCAN Test

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points

# Projected CRS in metres (SVY21 / Singapore TM), used only to measure lengths
METRIC_EPSG = 3414
# DBSCAN neighbourhood radius; in degrees because clustering runs on raw lat/lon values
CLUSTER_EPS_DEG = 0.005
# Minimum number of stops for DBSCAN to form a cluster
CLUSTER_MIN_STOPS = 2

SEGMENT_COLUMNS = [
    'Train_Line', 'Bus_Route', 'Start_Bus_Stop', 'End_Bus_Stop',
    'Start_Station', 'End_Station', 'Total_Parallel_Distance_km',
    'Overlap_Bus_Stops', 'Cluster_ID',
]


def _nearest_station_code(line_stations, latitude, longitude):
    """Return the Station_Code of the row in `line_stations` closest to a coordinate.

    Closeness is ranked by squared difference in degrees of latitude/longitude.
    NOTE(review): this is a ranking heuristic, not a geodesic distance; switch to
    projected coordinates if exact nearest-station results matter.

    Returns None when `line_stations` has no rows.
    """
    if line_stations.empty:
        return None
    squared_offset = (
        (line_stations['Latitude'] - latitude) ** 2
        + (line_stations['Longitude'] - longitude) ** 2
    )
    return line_stations.iloc[squared_offset.to_numpy().argmin()]['Station_Code']


parallel_segments = []

# Stations with usable coordinates; bus stop rows carry no station code, so the
# start/end station of a segment is looked up here by proximity
located_stations = geospatial_train_station.dropna(subset=['Latitude', 'Longitude'])

for _, bus_row in bus_routes_lines_gdf.iterrows():
    bus_service = bus_row['ServiceNo']
    bus_route_geom = bus_row['geometry']  # LINESTRING of the bus route

    # Restrict to the stops of this service AND direction once per bus route,
    # instead of scanning the whole stop table for every bus/train pair
    route_stops = geospatial_bus_route[
        (geospatial_bus_route['ServiceNo'] == bus_service)
        & (geospatial_bus_route['Direction'] == bus_row['Direction'])
    ]
    route_stop_points = [
        Point(lon, lat)
        for lon, lat in zip(route_stops['Longitude'], route_stops['Latitude'])
    ]

    for _, train_row in train_lines_gdf.iterrows():
        train_line = train_row['Train_Line']
        train_line_geom = train_row['geometry']  # LINESTRING of the train line

        # Shared geometry between the bus route and the train line
        intersection = bus_route_geom.intersection(train_line_geom)
        if intersection.is_empty:
            continue

        # Stops lying exactly on the shared geometry.
        # NOTE(review): an exact `intersects` test only matches stops that sit
        # precisely on the intersection; consider a distance tolerance if this
        # yields too few matches.
        on_intersection = np.array(
            [stop_point.intersects(intersection) for stop_point in route_stop_points],
            dtype=bool,
        )
        # .copy() so adding Cluster_ID below writes to an independent frame
        overlapping_stops = route_stops.loc[on_intersection].copy()
        if overlapping_stops.empty:
            continue

        # Cluster the overlapping stops by coordinate proximity
        coords = overlapping_stops[['Latitude', 'Longitude']].values
        db = DBSCAN(eps=CLUSTER_EPS_DEG, min_samples=CLUSTER_MIN_STOPS).fit(coords)
        overlapping_stops['Cluster_ID'] = db.labels_

        # The lines are in EPSG:4326 (degrees), so reproject the shared geometry
        # to a metric CRS before converting its length to kilometres
        intersection_km = (
            gpd.GeoSeries([intersection], crs=bus_routes_lines_gdf.crs)
            .to_crs(epsg=METRIC_EPSG)
            .length.iloc[0] / 1000
        )

        line_stations = located_stations[located_stations['Train_Line'] == train_line]

        for cluster_id in np.unique(db.labels_):
            if cluster_id == -1:
                # DBSCAN labels noise points -1; they do not form a segment
                continue

            cluster_stops = overlapping_stops[
                overlapping_stops['Cluster_ID'] == cluster_id
            ].sort_values('StopSequence')
            start_stop = cluster_stops.iloc[0]
            end_stop = cluster_stops.iloc[-1]

            parallel_segments.append({
                'Train_Line': train_line,
                'Bus_Route': bus_service,
                'Start_Bus_Stop': start_stop['BusStopCode'],
                'End_Bus_Stop': end_stop['BusStopCode'],
                'Start_Station': _nearest_station_code(
                    line_stations, start_stop['Latitude'], start_stop['Longitude']),
                'End_Station': _nearest_station_code(
                    line_stations, end_stop['Latitude'], end_stop['Longitude']),
                'Total_Parallel_Distance_km': intersection_km,
                'Overlap_Bus_Stops': len(cluster_stops),
                'Cluster_ID': cluster_id
            })

# Convert to DataFrame; explicit columns keep the schema stable when no segment was found
parallel_segments_df = pd.DataFrame(parallel_segments, columns=SEGMENT_COLUMNS)

print(parallel_segments_df.head())

In [None]:
# Debug aid: reports the loop variables left over from the cell above, i.e. only
# the LAST bus/train pair that was processed
last_pair_message = f"Intersection for {bus_service} and {train_line}: {intersection}"
print(last_pair_message)

In [None]:
# example_df = parallel_segments_df[parallel_segments_df['Bus_Route'] == '961M']
# print(example_df)

In [None]:
# Step 5: Group by ServiceNo to get total overlap counts and sort
overlap_counts_df = pd.DataFrame(overlap_counts)
total_overlap_df = overlap_counts_df.groupby('ServiceNo').sum().reset_index()
total_overlap_df = total_overlap_df.sort_values(by='Overlap_Count', ascending=False)

print("Total Overlap Counts by ServiceNo (sorted):")
print(total_overlap_df.head(20))

In [None]:
# # Step 1: Set up the base map centered around Singapore
# singapore_lat, singapore_lon = 1.3521, 103.8198
# m = folium.Map(location=[singapore_lat, singapore_lon], zoom_start=12, tiles='CartoDB positron')

# # Step 2: Create a color map for the train lines
# unique_lines = train_lines_gdf['Train_Line'].unique()
# colors = plt.cm.get_cmap('tab10', len(unique_lines))  # Using 'tab10' colormap for distinct colors

# # Step 3: Add each train line to the map with a unique color
# color_map_dict = {}  # Dictionary to keep track of colors assigned to each train line

# for i, line in enumerate(unique_lines):
#     line_data = train_lines_gdf[train_lines_gdf['Train_Line'] == line]
    
#     # Convert GeoDataFrame to GeoJSON
#     geo_json = line_data.to_json()

#     # Get a distinct color for each line
#     line_color = f"#{''.join([hex(int(c*255))[2:].zfill(2) for c in colors(i)[:3]])}"
#     color_map_dict[line] = line_color  # Save color for legend

#     # Add GeoJSON to folium map with unique color
#     folium.GeoJson(
#         geo_json,
#         name=f"Train Line {line}",
#         style_function=lambda feature, color=line_color: {
#             'color': color,
#             'weight': 3
#         }
#     ).add_to(m)

# # Step 4: Add a Layer Control to toggle visibility
# folium.LayerControl().add_to(m)

# # Step 5: Optional: Add a legend to explain the colors (using folium directly to create a custom legend)
# legend_html = '''
# <div style="position: fixed; 
#             bottom: 50px; left: 50px; width: 200px; height: auto; 
#             background-color: white; z-index: 1000; padding: 10px; border-radius: 5px;">
# <h4>Train Lines Legend</h4>
# <ul style="list-style-type: none; padding: 0;">
# '''
# for line, color in color_map_dict.items():
#     legend_html += f'<li><span style="background:{color};width:15px;height:15px;display:inline-block;margin-right:10px;"></span>{line}</li>'

# legend_html += '</ul></div>'

# m.get_root().html.add_child(folium.Element(legend_html))

# # Step 6: Save and Display the Map
# m.save("../datasets/singapore_map_with_train_lines.html")
# m

