In [2]:
# Run the helper notebook first. get_bus_info is called further down but is not defined
# in this notebook, so it is presumably provided by this helper — confirm.
%run get_bus_info_function.ipynb

In [1]:
# !pip install xlrd
import requests
import pandas as pd
import geopandas as gpd
from math import radians, cos, sin, sqrt, atan2
from dotenv import load_dotenv
import os

# Show every row when a DataFrame is rendered (None disables pandas' row truncation).
pd.set_option('display.max_rows', None)
# Set before any geospatial file is read.
# NOTE(review): presumably consumed by GDAL/OGR when the station layer is loaded —
# confirm it is still required.
os.environ['OGR_GEOMETRY_ACCEPT_UNCLOSED_RING'] = 'NO'

# Retrieving api key
# The key lives in ../key.env (relative to the working directory) so it is never
# hard-coded in the notebook.
load_dotenv("../key.env")
api_key = os.getenv("API_KEY")
if not api_key:
    # Stop here with a clear message instead of failing later inside the first API call.
    raise RuntimeError(
        "API_KEY is not set: add it to ../key.env or export it before running this notebook."
    )

## Reading in data

In [3]:
# Load the raw inputs.

# 1) Bus routes and bus stops, fetched through the get_bus_info helper with the API key
bus_routes_df = get_bus_info(
    "https://datamall2.mytransport.sg/ltaodataservice/BusRoutes",
    api_key,
)
bus_stops_df = get_bus_info(
    "https://datamall2.mytransport.sg/ltaodataservice/BusStops",
    api_key,
)

# 2) Station code / station name / line lookup table from a local Excel workbook
train_stations = pd.read_excel(io="../datasets/Train_Stations.xls")

# 3) Train station polygons from the local geospatial layer
geospatial_train_path = "../datasets/geospatial_layer/TrainStation_Jul2024"
geospatial_train_gdf = gpd.read_file(geospatial_train_path)


In [4]:
# Column names, non-null counts and dtypes of the station lookup table
train_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Station_Code  211 non-null    object
 1   MRT_Station   211 non-null    object
 2   MRT_Line      211 non-null    object
dtypes: object(3)
memory usage: 5.1+ KB


In [5]:
# Preview the first rows of the station lookup table
train_stations.head()

Unnamed: 0,Station_Code,MRT_Station,MRT_Line
0,NS1,Jurong East,North-South Line
1,NS2,Bukit Batok,North-South Line
2,NS3,Bukit Gombak,North-South Line
3,NS4,Choa Chu Kang,North-South Line
4,NS5,Yew Tee,North-South Line


In [14]:
# Column names, non-null counts and dtypes of the bus route table
bus_routes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25544 entries, 0 to 25543
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ServiceNo     25544 non-null  object 
 1   Operator      25544 non-null  object 
 2   Direction     25544 non-null  int64  
 3   StopSequence  25544 non-null  int64  
 4   BusStopCode   25544 non-null  object 
 5   Distance      25544 non-null  float64
 6   WD_FirstBus   25544 non-null  object 
 7   WD_LastBus    25544 non-null  object 
 8   SAT_FirstBus  25544 non-null  object 
 9   SAT_LastBus   25544 non-null  object 
 10  SUN_FirstBus  25544 non-null  object 
 11  SUN_LastBus   25544 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


In [None]:
# Preview the first 25 bus route rows
bus_routes_df.head(n=25)

In [8]:
# Column names, non-null counts and dtypes of the train station geospatial layer
geospatial_train_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   TYP_CD      230 non-null    int32   
 1   STN_NAM     0 non-null      object  
 2   ATTACHEMEN  51 non-null     object  
 3   TYP_CD_DES  230 non-null    object  
 4   STN_NAM_DE  230 non-null    object  
 5   geometry    230 non-null    geometry
dtypes: geometry(1), int32(1), object(4)
memory usage: 10.0+ KB


In [10]:
# Preview up to the first 150 rows of the train station geospatial layer
geospatial_train_gdf.head(n=150)

Unnamed: 0,TYP_CD,STN_NAM,ATTACHEMEN,TYP_CD_DES,STN_NAM_DE,geometry
0,0,,,MRT,GALI BATU DEPOT,"POLYGON ((19210.615 41858.041, 19223.517 41756..."
1,0,,,MRT,HILLVIEW MRT STATION,"POLYGON ((20650.333 38282.331, 20654.77 38298...."
2,0,,,MRT,BEAUTY WORLD MRT STATION,"POLYGON ((21594.717 35882.935, 21584.857 35880..."
3,0,,,MRT,HUME MRT STATION,"POLYGON ((20807.997 37457.716, 20815.376 37460..."
4,0,,,MRT,BUKIT PANJANG MRT STATION,"POLYGON ((19996.27 40187.205, 20028.77 40127.2..."
5,0,,,MRT,CASHEW MRT STATION,"POLYGON ((20340.379 39136.76, 20354.684 39114...."
6,0,,,MRT,DHOBY GHAUT MRT STATION,"POLYGON ((29293.514 31312.527, 29360.102 31290..."
7,0,,,MRT,LAVENDER MRT STATION,"POLYGON ((31236.498 32085.764, 31234.137 32088..."
8,0,,,LRT,RENJONG LRT STATION,"POLYGON ((34382.656 40949.641, 34324.539 40966..."
9,0,,,MRT,DOVER MRT STATION,"POLYGON ((21987.246 32576.906, 21977.178 32581..."


## Data Pre-Processing

### Bus Stops

In [34]:
# Attach each route row's bus stop record (which carries Latitude/Longitude) by joining
# on BusStopCode. how='left' keeps every route row even when no stop record matches.
merged_bus_routes = bus_routes_df.merge(bus_stops_df, how='left', on='BusStopCode')

# Sanity check: number of route rows left without a coordinate after the join.
# Printed for Latitude first, then Longitude.
print(merged_bus_routes['Latitude'].isnull().sum())
print(merged_bus_routes['Longitude'].isnull().sum())

0
0


### Train Stations

In [None]:
# Collapse the station layer to one row per STN_NAM_DE: dissolve unions all geometries
# that share the same station name.
unioned_gdf = geospatial_train_gdf.dissolve(by='STN_NAM_DE')

# Represent each station by the centroid of its unioned geometry. The point is kept in
# a 'centroid' column and also written over the 'geometry' column.
# NOTE(review): the coordinates shown in the layer preview do not look like lat/long —
# confirm the CRS before comparing these points with bus stop Latitude/Longitude.
unioned_gdf['centroid'] = unioned_gdf.geometry.centroid
unioned_gdf['geometry'] = unioned_gdf['centroid']

# dissolve moved STN_NAM_DE into the index; turn it back into an ordinary column.
unioned_gdf = unioned_gdf.reset_index()

In [24]:
# Function to normalize station names in train_stations
def normalize_station_name(name):
    """Return ``name`` with leading and trailing whitespace removed.

    Only whitespace trimming happens here; any case folding is left to the caller.
    Values that are not strings (for example ``None`` or ``NaN`` from a missing
    cell) are returned unchanged instead of raising ``AttributeError``, mirroring
    how the pandas ``.str`` accessor passes missing values through.

    Parameters
    ----------
    name : str or any
        Raw station name.

    Returns
    -------
    str or any
        The trimmed name, or ``name`` itself when it is not a string.
    """
    if not isinstance(name, str):
        return name
    return name.strip()

# Build a common join key on both tables: the upper-cased station name without the
# " MRT STATION" / " LRT STATION" suffix carried by the geospatial layer.

# train_stations: trim whitespace, then upper-case to match the layer's upper-case names
train_stations['Normalized_Station'] = train_stations['MRT_Station'].apply(normalize_station_name).str.upper()

# unioned_gdf: drop the station-type suffix from STN_NAM_DE
unioned_gdf['Normalized_Station'] = unioned_gdf['STN_NAM_DE'].replace({' MRT STATION': '', ' LRT STATION': ''}, regex=True).str.strip()

# Left merge keeps every train_stations row; a station with no matching name in the
# layer gets missing values in the layer's columns.
# NOTE(review): the saved output of this cell lists NS4 / Choa Chu Kang twice, so at
# least one station matches more than one layer row — presumably an MRT and an LRT
# entry that normalise to the same name. Confirm, and de-duplicate if one row per
# station is required downstream.
merged_train_stations = train_stations.merge(
    unioned_gdf,
    how='left',
    on='Normalized_Station',
)

# #keeping necessary columns
# columns_to_keep = ['Station_Code','MRT_Station', 'MRT_Line', 'TYP_CD_DES', 'geometry']
# # Display the merged DataFrame
# merged_train_stations = merged_train_stations[columns_to_keep]

# Preview the merged table (up to the first 250 rows). Left as the cell's last
# expression so the notebook renders it as a table instead of wrapped text.
merged_train_stations.head(n=250)


    Station_Code         MRT_Station                    MRT_Line  \
0            NS1         Jurong East           North-South Line    
1            NS2         Bukit Batok           North-South Line    
2            NS3        Bukit Gombak           North-South Line    
3            NS4       Choa Chu Kang           North-South Line    
4            NS4       Choa Chu Kang           North-South Line    
5            NS5             Yew Tee           North-South Line    
6            NS7              Kranji           North-South Line    
7            NS8           Marsiling           North-South Line    
8            NS9           Woodlands           North-South Line    
9           NS10           Admiralty           North-South Line    
10          NS11           Sembawang           North-South Line    
11          NS12            Canberra           North-South Line    
12          NS13              Yishun           North-South Line    
13          NS14              Khatib           N

In [95]:
# Preview the merged station table (up to the first 250 rows). Left as the cell's last
# expression so the notebook renders it as a table instead of wrapped text.
merged_train_stations.head(n=250)

    Station_Code        MRT_Station                    MRT_Line TYP_CD_DES  \
0            NS1        Jurong East           North-South Line         MRT   
1            NS2        Bukit Batok           North-South Line         MRT   
2            NS3       Bukit Gombak           North-South Line         MRT   
3            NS4      Choa Chu Kang           North-South Line         MRT   
4            NS4      Choa Chu Kang           North-South Line         LRT   
5            NS5            Yew Tee           North-South Line         MRT   
6            NS7             Kranji           North-South Line         MRT   
7            NS8          Marsiling           North-South Line         MRT   
8            NS9          Woodlands           North-South Line         MRT   
9            NS9          Woodlands           North-South Line         MRT   
10          NS10          Admiralty           North-South Line         MRT   
11          NS11          Sembawang           North-South Line  