In [3]:
# Execute the helper notebook inside this kernel. get_bus_info() is called further
# down but is not defined in this notebook, so it is presumably defined there — verify.
%run get_bus_info_function.ipynb

In [1]:
import requests
import xlrd
import pandas as pd
import geopandas as gpd
from math import radians, cos, sin, sqrt, atan2
from dotenv import load_dotenv
import os

# Do not truncate DataFrame output: every row is shown when a frame is displayed.
pd.set_option('display.max_rows', None)
# NOTE(review): presumably a GDAL/OGR option controlling how unclosed polygon rings
# are handled when the geospatial layer is read — confirm against the GDAL docs.
os.environ['OGR_GEOMETRY_ACCEPT_UNCLOSED_RING'] = 'NO'

# Retrieving api key from the env file so it never appears in the notebook itself
load_dotenv("../key.env")
api_key = os.getenv("API_KEY")

# Fail fast with a clear message: os.getenv returns None when the variable is
# missing, and that None would otherwise be passed on to the API calls below.
if not api_key:
    raise RuntimeError(
        "API_KEY is not set; define it in ../key.env or in the process environment"
    )

## Reading in data

In [4]:
# Bus routes and bus stops are fetched through get_bus_info using the API key
bus_routes_url = "https://datamall2.mytransport.sg/ltaodataservice/BusRoutes"
bus_stops_url = "https://datamall2.mytransport.sg/ltaodataservice/BusStops"
bus_routes_df = get_bus_info(bus_routes_url, api_key)
bus_stops_df = get_bus_info(bus_stops_url, api_key)

# Station lookup table (code, name, line) from the local Excel file
train_stations_path = "../datasets/Train_Stations.xls"
train_stations = pd.read_excel(train_stations_path)

# Station geometries from the local geospatial layer
geospatial_train_path = "../datasets/geospatial_layer/TrainStation_Jul2024"
geospatial_train_gdf = gpd.read_file(geospatial_train_path)


In [5]:
# Column names, dtypes and non-null counts of the station lookup table
train_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Station_Code  211 non-null    object
 1   MRT_Station   211 non-null    object
 2   MRT_Line      211 non-null    object
dtypes: object(3)
memory usage: 5.1+ KB


In [7]:
# Preview the first rows of the station lookup table
train_stations.head()

Unnamed: 0,Station_Code,MRT_Station,MRT_Line
0,NS1,Jurong East,North-South Line
1,NS2,Bukit Batok,North-South Line
2,NS3,Bukit Gombak,North-South Line
3,NS4,Choa Chu Kang,North-South Line
4,NS5,Yew Tee,North-South Line


In [8]:
# Column names, dtypes and non-null counts of the bus route table
bus_routes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25544 entries, 0 to 25543
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ServiceNo     25544 non-null  object 
 1   Operator      25544 non-null  object 
 2   Direction     25544 non-null  int64  
 3   StopSequence  25544 non-null  int64  
 4   BusStopCode   25544 non-null  object 
 5   Distance      25544 non-null  float64
 6   WD_FirstBus   25544 non-null  object 
 7   WD_LastBus    25544 non-null  object 
 8   SAT_FirstBus  25544 non-null  object 
 9   SAT_LastBus   25544 non-null  object 
 10  SUN_FirstBus  25544 non-null  object 
 11  SUN_LastBus   25544 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


In [9]:
# Preview the first 25 route rows
bus_routes_df.head(25)

Unnamed: 0,ServiceNo,Operator,Direction,StopSequence,BusStopCode,Distance,WD_FirstBus,WD_LastBus,SAT_FirstBus,SAT_LastBus,SUN_FirstBus,SUN_LastBus
0,10,SBST,1,1,75009,0.0,500,2300,500,2300,500,2300
1,10,SBST,1,2,76059,0.6,502,2302,502,2302,502,2302
2,10,SBST,1,3,76069,1.1,504,2304,504,2304,503,2304
3,10,SBST,1,4,96289,2.3,508,2308,508,2309,507,2308
4,10,SBST,1,5,96109,2.7,509,2310,509,2311,508,2309
5,10,SBST,1,6,85079,3.3,511,2312,511,2313,510,2311
6,10,SBST,1,7,85089,3.5,512,2313,512,2314,511,2312
7,10,SBST,1,8,85069,3.8,513,2314,513,2315,512,2313
8,10,SBST,1,9,85059,4.1,514,2315,514,2316,513,2314
9,10,SBST,1,10,85049,4.5,515,2316,515,2317,514,2315


In [10]:
# Column names, dtypes and non-null counts of the station geometry layer
geospatial_train_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   TYP_CD      230 non-null    int32   
 1   STN_NAM     0 non-null      object  
 2   ATTACHEMEN  51 non-null     object  
 3   TYP_CD_DES  230 non-null    object  
 4   STN_NAM_DE  230 non-null    object  
 5   geometry    230 non-null    geometry
dtypes: geometry(1), int32(1), object(4)
memory usage: 10.0+ KB


In [None]:
# Preview the first 150 rows of the station geometry layer
geospatial_train_gdf.head(150)

## Data Pre-Processing

### Bus Stop 

In [16]:
# Merge bus_routes_df with bus_stops_df to get the lat-long of each bus stop.
# how='left' keeps every route row, so a stop code that is absent from
# bus_stops_df ends up with NaN coordinates instead of being dropped.
merged_bus_routes = pd.merge(bus_routes_df, bus_stops_df, on='BusStopCode', how='left')

# Check every row for missing coordinates; printing head() alone cannot show
# whether later rows are affected. Silent when the data is clean.
rows_missing_coords = merged_bus_routes[['Latitude', 'Longitude']].isna().any(axis=1)
if rows_missing_coords.any():
    print(f"Warning: {int(rows_missing_coords.sum())} route rows have no latitude/longitude")

print(merged_bus_routes.head())

  ServiceNo Operator  Direction  StopSequence BusStopCode  Distance  \
0        10     SBST          1             1       75009       0.0   
1        10     SBST          1             2       76059       0.6   
2        10     SBST          1             3       76069       1.1   
3        10     SBST          1             4       96289       2.3   
4        10     SBST          1             5       96109       2.7   

  WD_FirstBus WD_LastBus SAT_FirstBus SAT_LastBus SUN_FirstBus SUN_LastBus  \
0        0500       2300         0500        2300         0500        2300   
1        0502       2302         0502        2302         0502        2302   
2        0504       2304         0504        2304         0503        2304   
3        0508       2308         0508        2309         0507        2308   
4        0509       2310         0509        2311         0508        2309   

          RoadName           Description  Latitude   Longitude  
0  Tampines Ctrl 1          Tampines In

### Train Stations

In [8]:
# Combine all geometries that share a station name into a single geometry per
# station; non-geometry columns take the first value found within each group.
unioned_gdf = geospatial_train_gdf.dissolve(by='STN_NAM_DE', aggfunc='first')

# Centroid of each combined station geometry, kept in its own column
station_centroids = unioned_gdf.centroid
unioned_gdf['centroid'] = station_centroids

# Use the centroid point as the station geometry from here on
unioned_gdf['geometry'] = unioned_gdf['centroid']

# Move the station name from the index back into a regular column
unioned_gdf = unioned_gdf.reset_index()

In [9]:
# Preview the first 10 stations after dissolving and taking centroids
unioned_gdf.head(10)

Unnamed: 0,STN_NAM_DE,geometry,TYP_CD,STN_NAM,ATTACHEMEN,TYP_CD_DES,centroid
0,ADMIRALTY MRT STATION,POINT (24400.883 46918.344),0,,,MRT,POINT (24400.883 46918.344)
1,ALJUNIED MRT STATION,POINT (33518.605 33189.987),0,,,MRT,POINT (33518.605 33189.987)
2,ANG MO KIO MRT STATION,POINT (29813.745 39107.484),0,,,MRT,POINT (29813.745 39107.484)
3,BAKAU LRT STATION,POINT (36035.791 41115.238),0,,,LRT,POINT (36035.791 41115.238)
4,BANGKIT LRT STATION,POINT (21249.598 40220.704),0,,,LRT,POINT (21249.598 40220.704)
5,BARTLEY MRT STATION,POINT (33164.551 36110.747),0,,CC12_BLY STN.zip,MRT,POINT (33164.551 36110.747)
6,BAYFRONT MRT STATION,POINT (30866.641 29368.417),0,,,MRT,POINT (30866.641 29368.417)
7,BAYSHORE MRT STATION,POINT (40092.809 32807.696),0,,,MRT,POINT (40092.809 32807.696)
8,BEAUTY WORLD MRT STATION,POINT (21600.914 35929.067),0,,,MRT,POINT (21600.914 35929.067)
9,BEDOK MRT STATION,POINT (38778.877 34028.172),0,,,MRT,POINT (38778.877 34028.172)


In [13]:
def normalize_station_name(name):
    """Return a station name trimmed of surrounding whitespace and upper-cased.

    Used so station names compare consistently when merging.

    Args:
        name: Station name as a string.

    Returns:
        The trimmed, upper-case form of ``name``.
    """
    trimmed = name.strip()
    return trimmed.upper()

# Trimmed, upper-case station names for the lookup table
train_stations['Normalized_Station'] = train_stations['MRT_Station'].apply(normalize_station_name)

# Build the full station label used as the join key: the normalized name plus
# " LRT STATION" when the line name contains "LRT", otherwise " MRT STATION".
# Vectorised string operations replace the former row-by-row apply(axis=1).
is_lrt_line = train_stations['MRT_Line'].str.contains('LRT', regex=False)
station_suffix = is_lrt_line.map({True: ' LRT STATION', False: ' MRT STATION'})
train_stations['Station_MRT_LRT'] = train_stations['Normalized_Station'] + station_suffix

# Normalize the geospatial station names the same way (trim + upper-case).
# The " MRT STATION" / " LRT STATION" suffix is NOT stripped: it is part of the join key.
unioned_gdf['Normalized_Station'] = unioned_gdf['STN_NAM_DE'].str.strip().str.upper()

# Left merge on 'Station_MRT_LRT' (train_stations) vs 'Normalized_Station' (unioned_gdf):
# every lookup row is kept, and rows without a geospatial match get a missing geometry.
merged_train_stations = train_stations.merge(
    unioned_gdf,
    how='left',
    left_on='Station_MRT_LRT',
    right_on='Normalized_Station'
)

# Report join misses explicitly instead of leaving them to be spotted by eye.
# Silent when every station found a geometry.
unmatched_stations = merged_train_stations.loc[
    merged_train_stations['geometry'].isna(), 'Station_MRT_LRT'
]
if not unmatched_stations.empty:
    print(
        f"Warning: {len(unmatched_stations)} station rows have no matching geometry: "
        f"{sorted(unmatched_stations.unique())}"
    )

# Keeping necessary columns
columns_to_keep = ['Station_Code', 'MRT_Station', 'MRT_Line', 'TYP_CD_DES', 'geometry']
merged_train_stations = merged_train_stations[columns_to_keep]

# Check the resulting column names and sample data
print(merged_train_stations.head())


  Station_Code    MRT_Station           MRT_Line TYP_CD_DES  \
0          NS1    Jurong East  North-South Line         MRT   
1          NS2    Bukit Batok  North-South Line         MRT   
2          NS3   Bukit Gombak  North-South Line         MRT   
3          NS4  Choa Chu Kang  North-South Line         MRT   
4          NS5        Yew Tee  North-South Line         MRT   

                      geometry  
0  POINT (17866.487 35045.184)  
1  POINT (18676.448 36790.872)  
2  POINT (18940.178 37860.706)  
3  POINT (18101.056 40790.989)  
4  POINT (18438.643 42159.628)  


In [15]:
# Print up to 212 rows of the merged table so each station's match can be inspected
print(merged_train_stations.head(212))

    Station_Code         MRT_Station                    MRT_Line TYP_CD_DES  \
0            NS1         Jurong East           North-South Line         MRT   
1            NS2         Bukit Batok           North-South Line         MRT   
2            NS3        Bukit Gombak           North-South Line         MRT   
3            NS4       Choa Chu Kang           North-South Line         MRT   
4            NS5             Yew Tee           North-South Line         MRT   
5            NS7              Kranji           North-South Line         MRT   
6            NS8           Marsiling           North-South Line         MRT   
7            NS9           Woodlands           North-South Line         MRT   
8           NS10           Admiralty           North-South Line         MRT   
9           NS11           Sembawang           North-South Line         MRT   
10          NS12            Canberra           North-South Line         MRT   
11          NS13              Yishun           North