## Importing libraries

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from dotenv import load_dotenv
import os
import zipfile
import shutil
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Retrieving api key
load_dotenv("../key.env")
api_key = os.getenv("API_KEY")

## Reading datasets

In [2]:
top_20_overlap_service = pd.read_csv("../datasets/overlap_routes/filtered_final_results.csv").head(20)
top_20_overlap_service['ServiceNo'] = top_20_overlap_service['ServiceNo'].astype(str)
print(top_20_overlap_service.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             20 non-null     int64  
 1   ServiceNo              20 non-null     object 
 2   TotalStops             20 non-null     int64  
 3   WeightedParallelScore  20 non-null     float64
 4   TrainLineParallelData  20 non-null     object 
dtypes: float64(1), int64(2), object(2)
memory usage: 928.0+ bytes
None


## Feature 1: Passenger Volume

In [3]:
pv_trend = pd.read_csv("../datasets/pv_eda/routes_pv_trend.csv")
print(pv_trend.head())

  ServiceNo     2024-07     2024-08     2024-09       Trend  Trend_Binary  \
0        10  22843384.0  23052832.0  21563610.0  Increasing             1   
1       100  15513408.0  15394684.0  14991330.0  Decreasing             0   
2      100A   1458770.0   1434772.0   1390170.0  Decreasing             0   
3       101   4203807.0   4119248.0   3918423.0  Decreasing             0   
4       102   5460389.0   5304970.0   5025364.0  Decreasing             0   

      Change       Average  
0 -1279774.0  2.248661e+07  
1  -522078.0  1.529981e+07  
2   -68600.0  1.427904e+06  
3  -285384.0  4.080493e+06  
4  -435025.0  5.263574e+06  


In [4]:
top_20_pv_trend = pd.merge(top_20_overlap_service, pv_trend, on="ServiceNo", how="inner")
print(top_20_pv_trend.head())

   Unnamed: 0 ServiceNo  TotalStops  WeightedParallelScore  \
0         254        67         163               1.792254   
1         219        48         105               1.513026   
2          12       107          79               1.419038   
3         196        36          61               1.380328   
4          38       122          39               1.323077   

                               TrainLineParallelData     2024-07     2024-08  \
0  {'BP': [{'ParallelStopCodes': ['44461', '44451...  40651926.0  40107168.0   
1  {'CC': [{'ParallelStopCodes': ['41021', '41119...   9789624.0   9783592.0   
2  {'DT': [{'ParallelStopCodes': ['60039', '60029...  11919460.0  11668796.0   
3  {'CC': [{'ParallelStopCodes': ['02149', '02171...   2731454.0   2755434.0   
4  {'EW': [{'ParallelStopCodes': ['06011', '06051...   2524376.0   2495384.0   

      2024-09       Trend  Trend_Binary     Change       Average  
0  37781528.0  Decreasing             0 -2870398.0  3.951354e+07  
1   9240800.

In [7]:
# Normalize 'Change' and 'Average' by 'TotalStops'
top_20_pv_trend['Change_Normalized'] = top_20_pv_trend['Change'] / top_20_pv_trend['TotalStops']
top_20_pv_trend['Average_Normalized'] = top_20_pv_trend['Average'] / top_20_pv_trend['TotalStops']

# Calculate z-score for 'Change_Normalized' across the entire dataset
change_mean = top_20_pv_trend['Change_Normalized'].mean()
change_std = top_20_pv_trend['Change_Normalized'].std()
top_20_pv_trend['Change_Standardized'] = (top_20_pv_trend['Change_Normalized'] - change_mean) / change_std

# Adjust signs based on 'Trend'
top_20_pv_trend.loc[top_20_pv_trend['Trend'] == 'Decreasing', 'Change_Standardized'] = \
    -np.abs(top_20_pv_trend.loc[top_20_pv_trend['Trend'] == 'Decreasing', 'Change_Standardized'])

top_20_pv_trend.loc[top_20_pv_trend['Trend'] == 'Increasing', 'Change_Standardized'] = \
    np.abs(top_20_pv_trend.loc[top_20_pv_trend['Trend'] == 'Increasing', 'Change_Standardized'])

# Scale 'Average_Normalized' to the range 0-1 for the entire dataset
scaler = MinMaxScaler(feature_range=(0, 1))
top_20_pv_trend['Average_Standardized'] = scaler.fit_transform(top_20_pv_trend[['Average_Normalized']])

top_20_pv_trend = top_20_pv_trend[['ServiceNo', 'TotalStops', 'WeightedParallelScore', 'TrainLineParallelData', 
                                   'Trend_Binary', 'Change_Standardized', 'Average_Standardized']]

# Preview the resulting DataFrame
print(top_20_pv_trend)

   ServiceNo  TotalStops  WeightedParallelScore  \
0         67         163               1.792254   
1         48         105               1.513026   
2        107          79               1.419038   
3         36          61               1.380328   
4        122          39               1.323077   
5        170         130               1.241430   
6          2         131               1.195465   
7         63         102               1.188235   
8         65         128               1.184811   
9        100         111               1.180604   
10        24         100               1.174000   
11        80         118               1.173851   
12       851         108               1.150687   
13       111          56               1.146429   
14       134          36               1.127778   
15       158          69               1.124638   
16        13         129               1.099351   
17       133          86               1.083658   
18       147         146       

## Count of alternative bus routes

In [9]:
%run get_bus_info_function.ipynb
bus_services_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusServices", api_key)
bus_routes_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusRoutes", api_key)
bus_stops_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusStops", api_key)

In [12]:
# Filter for Trunk Services
trunk_buses_df = bus_services_df[bus_services_df['Category'] == "TRUNK"]
trunk_buses_df = trunk_buses_df['ServiceNo']
trunk_bus_routes_df = pd.merge(trunk_buses_df, bus_routes_df, on='ServiceNo', how='inner')

# Select relevant columns
trunk_bus_routes_df = trunk_bus_routes_df[['ServiceNo', 'Direction', 'StopSequence', 'BusStopCode']]

# Sort the trunk bus routes by Direction and StopSequence
trunk_bus_routes_df.sort_values(by=['ServiceNo', 'Direction', 'StopSequence'], inplace=True)

# Group by ServiceNo and create an ordered list of BusStopCodes for each service
ordered_stops = trunk_bus_routes_df.groupby('ServiceNo').agg(
    OrderedBusStops=('BusStopCode', lambda x: list(x))
).reset_index()

# Display the result
print(ordered_stops.head())

  ServiceNo                                    OrderedBusStops
0        10  [75009, 75009, 76059, 76059, 76069, 76069, 962...
1       100  [66009, 66009, 62129, 62129, 62119, 62119, 610...
2      100A  [66009, 62129, 62119, 61049, 61039, 70069, 700...
3       101  [66009, 62131, 62141, 63011, 63021, 63031, 630...
4       102  [64009, 64041, 64051, 64061, 64401, 67739, 677...
