## Importing libraries

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from dotenv import load_dotenv
import os
import zipfile
import shutil
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import ast  # To safely evaluate string representations of dictionaries/lists

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Retrieving api key
load_dotenv("../key.env")
api_key = os.getenv("API_KEY")

## Reading datasets

In [2]:
%run get_bus_info_function.ipynb
bus_services_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusServices", api_key)
bus_routes_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusRoutes", api_key)
bus_stops_df = get_bus_info("https://datamall2.mytransport.sg/ltaodataservice/BusStops", api_key)

In [12]:
bus_67 = bus_services_df[bus_services_df['ServiceNo'] == "67"]
print(bus_67)

    ServiceNo Operator  Direction Category OriginCode DestinationCode  \
539        67     SMRT          1    TRUNK      44009           75009   
540        67     SMRT          2    TRUNK      75009           44009   

    AM_Peak_Freq AM_Offpeak_Freq PM_Peak_Freq PM_Offpeak_Freq LoopDesc  
539        08-09           08-11        10-11           09-11           
540        08-09           08-11        10-12           09-10           


In [3]:
top_20_overlap_service = pd.read_csv("../datasets/overlap_routes/filtered_final_results.csv").head(20)
top_20_overlap_service['ServiceNo'] = top_20_overlap_service['ServiceNo'].astype(str)
print(top_20_overlap_service.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             20 non-null     int64  
 1   ServiceNo              20 non-null     object 
 2   TotalStops             20 non-null     int64  
 3   WeightedParallelScore  20 non-null     float64
 4   TrainLineParallelData  20 non-null     object 
dtypes: float64(1), int64(2), object(2)
memory usage: 928.0+ bytes
None


## Availability of alternative bus routes

In [4]:
# Filter for Trunk Services
trunk_buses_df = bus_services_df[bus_services_df['Category'] == "TRUNK"]
trunk_buses_df = trunk_buses_df['ServiceNo']
trunk_bus_routes_df = pd.merge(trunk_buses_df, bus_routes_df, on='ServiceNo', how='inner')

# Select relevant columns
trunk_bus_routes_df = trunk_bus_routes_df[['ServiceNo', 'Direction', 'StopSequence', 'BusStopCode']]

# Sort the trunk bus routes by Direction and StopSequence
trunk_bus_routes_df.sort_values(by=['ServiceNo', 'Direction', 'StopSequence'], inplace=True)

# Group by ServiceNo and create an ordered list of BusStopCodes for each service
ordered_stops = trunk_bus_routes_df.groupby('ServiceNo').agg(
    OrderedBusStops=('BusStopCode', lambda x: list(x))
).reset_index()

# Display the result
print(ordered_stops.head())

  ServiceNo                                    OrderedBusStops
0        10  [75009, 75009, 76059, 76059, 76069, 76069, 962...
1       100  [66009, 66009, 62129, 62129, 62119, 62119, 610...
2      100A  [66009, 62129, 62119, 61049, 61039, 70069, 700...
3       101  [66009, 62131, 62141, 63011, 63021, 63031, 630...
4       102  [64009, 64041, 64051, 64061, 64401, 67739, 677...


In [7]:
# A list to store results
results = []
matched_service_numbers = set()  # Set to track unique matched ServiceNo

# Iterate over each row in top_20_pv_trend
for idx, row in top_20_overlap_service.iterrows():
    # Convert TrainLineParallelData string to dictionary
    parallel_data = ast.literal_eval(row['TrainLineParallelData'])
    original_service_no = row['ServiceNo']  # Save the original ServiceNo
    
    for line, data in parallel_data.items():
        for entry in data:
            parallel_stop_codes = entry['ParallelStopCodes']
            # Check against the ordered_stops
            for _, stop_row in ordered_stops.iterrows():
                stop_codes = stop_row['OrderedBusStops']
                # Check if all ParallelStopCodes are present in StopCodes
                if all(code in stop_codes for code in parallel_stop_codes):
                    matched_service_no = stop_row['ServiceNo']
                    # Check if the matched service number is unique
                    if matched_service_no not in matched_service_numbers:
                        matched_service_numbers.add(matched_service_no)
                        # If found, append the result
                        results.append((original_service_no, matched_service_no, line))

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results, columns=['Original_ServiceNo', 'Matched_ServiceNo', 'TrainLine'])

# Drop duplicates based on Original_ServiceNo and Matched_ServiceNo
results_df = results_df.drop_duplicates()

# Remove rows where Matched_ServiceNo is equal to Original_ServiceNo
results_df = results_df[results_df['Matched_ServiceNo'] != results_df['Original_ServiceNo']]

print(results_df)

    Original_ServiceNo Matched_ServiceNo TrainLine
1                   67               974        BP
2                   67              974A        BP
3                   67               976        BP
4                   67                 2        EW
5                   67               139        NE
6                   67               147        NE
7                   67                23        NE
8                   67                64        NE
9                   67                65        NE
10                  67               857        NE
13                 107              107M        DT
14                 107               175        DT
15                 107                61        DT
16                 107               100        NS
17                 107                56        NS
18                 107                57        NS
19                 107               961        NS
20                 107              961M        NS
21                 107         