## Importing libraries

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from dotenv import load_dotenv
import os
import zipfile
import shutil
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Display settings: never truncate rows or columns when printing DataFrames.
pd.set_option('display.max_rows', None, 'display.max_columns', None)

# Pull variables from the local env file into the process environment,
# then look up the API key among them (None if the variable is not set).
load_dotenv("../key.env")
api_key = os.environ.get("API_KEY")

## Reading datasets

In [2]:
# Keep only the first 20 rows of the overlap results (the file is presumably
# pre-sorted by overlap score — TODO confirm) and force ServiceNo to string:
# service numbers can carry letter suffixes (e.g. "100A") and the column is
# used as a join key further down.
top_20_overlap_service = (
    pd.read_csv("../datasets/overlap_routes/filtered_final_results.csv")
    .head(20)
    .assign(ServiceNo=lambda routes: routes['ServiceNo'].astype(str))
)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits an extra "None" line.
top_20_overlap_service.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             20 non-null     int64  
 1   ServiceNo              20 non-null     object 
 2   TotalStops             20 non-null     int64  
 3   WeightedParallelScore  20 non-null     float64
 4   TrainLineParallelData  20 non-null     object 
dtypes: float64(1), int64(2), object(2)
memory usage: 928.0+ bytes
None


## Feature 1: Passenger Volume

In [3]:
# Passenger-volume trend table, one row per service. ServiceNo is read as a
# string so it always has the same dtype as the string ServiceNo key of the
# overlap table it is merged with, regardless of what pandas would infer.
pv_trend = pd.read_csv(
    "../datasets/pv_eda/routes_pv_trend.csv",
    dtype={"ServiceNo": str},
)
print(pv_trend.head())

  ServiceNo     2024-07     2024-08     2024-09       Trend  Trend_Binary  \
0        10  22843384.0  23052832.0  21563610.0  Increasing             1   
1       100  15513408.0  15394684.0  14991330.0  Decreasing             0   
2      100A   1458770.0   1434772.0   1390170.0  Decreasing             0   
3       101   4203807.0   4119248.0   3918423.0  Decreasing             0   
4       102   5460389.0   5304970.0   5025364.0  Decreasing             0   

      Change       Average  
0 -1279774.0  2.248661e+07  
1  -522078.0  1.529981e+07  
2   -68600.0  1.427904e+06  
3  -285384.0  4.080493e+06  
4  -435025.0  5.263574e+06  


In [4]:
# Attach the passenger-volume columns to the top-20 overlap routes.
# Inner join: only services present in both tables are kept.
top_20_pv_trend = top_20_overlap_service.merge(
    pv_trend,
    how="inner",
    on="ServiceNo",
)
print(top_20_pv_trend.head())

   Unnamed: 0 ServiceNo  TotalStops  WeightedParallelScore  \
0         254        67         163               1.792254   
1         219        48         105               1.513026   
2          12       107          79               1.419038   
3         196        36          61               1.380328   
4          38       122          39               1.323077   

                               TrainLineParallelData     2024-07     2024-08  \
0  {'BP': [{'ParallelStopCodes': ['44461', '44451...  40651926.0  40107168.0   
1  {'CC': [{'ParallelStopCodes': ['41021', '41119...   9789624.0   9783592.0   
2  {'DT': [{'ParallelStopCodes': ['60039', '60029...  11919460.0  11668796.0   
3  {'CC': [{'ParallelStopCodes': ['02149', '02171...   2731454.0   2755434.0   
4  {'EW': [{'ParallelStopCodes': ['06011', '06051...   2524376.0   2495384.0   

      2024-09       Trend  Trend_Binary     Change       Average  
0  37781528.0  Decreasing             0 -2870398.0  3.951354e+07  
1   9240800.