In [1]:
# If not already installed, do: pip install pandas fastparquet
import pandas as pd

# Parquet file with the KTMB time series, hosted on storage.data.gov.my
URL_DATA = "https://storage.data.gov.my/dashboards/ktmb_timeseries.parquet"

df_dosm = pd.read_parquet(URL_DATA)

# Convert 'date' to datetime values, but only when the file actually has that column
has_date_column = 'date' in df_dosm.columns
if has_date_column:
    df_dosm['date'] = pd.to_datetime(df_dosm['date'])

print(df_dosm)

       service frequency         origin destination       date  passengers
0          ets     daily   All Stations  Alor Setar 2023-11-19         150
1          ets     daily   All Stations  Alor Setar 2023-11-20         335
2          ets     daily   All Stations  Alor Setar 2023-11-21         328
3          ets     daily   All Stations  Alor Setar 2023-11-22         325
4          ets     daily   All Stations  Alor Setar 2023-11-23         242
...        ...       ...            ...         ...        ...         ...
502035  tebrau   monthly  Woodlands CIQ  JB Sentral 2023-09-01      104415
502036  tebrau   monthly  Woodlands CIQ  JB Sentral 2023-10-01       98828
502037  tebrau   monthly  Woodlands CIQ  JB Sentral 2023-11-01      103053
502038  tebrau   monthly  Woodlands CIQ  JB Sentral 2023-12-01      119915
502039  tebrau   monthly  Woodlands CIQ  JB Sentral 2024-01-01       45676

[502040 rows x 6 columns]


## Monthly ridership figures for each individual origin/destination pair

- Remove rows where the origin or the destination is `All Stations`.
- Keep only the monthly figures.
- Sort the ridership figures from highest to lowest.

In [3]:
# Keep monthly rows in which neither the origin nor the destination
# is labelled 'All Stations'.
is_station_pair = (df_dosm['origin'] != 'All Stations') & (df_dosm['destination'] != 'All Stations')
is_monthly = df_dosm['frequency'] == 'monthly'
df_dosm_only = df_dosm[is_station_pair & is_monthly]

# Ten rows with the largest monthly passenger counts
df_dosm_only.sort_values(by='passengers', ascending=False).head(10)


Unnamed: 0,service,frequency,origin,destination,date,passengers
502025,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-12-01,170436
502020,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-07-01,165269
502018,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-05-01,164231
502016,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-03-01,164023
502021,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-08-01,163419
502019,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-06-01,162689
502023,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-10-01,159068
502022,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-09-01,158958
502017,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-04-01,157176
502024,tebrau,monthly,JB Sentral,Woodlands CIQ,2023-11-01,156451


## Monthly average ridership by origin/destination pair

In [5]:
# Rebuild the monthly subset, excluding rows labelled 'All Stations'
# on either the origin or the destination side.
keep = (
    df_dosm['origin'].ne('All Stations')
    & df_dosm['destination'].ne('All Stations')
    & df_dosm['frequency'].eq('monthly')
)
df_dosm_only = df_dosm.loc[keep]

# Mean passenger count per directed origin -> destination pair;
# astype(int) drops the fractional part of the mean.
# NOTE(review): the mean runs over every monthly row of a pair; if a pair can
# appear under more than one service, this averages across services - confirm.
df_grouped = (
    df_dosm_only
    .groupby(['origin', 'destination'])['passengers']
    .mean()
    .reset_index()
    .astype({'passengers': int})
)

# Twenty pairs with the highest average
df_grouped.sort_values(by='passengers', ascending=False).head(20)


Unnamed: 0,origin,destination,passengers
1655,JB Sentral,Woodlands CIQ,152671
6397,Woodlands CIQ,JB Sentral,98400
1845,KL Sentral,Butterworth,14617
1295,Butterworth,KL Sentral,13714
1852,KL Sentral,Ipoh,13406
1602,Ipoh,KL Sentral,13332
1155,Bukit Mertajam,KL Sentral,9523
1844,KL Sentral,Bukit Mertajam,9317
3519,Padang Besar,Butterworth,7495
151,Alor Setar,Butterworth,7005


## Monthly average ridership by combining return journeys of origin/destination pair
e.g. Origin: KLCC - Destination: KL Sentral is combined with Origin: KL Sentral - Destination: KLCC, and the monthly averages of the two directions are added together.

In [6]:
import numpy as np

# Put the two station names of every row in alphabetical order so that
# A -> B and B -> A end up with the same (origin, destination) key.
# The sorted values are assigned as a plain NumPy array, i.e. strictly by
# position: assigning a new DataFrame instead would be aligned on index labels
# and only works while df_grouped carries a default 0..n-1 index.
pair_columns = ['origin', 'destination']
df_grouped[pair_columns] = np.sort(df_grouped[pair_columns].to_numpy(), axis=1)

# Add up the passenger figures of both directions of each station pair
result_df = df_grouped.groupby(pair_columns)['passengers'].sum().reset_index()

# Combine 'origin' and 'destination' into 'origin-destination-pair'
result_df['origin-destination-pair'] = result_df['origin'] + '-' + result_df['destination']

# Keep only the pair label and its combined passenger figure
result_df = result_df[['origin-destination-pair', 'passengers']]

# Rank once and reuse the ranking for both the CSV export and the preview
result_ranked = result_df.sort_values(by='passengers', ascending=False)
result_ranked.to_csv('ktm_riderships')

# Show the ten busiest station pairs
result_ranked.head(10)

Unnamed: 0,origin-destination-pair,passengers
1554,JB Sentral-Woodlands CIQ,251071
1213,Butterworth-KL Sentral,28331
1506,Ipoh-KL Sentral,26738
1098,Bukit Mertajam-KL Sentral,18840
1223,Butterworth-Padang Besar,14180
150,Alor Setar-Butterworth,13461
651,Batu Caves-KL Sentral,12348
278,Arau-Butterworth,11657
701,Batu Gajah-KL Sentral,11518
1745,KL Sentral-Sungai Petani,11122


In [7]:
import pandas as pd
import plotly.express as px

# Order the pairs from the lowest to the highest passenger figure before plotting
sorted_df = result_df.sort_values(by='passengers', ascending=True)

# Interactive scatter plot: one point per station pair, with the passenger
# figure drawn as a text label and the pair name used as the hover title.
fig = px.scatter(
    sorted_df,
    x='origin-destination-pair',
    y='passengers',
    text='passengers',
    hover_name='origin-destination-pair',
    labels={'origin-destination-pair': 'Origin-Destination Pair'},
    title='Passenger Counts by Origin-Destination Pairs',
)

# Place each text label above its marker
fig.update_traces(textposition='top center')

# Name both axes, then hide the x-axis tick labels
fig.update_layout(xaxis_title='Origin-Destination Pair', yaxis_title='Passengers')
fig.update_xaxes(showticklabels=False)

# Render the interactive figure
fig.show()

## Average monthly ridership grouped by origin only

In [9]:
# Monthly rows whose origin is a named station (not 'All Stations') and whose
# destination is labelled 'All Stations'.
origin_rows = df_dosm[
    (df_dosm['origin'] != 'All Stations')
    & (df_dosm['destination'] == 'All Stations')
    & (df_dosm['frequency'] == 'monthly')
]

# Mean monthly passenger count per origin; astype(int) drops the fraction
df_dosm_origin = (
    origin_rows
    .groupby(['origin'])['passengers']
    .mean()
    .reset_index()
    .astype({'passengers': int})
)

# Origins with the highest average
df_dosm_origin.sort_values(by='passengers', ascending=False).head(13)


Unnamed: 0,origin,passengers
45,KL Sentral,55190
30,Butterworth,29895
131,Sungai Petani,25536
2,Alor Setar,23237
5,Arau,22707
26,Bukit Mertajam,20565
38,Ipoh,16291
28,Bukit Tengah,15690
126,Subang Jaya,15582
89,Padang Besar,13049


In [10]:
# Twenty origins with the lowest average passenger count (sort_values is ascending by default)
df_dosm_origin.sort_values(by='passengers').head(20)


Unnamed: 0,origin,passengers
65,Kodiang,12
123,Sri Bintang,15
133,Sungai Sirian,22
6,Aur Gading,22
57,Kg Berkam,23
124,Sri Jaya,27
129,Sungai Keladi,28
32,Chicha Tinggi,33
24,Bukit Betong,37
134,Sungai Tasin,42
