In [2]:
import os
import argparse
from time import time
from pathlib import Path
import pandas as pd     

## Read a file and display a few rows

In [3]:
 df = pd.read_csv('../data/230318.csv.gz', iterator=False,compression="gzip")
 df.head(10)

Unnamed: 0.1,Unnamed: 0,CA,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,0,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/11/2023,03:00:00,REGULAR,7841834,2793522
1,1,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/11/2023,07:00:00,REGULAR,7841838,2793529
2,2,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/11/2023,11:00:00,REGULAR,7841864,2793593
3,3,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/11/2023,15:00:00,REGULAR,7841929,2793634
4,4,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/11/2023,19:00:00,REGULAR,7842065,2793683
5,5,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/11/2023,23:00:00,REGULAR,7842124,2793709
6,6,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2023,04:00:00,REGULAR,7842135,2793724
7,7,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2023,08:00:00,REGULAR,7842141,2793742
8,8,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2023,12:00:00,REGULAR,7842174,2793772
9,9,A002,R051,02-00-00,59 ST,NQR456W,BMT,03/12/2023,16:00:00,REGULAR,7842238,2793823


## Create a new DateTime column and merge the DATE and TIME columns

In [4]:
# add the datetime column and drop the DATE and TIME columns
df['CREATED'] =  pd.to_datetime(df['DATE'] + ' ' + df['TIME'], format='%m/%d/%Y %H:%M:%S')
df = df.drop('DATE', axis=1).drop('TIME',axis=1)
df.head(10)

Unnamed: 0.1,Unnamed: 0,CA,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,CREATED
0,0,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7841834,2793522,2023-03-11 03:00:00
1,1,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7841838,2793529,2023-03-11 07:00:00
2,2,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7841864,2793593,2023-03-11 11:00:00
3,3,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7841929,2793634,2023-03-11 15:00:00
4,4,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7842065,2793683,2023-03-11 19:00:00
5,5,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7842124,2793709,2023-03-11 23:00:00
6,6,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7842135,2793724,2023-03-12 04:00:00
7,7,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7842141,2793742,2023-03-12 08:00:00
8,8,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7842174,2793772,2023-03-12 12:00:00
9,9,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7842238,2793823,2023-03-12 16:00:00


## Observations
- ENTRIES are the departing commuters
- EXITS are the arriving commuters
- STATION provides the location


In [24]:
# Aggregate the information by station and datetime
df["ENTRIES"] = df["ENTRIES"].astype(int)
df["EXITS"] = df["EXITS"].astype(int)
df_totals = df.groupby(["STATION","CREATED"], as_index=False)[["ENTRIES","EXITS"]].sum()
df_totals.head(10)

Unnamed: 0,STATION,CREATED,ENTRIES,EXITS
0,1 AV,2023-03-11 03:00:00,101049842,66622974
1,1 AV,2023-03-11 07:00:00,101049919,66623551
2,1 AV,2023-03-11 11:00:00,101050212,66624697
3,1 AV,2023-03-11 15:00:00,101051201,66628507
4,1 AV,2023-03-11 19:00:00,101052515,66634520
5,1 AV,2023-03-11 23:00:00,101053565,66639010
6,1 AV,2023-03-12 04:00:00,101054025,66640769
7,1 AV,2023-03-12 08:00:00,101054148,66641484
8,1 AV,2023-03-12 12:00:00,101054641,66643234
9,1 AV,2023-03-12 16:00:00,101055585,66646944


In [29]:
df_station_totals = df.groupby(["STATION"], as_index=False)[["ENTRIES","EXITS"]].sum()
df_station_totals.head(10)

Unnamed: 0,STATION,ENTRIES,EXITS
0,1 AV,4145926477,2737832303
1,103 ST,6016580098,5462024501
2,103 ST-CORONA,2957172596,3091946059
3,104 ST,69163548647,40215925954
4,110 ST,1825979272,1828245181
5,111 ST,5738499115,3050060852
6,116 ST,11087399416,10719360350
7,116 ST-COLUMBIA,50841258484,10943433303
8,121 ST,80191550644,21632609290
9,125 ST,287400176761,246131709303


## Show the total entries by station, use a subset of data

In [87]:
import plotly.express as px
import plotly.graph_objects as go
 
df_stations =  df_station_totals.head(25)
# pie_chart = px.pie(df_stations, values="ENTRIES",
#              names="STATION", title="Total Entries by Station", )          
# pie_chart.show()

donut_chart = go.Figure(data=[go.Pie(labels=df_stations["STATION"], values=df_stations["ENTRIES"], hole=.2)])
donut_chart.update_layout(title_text='Entries Distribution by Station', margin=dict(t=40, b=0, l=10, r=10))
donut_chart.show()

## Show the data by the day of the week

In [90]:
df_by_date = df_totals.groupby(["CREATED"], as_index=False)[["ENTRIES"]].sum()
day_order = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
df_by_date["WEEKDAY"] = pd.Categorical(df_by_date["CREATED"].dt.strftime('%a'), categories=day_order, ordered=True)
df_entries_by_date =  df_by_date.groupby(["WEEKDAY"], as_index=False)[["ENTRIES"]].sum()
df_entries_by_date.head(10)

Unnamed: 0,WEEKDAY,ENTRIES
0,Sun,1100520021546
1,Mon,1226715046221
2,Tue,1219808680891
3,Wed,1222141284515
4,Thu,1216016028249
5,Fri,1216148822371
6,Sat,1215735705925


In [91]:
bar_chart = go.Figure(data=[go.Bar(x=df_entries_by_date["WEEKDAY"], y=df_entries_by_date["ENTRIES"])])
bar_chart.update_layout(title_text='Total Entries by Week Day')
bar_chart.show()