In [1]:
# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq

# Which monthly extract to load; the file name encodes the year and the
# zero-padded month.
month = 1
year = 2023

raw_dir = Path('..') / 'data' / 'raw'
path = raw_dir / f"citi_rides_{year}_{month:02}.parquet"

# Read the parquet file with pyarrow, then convert the table to a DataFrame.
table = pq.read_table(path)
rides = table.to_pandas()
rides.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,DC1CB984821DFFF7,classic_bike,2023-01-07 15:36:53.430,2023-01-07 15:39:45.406,Vesey St & Church St,5216.06,Albany St & Greenwich St,5145.02,40.71222,-74.010472,40.709267,-74.013247,member
1,C00CA02971557F16,classic_bike,2023-01-04 19:23:01.234,2023-01-04 19:34:57.151,Lispenard St & Broadway,5391.06,St Marks Pl & 1 Ave,5626.13,40.719392,-74.002472,40.727791,-73.985649,member
2,C753AE5EBD8458F9,classic_bike,2023-01-20 09:22:19.894,2023-01-20 10:23:24.255,3 Ave & Schermerhorn St,4437.01,State St & Smith St,4522.07,40.686832,-73.979677,40.68917,-73.9886,member
3,E4415A543C1972A7,classic_bike,2023-01-24 10:38:01.135,2023-01-24 10:41:40.749,E 5 St & Ave A,5626.06,E 1 St & 1 Ave,5593.01,40.72479,-73.984301,40.723356,-73.98865,member
4,BD52A87B215877C7,electric_bike,2023-01-13 10:17:38.192,2023-01-13 10:33:59.099,W 54 St & 11 Ave,6955.05,Washington St & Gansevoort St,6039.06,40.768292,-73.992563,40.739323,-74.008119,member


In [5]:
# Work on an independent (deep) copy so that `rides` itself is not modified
# by the column additions and conversions performed on `rides_cp`.
rides_cp = rides.copy(deep=True)
# Print the column dtypes and memory footprint of the working copy.
rides_cp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795412 entries, 0 to 1795411
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 178.1+ MB


In [6]:
# Parse both timestamp columns column-wise. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
time_cols = ['started_at', 'ended_at']
rides_cp[time_cols] = rides_cp[time_cols].apply(pd.to_datetime, errors='coerce')

In [8]:
# Ride length as a timedelta: end timestamp minus start timestamp.
rides_cp['duration'] = rides_cp['ended_at'].sub(rides_cp['started_at'])
rides_cp.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
0,DC1CB984821DFFF7,classic_bike,2023-01-07 15:36:53.430,2023-01-07 15:39:45.406,Vesey St & Church St,5216.06,Albany St & Greenwich St,5145.02,40.71222,-74.010472,40.709267,-74.013247,member,0 days 00:02:51.976000
1,C00CA02971557F16,classic_bike,2023-01-04 19:23:01.234,2023-01-04 19:34:57.151,Lispenard St & Broadway,5391.06,St Marks Pl & 1 Ave,5626.13,40.719392,-74.002472,40.727791,-73.985649,member,0 days 00:11:55.917000
2,C753AE5EBD8458F9,classic_bike,2023-01-20 09:22:19.894,2023-01-20 10:23:24.255,3 Ave & Schermerhorn St,4437.01,State St & Smith St,4522.07,40.686832,-73.979677,40.68917,-73.9886,member,0 days 01:01:04.361000
3,E4415A543C1972A7,classic_bike,2023-01-24 10:38:01.135,2023-01-24 10:41:40.749,E 5 St & Ave A,5626.06,E 1 St & 1 Ave,5593.01,40.72479,-73.984301,40.723356,-73.98865,member,0 days 00:03:39.614000
4,BD52A87B215877C7,electric_bike,2023-01-13 10:17:38.192,2023-01-13 10:33:59.099,W 54 St & 11 Ave,6955.05,Washington St & Gansevoort St,6039.06,40.768292,-73.992563,40.739323,-74.008119,member,0 days 00:16:20.907000


In [9]:
# Summary statistics of ride duration (transposing a Series is a no-op, so the
# summary is shown directly).
rides_cp['duration'].describe()

count                      1795412
mean     0 days 00:14:56.528263825
std      0 days 08:09:42.277669378
min         0 days 00:00:11.832000
25%         0 days 00:04:52.974750
50%         0 days 00:08:18.412500
75%         0 days 00:14:15.858000
max       263 days 22:37:05.076000
Name: duration, dtype: object

In [13]:
# Extremes of the duration distribution: minimum, 1st, 99th and 99.9th
# percentiles, each shown as its own output.
durations = rides_cp["duration"]
durations.quantile(0)
durations.quantile(0.01)
durations.quantile(0.99)
durations.quantile(0.999)

Timedelta('0 days 00:00:11.832000')

Timedelta('0 days 00:01:19.154000')

Timedelta('0 days 01:00:27.271579999')

Timedelta('0 days 21:01:27.282577002')

In [14]:
# Keep rides with a strictly positive duration of at most one hour. For the
# January 2023 extract the 99th percentile of duration is about 1h00m27s, so
# the one-hour cap removes roughly the longest 1% of rides.
# Rows whose duration is NaT compare False on both sides and are rejected too.
duration_filter = (rides_cp["duration"] > pd.Timedelta(0)) & (rides_cp["duration"] <= pd.Timedelta(hours=1))
# Number of rides rejected by the duration filter. Series.sum() reduces the
# boolean mask in vectorized code; the builtin sum() would iterate over every
# element in Python.
(~duration_filter).sum()

18270

In [15]:
# Order the rides chronologically by start time (ascending is the default).
sorted_df = rides_cp.sort_values("started_at")

# The 10 earliest and the 10 latest ride starts.
top_10 = sorted_df.iloc[:10]
bottom_10 = sorted_df.iloc[-10:]

top_10

bottom_10

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
385139,EF87BF79137F2A8C,classic_bike,2022-04-17 14:42:58.905,2023-01-06 13:20:03.981,Crescent St & Broadway,6827.11,Franklin St & Dupont St,5944.01,40.763359,-73.928647,40.73564,-73.95866,casual,263 days 22:37:05.076000
137960,6279BF6511781EEF,classic_bike,2022-05-21 18:16:10.671,2023-01-17 10:45:11.995,Wards Meadow Comfort Station,7327.01,E 147 St & Bergen Ave,7840.11,40.78294,-73.93077,40.814673,-73.91839,casual,240 days 16:29:01.324000
1635218,FE32B9D5DF164E0F,classic_bike,2022-07-29 04:55:32.536,2023-01-02 10:32:09.748,W 53 St & 10 Ave,6890.01,Melrose Ave & E 150 St,7879.01,40.766697,-73.990617,40.816827,-73.917338,casual,157 days 05:36:37.212000
328839,0C632EB7C53DD973,classic_bike,2022-08-07 16:59:45.518,2023-01-18 11:56:50.565,Broadway & Battery Pl,4962.01,Pier 40 Dock Station,SYS035,40.704633,-74.013617,40.72866,-74.01198,casual,163 days 18:57:05.047000
1633571,D54F0CD210E5DCD1,classic_bike,2022-09-29 19:47:50.972,2023-01-10 22:36:05.769,Kenmare St & Elizabeth St,5453.06,Park Ave Depot,SYS032,40.72054,-73.9949,40.695943,-73.967788,casual,103 days 02:48:14.797000
448080,D475B24D2224194E,classic_bike,2022-10-11 14:22:30.004,2023-01-12 18:24:18.127,Jerome Ave & E Mosholu Parkway S,8795.01,Webster Ave & Ford St,8472.08,40.87935,-73.88534,40.85556,-73.89615,casual,93 days 04:01:48.123000
1239212,109A7C4AFC7F00E1,classic_bike,2022-11-06 17:57:48.518,2023-01-08 16:10:27.234,Yankee Ferry Terminal,4440.02,Soissons Landing,4590.01,40.687066,-74.016756,40.692317,-74.014866,casual,62 days 22:12:38.716000
879749,1371D252BF823F8F,classic_bike,2022-12-14 11:43:54.762,2023-01-08 14:08:53.507,Dock St & Front St,4903.09,,,40.702709,-73.99253,,,casual,25 days 02:24:58.745000
40496,49762460E71B1340,classic_bike,2022-12-28 09:17:28.882,2023-01-01 09:24:57.541,Flushing Ave & Vanderbilt Ave,4762.05,Washington Ave & Park Ave,4724.03,40.69795,-73.970776,40.696102,-73.96751,casual,4 days 00:07:28.659000
101972,F04D6905667F3109,classic_bike,2022-12-28 09:17:51.549,2023-01-01 07:39:38.879,Flushing Ave & Vanderbilt Ave,4762.05,Washington Ave & Park Ave,4724.03,40.69795,-73.970776,40.696102,-73.96751,casual,3 days 22:21:47.330000


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
1016491,4943762BB8723942,classic_bike,2023-01-31 23:51:41.857,2023-01-31 23:56:23.860,University Pl & E 14 St,5905.14,E 15 St & 3 Ave,5863.07,40.734814,-73.992085,40.734232,-73.986923,member,0 days 00:04:42.003000
1434392,53135FC3632A01E3,electric_bike,2023-01-31 23:52:04.993,2023-01-31 23:55:34.808,3 Ave & E 169 St,8103.05,E 169 St & Fox St,8024.05,40.832553,-73.905232,40.827184,-73.893721,casual,0 days 00:03:29.815000
1138843,D7407E59F0F180A5,classic_bike,2023-01-31 23:53:10.710,2023-01-31 23:56:09.355,Washington Square E,5755.09,Lafayette St & E 8 St,5788.13,40.730494,-73.995721,40.730207,-73.991026,member,0 days 00:02:58.645000
190018,7F65C55B4E4828A7,electric_bike,2023-01-31 23:53:48.659,2023-01-31 23:55:34.767,Kosciuszko St & Nostrand Ave,4519.04,Kosciuszko St & Tompkins Ave,4553.04,40.690719,-73.951338,40.691283,-73.945242,member,0 days 00:01:46.108000
191058,22C6FD22072E41F5,electric_bike,2023-01-31 23:54:00.474,2023-01-31 23:58:01.362,E 7 St & Ave B,5584.05,E 1 St & 1 Ave,5593.01,40.725286,-73.981687,40.723356,-73.98865,casual,0 days 00:04:00.888000
759791,30724CD753190D73,electric_bike,2023-01-31 23:54:04.004,2023-01-31 23:56:00.735,E 39 St & 2 Ave,6345.08,Lexington Ave & E 36 St,6313.1,40.747733,-73.974299,40.747574,-73.978801,casual,0 days 00:01:56.731000
1430267,93C785C67858B4F7,classic_bike,2023-01-31 23:54:12.671,2023-01-31 23:55:29.328,Henry St & Grand St,5294.04,Grand St & Samuel Dickstein Plaza,5335.07,40.714211,-73.981095,40.715119,-73.984171,member,0 days 00:01:16.657000
9340,DAD107843B1023C8,classic_bike,2023-01-31 23:54:13.263,2023-01-31 23:55:57.650,Jay St & York St,4895.09,Dock St & Front St,4903.09,40.701403,-73.986727,40.702709,-73.99253,member,0 days 00:01:44.387000
697380,9923E7BCADD8BE16,classic_bike,2023-01-31 23:55:27.739,2023-01-31 23:59:57.154,Linden St & Knickerbocker Ave,4743.04,Putnam Ave & Knickerbocker Ave,4663.06,40.69714,-73.91566,40.69545,-73.91164,member,0 days 00:04:29.415000
1303740,A8614C780E0140DF,classic_bike,2023-01-31 23:57:54.025,2023-01-31 23:59:10.318,Dean St & Hoyt St,4446.05,Bond St & Bergen St,4404.1,40.686444,-73.987591,40.684967,-73.986208,member,0 days 00:01:16.293000


In [18]:
# Number of distinct start station names; missing names are not counted.
rides_cp['start_station_name'].nunique(dropna=True)

1732

In [16]:
# The extract also contains rides that started outside the requested month
# (for example rides begun in 2022 that ended in January 2023). Keep only rides
# whose start falls in [first day of the month, first day of the next month).
# The bounds are derived from `year` and `month` so this filter follows the
# parameters used to load the file instead of hard-coded dates.
month_start = pd.Timestamp(year=year, month=month, day=1)
next_month_start = month_start + pd.DateOffset(months=1)
filter_date_range = (rides_cp['started_at'] >= month_start) & (rides_cp['started_at'] < next_month_start)
# Number of rides that started outside the month (vectorized count of the
# inverted mask).
(~filter_date_range).sum()

283

In [17]:
# Combine the duration and date-range filters, then report how many rides are
# discarded: first as a count, then as a percentage of all rides.
final_filter = duration_filter & filter_date_range
# Vectorized count of rejected rows; the builtin sum() would loop over every
# element of the mask in Python.
numbers_dropped = (~final_filter).sum()
numbers_dropped
numbers_dropped / len(final_filter) * 100

18406

1.0251685964001578

In [22]:
# Apply the combined filter and keep only the two columns that are written to
# the processed file.
# `rides` is overwritten here, so when this cell is re-run the frame is already
# filtered while `final_filter` still covers every original row. Aligning the
# mask to the current index first makes the cell safe to re-run: rows that are
# still present keep their mask value, and no implicit reindexing is needed.
# NOTE(review): rows are taken from `rides`, not `rides_cp`, so `started_at` is
# saved with the dtype it had in the raw file rather than the parsed datetime
# from `rides_cp` - confirm that this is what downstream readers expect.
row_mask = final_filter.reindex(rides.index, fill_value=False)
rides = rides.loc[row_mask, ['started_at', 'start_station_id']]
rides.sort_values(by="started_at", ascending=True).head()

path = Path('..') / 'data' / 'processed' / f'citi_rides_{year}_{month:02}.parquet'
# Create the target directory if it does not exist yet so the write does not
# fail on a fresh checkout.
path.parent.mkdir(parents=True, exist_ok=True)
rides.to_parquet(path, engine="pyarrow", index=False)

  rides = rides[final_filter]


Unnamed: 0,started_at,start_station_id
1266126,2023-01-01 00:00:13.021,6955.01
50310,2023-01-01 00:00:15.653,7634.01
564026,2023-01-01 00:00:27.436,4307.13
1699037,2023-01-01 00:00:29.074,8528.05
1506585,2023-01-01 00:00:49.245,7340.07
