In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the result of every top-level expression in a cell,
# not only the last one (the default setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import pandas as pd
from pathlib import Path

# Notebook parameters: which monthly Citi Bike (JC) extract to process.
year, month = 2023, 1

# Raw monthly file lives under ../data/raw, named by year and zero-padded month.
path = Path("..", "data", "raw", f"JC-{year}{month:02}-citibike-tripdata.csv")

# Load the whole CSV into memory.
rides = pd.read_csv(path)

# Show the first rows as a sanity check.
rides.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0905B18B365C9D20,classic_bike,2023-01-28 09:18:10,2023-01-28 09:28:52,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hamilton Park,JC009,40.735938,-74.030305,40.727596,-74.044247,member
1,B4F0562B05CB5404,electric_bike,2023-01-23 20:10:12,2023-01-23 20:18:27,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Southwest Park - Jackson St & Observer Hwy,HB401,40.735938,-74.030305,40.737551,-74.041664,member
2,5ABF032895F5D87E,classic_bike,2023-01-29 15:27:04,2023-01-29 15:32:38,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Marshall St & 2 St,HB408,40.735944,-74.030383,40.740802,-74.042521,member
3,E7E1F9C53976D2F9,classic_bike,2023-01-24 18:35:08,2023-01-24 18:42:13,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hamilton Park,JC009,40.735986,-74.030364,40.727596,-74.044247,member
4,323165780CA0734B,classic_bike,2023-01-21 20:44:09,2023-01-21 20:48:08,Hamilton Park,JC009,Manila & 1st,JC082,40.727596,-74.044247,40.721651,-74.042884,member


In [6]:
# Build a working copy of the raw frame (assign() returns a new DataFrame, so
# `rides` is left untouched), parse both timestamp columns and derive the trip
# duration as ended_at - started_at.
rides_cp = rides.assign(
    started_at=pd.to_datetime(rides["started_at"]),
    ended_at=pd.to_datetime(rides["ended_at"]),
    duration=lambda frame: frame["ended_at"] - frame["started_at"],
)

In [7]:
# Summarise the trip durations and look at the extreme quantiles to decide
# where to cut off implausible rides.
durations = rides_cp["duration"]
tail_quantiles = [0.0, 0.01, 0.995, 0.999]
print(durations.describe())
print(durations.quantile(tail_quantiles))

count                        56075
mean     0 days 00:11:47.619580918
std      0 days 01:08:03.871820543
min                0 days 00:00:00
25%                0 days 00:03:51
50%                0 days 00:05:52
75%                0 days 00:09:02
max                3 days 03:38:17
Name: duration, dtype: object
0.000             0 days 00:00:00
0.010             0 days 00:00:19
0.995   0 days 01:47:45.519999999
0.999      1 days 00:59:39.926000
Name: duration, dtype: timedelta64[ns]


In [8]:
# Keep rides with a strictly positive duration of at most `max_duration`.
# Rows whose duration is missing (NaT) compare False on both sides and are
# therefore treated as invalid as well.
max_duration = pd.Timedelta(hours=5)
duration_filter = (rides_cp["duration"] > pd.Timedelta(0)) & (rides_cp["duration"] <= max_duration)

# Use the vectorized Series.sum() rather than the builtin sum(), which would
# iterate over the Series element by element in Python.
print("Invalid durations:", (~duration_filter).sum())

Invalid durations: 187


In [9]:
# A ride is usable only when both its start and its end station id are present.
valid_start = ~rides_cp["start_station_id"].isna()
valid_end = ~rides_cp["end_station_id"].isna()
location_filter = valid_start & valid_end

In [11]:
# Keep only rides that started inside the month being processed. The window is
# derived from the `year` / `month` parameters set in the load cell so that it
# always matches the file that was read; hard-coded dates would silently drop
# every row as soon as a different month is loaded.
month_start = pd.Timestamp(year=year, month=month, day=1)
month_end = month_start + pd.DateOffset(months=1)  # first instant of the next month (rolls over the year for December)
date_filter = (rides_cp["started_at"] >= month_start) & (rides_cp["started_at"] < month_end)


In [12]:
# A ride is kept only if it passes the duration, station and date checks.
final_filter = duration_filter & location_filter & date_filter

# Count rejected rows with the vectorized Series.sum() instead of the builtin
# sum(), which would loop over the Series in Python.
print("Rows dropped:", len(rides_cp) - final_filter.sum())

Rows dropped: 327


In [13]:
# Materialise the rows that passed every filter as an independent frame.
rides_filtered = rides_cp.loc[final_filter].copy()


In [14]:
# Keep only the pickup timestamp and pickup station, under their output names.
# Renaming first (without inplace) and then selecting the *new* names keeps this
# cell safe to re-run: on a second run the rename is a no-op (rename ignores
# unknown keys by default) and the selection still succeeds, whereas selecting
# "started_at" again after an in-place rename would raise KeyError.
output_columns = {
    "started_at": "pickup_datetime",
    "start_station_id": "pickup_location_id",
}
rides_filtered = rides_filtered.rename(columns=output_columns)[list(output_columns.values())]

# Save as Parquet (the target directory is created if it does not exist yet).
output_path = Path("..") / "data" / "processed" / f"rides_{year}_{month:02}.parquet"
output_path.parent.mkdir(parents=True, exist_ok=True)
rides_filtered.to_parquet(output_path, engine="pyarrow", index=False)

# Final check
rides_filtered.head()

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-28 09:18:10,HB101
1,2023-01-23 20:10:12,HB101
2,2023-01-29 15:27:04,HB101
3,2023-01-24 18:35:08,HB101
4,2023-01-21 20:44:09,JC009
