# CitiBike 2022 – Data Preparation

This notebook loads and prepares all CitiBike trip data for New York (2022),
and enriches it with daily weather data from NOAA (LaGuardia station).

Steps covered:
1. Import libraries
2. Load and concatenate monthly CitiBike trip files
3. Clean and standardize date fields
4. Aggregate trips at daily level
5. Prepare data for weather merge

In [2]:
# Importing Libraries 

import os
import glob
import pandas as pd
import numpy as np

In [3]:
os.getcwd()

'/Users/mariatirado/citibike_2022_weather'

In [4]:
# locate raw data folder

RAW_DATA_PATH = "../data/raw/citibike_2022"
os.listdir(RAW_DATA_PATH)

[]

In [7]:
os.listdir("../data/raw")

['citibike_2022']

In [9]:
os.getcwd()

'/Users/mariatirado/citibike_2022_weather'

In [10]:
RAW_DATA_PATH = "data/raw/citibike_2022"
os.listdir(RAW_DATA_PATH)[:5]

['202211-citibike-tripdata',
 '202208-citibike-tripdata',
 '202210-citibike-tripdata',
 '202209-citibike-tripdata',
 '202212-citibike-tripdata']

# Importing all files 

In [11]:
import os, glob

RAW_DATA_PATH = "data/raw/citibike_2022"

# Recursive search because my CSVs are inside subfolders
csv_files = sorted(glob.glob(os.path.join(RAW_DATA_PATH, "**/*.csv"), recursive=True))

print("CSV files found:", len(csv_files))
csv_files[:5]

CSV files found: 36


['data/raw/citibike_2022/202201-citibike-tripdata/202201-citibike-tripdata_1.csv',
 'data/raw/citibike_2022/202201-citibike-tripdata/202201-citibike-tripdata_2.csv',
 'data/raw/citibike_2022/202202-citibike-tripdata/202202-citibike-tripdata_1.csv',
 'data/raw/citibike_2022/202202-citibike-tripdata/202202-citibike-tripdata_2.csv',
 'data/raw/citibike_2022/202203-citibike-tripdata/202203-citibike-tripdata_1.csv']

# loading and combining all files 

In [13]:
dfs = []

for file in csv_files:
    df = pd.read_csv(file, low_memory=False, dtype=str)
    df.columns = [c.strip().lower() for c in df.columns]
    dfs.append(df)

citibike = pd.concat(dfs, ignore_index=True)

print("Rows:", citibike.shape[0])
print("Columns:", citibike.shape[1])
citibike.head()

Rows: 29838806
Columns: 13


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.8021174,-73.9681805305,40.804038,-73.945925,member
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.68848905639242,-73.99116039276123,member
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.7612274,-73.96094022,40.7451677,-73.98683077,member
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.7839636,-73.9471673,40.7451677,-73.98683077,member
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.7451677,-73.98683077,member


In [14]:
citibike.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='str')

# Obtaining weather data from New York LaGuardia’s weather station for 2022

In [15]:
import requests

NOAA_TOKEN = "VfHSNScltxoPnLVRHeoNMOleugNPtOlA"
headers = {"token": NOAA_TOKEN}

stations_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/stations"
params = {
    "datasetid": "GHCND",       # Daily summaries dataset
    "locationid": "CITY:US360019",  # New York, NY (NOAA location id)
    "limit": 1000
}

r = requests.get(stations_url, headers=headers, params=params)
r.raise_for_status()
stations = r.json()["results"]

# Filter stations that mention LaGuardia in the name
lga = [s for s in stations if "LAGUARDIA" in s["name"].upper()]
len(lga), lga[:5]

(1,
 [{'elevation': 3,
   'mindate': '1939-10-07',
   'maxdate': '2026-02-03',
   'latitude': 40.77945,
   'name': 'LAGUARDIA AIRPORT, NY US',
   'datacoverage': 1,
   'id': 'GHCND:USW00014732',
   'elevationUnit': 'METERS',
   'longitude': -73.88027}])

# Requesting daily data for 2022 for LaGuardia's

In [16]:
STATION_ID = "GHCND:USW00014732"  

data_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"

def fetch_noaa_daily(datatypeid):
    all_rows = []
    offset = 1
    limit = 1000

    while True:
        params = {
            "datasetid": "GHCND",
            "stationid": STATION_ID,
            "startdate": "2022-01-01",
            "enddate": "2022-12-31",
            "datatypeid": datatypeid,
            "units": "metric",   
            "limit": limit,
            "offset": offset
        }
        r = requests.get(data_url, headers=headers, params=params)
        r.raise_for_status()
        js = r.json()
        results = js.get("results", [])
        if not results:
            break

        all_rows.extend(results)

        meta = js.get("metadata", {}).get("resultset", {})
        count = meta.get("count", 0)
        if offset + limit > count:
            break
        offset += limit

    return all_rows

# Pull the main variables
tmax = fetch_noaa_daily("TMAX")
tmin = fetch_noaa_daily("TMIN")
prcp = fetch_noaa_daily("PRCP")

len(tmax), len(tmin), len(prcp)

(365, 365, 365)

# Converting to a clean daily weather table

In [17]:
weather_raw = pd.DataFrame(tmax + tmin + prcp)

weather_raw["date"] = pd.to_datetime(weather_raw["date"]).dt.date

weather_daily = (
    weather_raw
    .pivot_table(index="date", columns="datatype", values="value", aggfunc="first")
    .reset_index()
)

weather_daily.head()

datatype,date,PRCP,TMAX,TMIN
0,2022-01-01,19.3,13.9,10.0
1,2022-01-02,1.0,15.6,3.9
2,2022-01-03,0.0,3.9,-4.3
3,2022-01-04,0.0,2.2,-6.0
4,2022-01-05,6.1,8.9,0.0


In [18]:
# Export 
weather_daily.to_csv("data/processed/noaa_lga_daily_2022.csv", index=False)

In [19]:
# Units check 
weather_daily[["TMAX","TMIN","PRCP"]].describe()

datatype,TMAX,TMIN,PRCP
count,365.0,365.0,365.0
mean,17.964932,10.003836,3.078904
std,10.263184,9.687433,6.80752
min,-8.8,-13.8,0.0
25%,8.9,2.2,0.0
50%,18.3,10.0,0.0
75%,26.7,18.3,2.0
max,36.7,27.8,45.0


# Merging CitiBike trips with Weather data

In [21]:
citibike.shape

(29838806, 13)

In [22]:
# convert to datetime
citibike["started_at"] = pd.to_datetime(citibike["started_at"], errors="coerce")

# create daily date
citibike["date"] = citibike["started_at"].dt.date

citibike[["started_at","date"]].head()

Unnamed: 0,started_at,date
0,2022-01-21 13:13:43.392,2022-01-21
1,2022-01-10 11:30:54.162,2022-01-10
2,2022-01-26 10:52:43.096,2022-01-26
3,2022-01-03 08:35:48.247,2022-01-03
4,2022-01-22 14:14:23.043,2022-01-22


# Creating the daily aggregation

In [23]:
daily_trips = (
    citibike
    .groupby("date")
    .size()
    .reset_index(name="total_trips")
)

daily_trips.head()

Unnamed: 0,date,total_trips
0,2021-01-30,1
1,2021-02-15,1
2,2021-03-11,1
3,2021-03-14,1
4,2021-03-31,1


In [24]:
# verifying 
daily_trips.shape

(402, 2)

# Performing the merge

In [25]:
daily_trips["date"] = pd.to_datetime(daily_trips["date"])
weather_daily["date"] = pd.to_datetime(weather_daily["date"])

citibike_weather = pd.merge(
    daily_trips,
    weather_daily,
    on="date",
    how="left"
)

citibike_weather.head()

Unnamed: 0,date,total_trips,PRCP,TMAX,TMIN
0,2021-01-30,1,,,
1,2021-02-15,1,,,
2,2021-03-11,1,,,
3,2021-03-14,1,,,
4,2021-03-31,1,,,


In [26]:
# Confirming the years in my dataframe
citibike["started_at"] = pd.to_datetime(citibike["started_at"], errors="coerce")
citibike["year"] = citibike["started_at"].dt.year
citibike["year"].value_counts().sort_index()

year
2021         640
2022    29838166
Name: count, dtype: int64

In [27]:
# Filter CitiBike to 2022 only
citibike_2022 = citibike[citibike["started_at"].dt.year == 2022].copy()
citibike_2022.shape

(29838166, 15)

In [28]:
# creating the daily table from citibike_2022
citibike_2022["date"] = citibike_2022["started_at"].dt.normalize()  

daily_trips = (
    citibike_2022
    .groupby("date")
    .size()
    .reset_index(name="total_trips")
)

daily_trips.head(), daily_trips.shape

(        date  total_trips
 0 2022-01-01        20428
 1 2022-01-02        43009
 2 2022-01-03        33189
 3 2022-01-04        36842
 4 2022-01-05        34230,
 (365, 2))

In [29]:
# Normalize weather dates to match exactly

weather_daily["date"] = pd.to_datetime(weather_daily["date"]).dt.normalize()
weather_daily.head()

datatype,date,PRCP,TMAX,TMIN
0,2022-01-01,19.3,13.9,10.0
1,2022-01-02,1.0,15.6,3.9
2,2022-01-03,0.0,3.9,-4.3
3,2022-01-04,0.0,2.2,-6.0
4,2022-01-05,6.1,8.9,0.0


# Merging again 

In [30]:
citibike_weather = pd.merge(daily_trips, weather_daily, on="date", how="left")
citibike_weather.head()

Unnamed: 0,date,total_trips,PRCP,TMAX,TMIN
0,2022-01-01,20428,19.3,13.9,10.0
1,2022-01-02,43009,1.0,15.6,3.9
2,2022-01-03,33189,0.0,3.9,-4.3
3,2022-01-04,36842,0.0,2.2,-6.0
4,2022-01-05,34230,6.1,8.9,0.0


In [31]:
# Checking for missing weather 

citibike_weather[["TMAX", "TMIN", "PRCP"]].isna().mean()

TMAX    0.0
TMIN    0.0
PRCP    0.0
dtype: float64

In [32]:
# Sanity check

citibike_weather["date"].min(), citibike_weather["date"].max()

(Timestamp('2022-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

In [33]:
weather_daily["date"].min(), weather_daily["date"].max()

(Timestamp('2022-01-01 00:00:00'), Timestamp('2022-12-31 00:00:00'))

# Export the merged dataset to CSV

In [34]:
citibike_weather.to_csv("data/processed/citibike_trips_weather_merged_2022.csv",
    index=False)

In [36]:
import os
os.path.exists("data/processed/citibike_trips_weather_merged_2022.csv")

True

In [37]:
# Save the NOAA weather CSV
weather_daily.to_csv("data/processed/noaa_lga_daily_2022.csv", index=False)