### This script contains the below:
1. Import libraries
2. Importing bike trips data
3. Data Exploration
4. Get weather data using NOAA's API
5. Merge bike trip and weather data

# 01 Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

# 02 Importing data & merge

In [2]:
# The code below automates the process of importing and combining all monthly CitiBike CSV files for 2022.
# 1. First, it walks through the folder "Original Data" and creates a list (filepaths) containing the full paths 
#    of all CSV files in the folder and its subfolders.
# 2. Then, it uses a generator expression inside pd.concat() to read each CSV file into a temporary DataFrame.
# 3. All these monthly DataFrames are merged (stacked vertically) into one large DataFrame called 'df'.
# 4. The parameter 'ignore_index=True' resets the row index after concatenation, creating a clean continuous index.
# 5. 'low_memory=False' ensures that pandas reads large CSVs more reliably when columns have mixed data types.

In [17]:
# Create a list with all CSV files in the folder using a list comprehension
# The comprehension iterates through all directories and subdirectories with os.walk(),
# checks if each file ends with '.csv', and joins the folder path with the file name.

folderpath = r"../02 Data/Original Data"
filepaths = [
    os.path.join(root, name)
    for root, dirs, files in os.walk(folderpath)
    for name in files
    if name.endswith('.csv')
]

In [18]:
filepaths

['../02 Data/Original Data/202208-citibike-tripdata_3.csv',
 '../02 Data/Original Data/202208-citibike-tripdata_2.csv',
 '../02 Data/Original Data/202207-citibike-tripdata_1.csv',
 '../02 Data/Original Data/202208-citibike-tripdata_1.csv',
 '../02 Data/Original Data/202210-citibike-tripdata_1.csv',
 '../02 Data/Original Data/202208-citibike-tripdata_4.csv',
 '../02 Data/Original Data/202203-citibike-tripdata_2.csv',
 '../02 Data/Original Data/202210-citibike-tripdata_2.csv',
 '../02 Data/Original Data/202203-citibike-tripdata_1.csv',
 '../02 Data/Original Data/202210-citibike-tripdata_3.csv',
 '../02 Data/Original Data/202204-citibike-tripdata_1.csv',
 '../02 Data/Original Data/202204-citibike-tripdata_2.csv',
 '../02 Data/Original Data/202204-citibike-tripdata_3.csv',
 '../02 Data/Original Data/202211-citibike-tripdata_3.csv',
 '../02 Data/Original Data/202202-citibike-tripdata_1.csv',
 '../02 Data/Original Data/202211-citibike-tripdata_2.csv',
 '../02 Data/Original Data/202202-citibi

In [19]:
# merging the data
df = pd.concat((pd.read_csv(f, low_memory=False) for f in filepaths), ignore_index=True)

In [89]:
# Merging the data:
# This line uses a generator expression inside pd.concat() to combine all CSV files into one DataFrame.
# The generator (pd.read_csv(f) for f in filepaths) loads each file one by one instead of all at once,
# which makes the process more memory-efficient for large datasets.

In [20]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,9D0DC440CB40CF8E,electric_bike,2022-08-27 13:56:47.728,2022-08-27 14:02:56.651,Flatbush Ave & Ocean Ave,3704.04,3 St & Prospect Park West,3865.05,40.663657,-73.963014,40.668132,-73.973638,casual
1,2214991DFBE5C4D7,electric_bike,2022-08-20 10:37:02.756,2022-08-20 10:45:56.631,Forsyth St\t& Grand St,5382.07,E 11 St & 1 Ave,5746.14,40.717798,-73.993161,40.729538,-73.984267,casual
2,20C5D469563B6337,classic_bike,2022-08-31 18:55:03.051,2022-08-31 19:03:37.344,Perry St & Bleecker St,5922.07,Grand St & Greene St,5500.02,40.735354,-74.004831,40.7217,-74.002381,member
3,3E8791885BC189D1,classic_bike,2022-08-02 08:05:00.250,2022-08-02 08:16:52.063,FDR Drive & E 35 St,6230.04,Grand Army Plaza & Central Park S,6839.1,40.744219,-73.971212,40.764397,-73.973715,member
4,8DBCBF98885106CB,electric_bike,2022-08-25 15:44:48.386,2022-08-25 15:55:39.691,E 40 St & 5 Ave,6474.11,Ave A & E 14 St,5779.11,40.752052,-73.982115,40.730311,-73.980472,member


In [21]:
# Convert the columns to datetime (if not already)
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Get the earliest and latest dates for each column
earliest_start = df['started_at'].min()
latest_start = df['started_at'].max()

earliest_end = df['ended_at'].min()
latest_end = df['ended_at'].max()

# Print the results
print("Earliest start date:", earliest_start)
print("Latest start date:", latest_start)
print("Earliest end date:", earliest_end)
print("Latest end date:", latest_end)

Earliest start date: 2021-01-30 17:30:45.544000
Latest start date: 2022-12-31 23:58:19.206000
Earliest end date: 2022-01-01 00:00:09.459000
Latest end date: 2022-12-31 23:59:55.708000


In [23]:
df2022 = df[(df['started_at'] >= '2022-01-01') & (df['started_at'] <= '2022-12-31')].copy()

In [24]:
df2022.shape

(27408768, 13)

In [26]:
df2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27408768 entries, 0 to 27440873
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
dtypes: datetime64[ns](2), float64(4), object(7)
memory usage: 2.9+ GB


In [27]:
cat_cols = [
    'rideable_type',
    'start_station_name',
    'start_station_id',
    'end_station_name',
    'end_station_id',
    'member_casual'
]

for col in cat_cols:
    df2022[col] = df2022[col].astype('category')

In [28]:
float_cols = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
df2022[float_cols] = df2022[float_cols].apply(pd.to_numeric, downcast='float')

In [29]:
df2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27408768 entries, 0 to 27440873
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       category      
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  category      
 5   start_station_id    category      
 6   end_station_name    category      
 7   end_station_id      category      
 8   start_lat           float32       
 9   start_lng           float32       
 10  end_lat             float32       
 11  end_lng             float32       
 12  member_casual       category      
dtypes: category(6), datetime64[ns](2), float32(4), object(1)
memory usage: 1.5+ GB


In [30]:
summary = pd.DataFrame({
    'dtype': df2022.dtypes,
    'non_nulls': df2022.notnull().sum(),
    'nulls': df2022.isnull().sum(),
    'unique': df2022.nunique()
})

summary

Unnamed: 0,dtype,non_nulls,nulls,unique
ride_id,object,27408768,0,27408768
rideable_type,category,27408768,0,2
started_at,datetime64[ns],27408768,0,27404219
ended_at,datetime64[ns],27408768,0,27376231
start_station_name,category,27408721,47,1761
start_station_id,category,27408721,47,3468
end_station_name,category,27344519,64249,1839
end_station_id,category,27344519,64249,1904
start_lat,float32,27408768,0,56998
start_lng,float32,27408768,0,17982


In [31]:
# Save the rides for the 2022 year to a pickle file
df2022.to_pickle('df2022.pkl')

# 03 Data exploration

In [33]:
# Counts of bike types
df2022['rideable_type'].value_counts()

classic_bike     16550804
electric_bike    10857964
Name: rideable_type, dtype: int64

In [35]:
# === RESEARCH QUESTIONS ===

# Q1: What are the most popular stations in the city?
print("\n=== Q1: Top 10 Most Popular Start Stations ===")
popular_stations = df2022['start_station_name'].value_counts().head(10)
display(popular_stations)

print("\n=== Q1b: Top 10 Most Popular End Stations ===")
popular_end_stations = df2022['end_station_name'].value_counts().head(10)
display(popular_end_stations)


=== Q1: Top 10 Most Popular Start Stations ===


W 21 St & 6 Ave            119320
West St & Chambers St      111344
Broadway & W 58 St         104844
1 Ave & E 68 St             97872
6 Ave & W 33 St             96718
University Pl & E 14 St     92250
Broadway & W 25 St          91187
Broadway & E 14 St          89293
Broadway & E 21 St          88357
W 31 St & 7 Ave             87301
Name: start_station_name, dtype: int64


=== Q1b: Top 10 Most Popular End Stations ===


W 21 St & 6 Ave             116731
West St & Chambers St       108696
Broadway & W 58 St           98266
1 Ave & E 68 St              94130
6 Ave & W 33 St              93057
University Pl & E 14 St      91887
Cleveland Pl & Spring St     91425
E 33 St & 1 Ave              90217
Broadway & W 25 St           89963
6 Ave & W 34 St              86692
Name: end_station_name, dtype: int64

Q1. Most Popular Stations

The analysis reveals a strong overlap between the most frequently used start and end stations in New York’s CitiBike network.
W 21 St & 6 Ave stands out as the single most popular location for both trip departures (119,320 rides) and arrivals (116,731 rides).

Other consistently busy stations include West St & Chambers St, Broadway & W 58 St, 1 Ave & E 68 St, and 6 Ave & W 33 St — all ranking within the top five for both starting and ending trips.

The distribution of popular stations indicates a concentration of activity in Manhattan, particularly around Midtown and Downtown, areas characterized by dense business districts and strong commuter demand.

Minor variations appear in the rankings — for instance, Cleveland Pl & Spring St and E 33 St & 1 Ave appear among the top end stations but not among the top start stations — suggesting certain locations serve primarily as destination points rather than origins.

Overall, the data highlights the central Manhattan corridor as the hub of CitiBike usage, reflecting its role as the city’s primary commuting and tourist zone.

In [36]:
# Q2: Which are the months with the most trips taken?
# Create a month column
df2022['month'] = df2022['started_at'].dt.month_name()

print("\n=== Q2: Trips by Month ===")
trips_by_month = df2022['month'].value_counts()
display(trips_by_month)


=== Q2: Trips by Month ===


August       3576182
September    3411909
June         3343535
October      2935959
May          2865291
November     2386350
April        2261338
March        1846034
December     1560344
February     1197359
January      1024055
July         1000412
Name: month, dtype: int64

CitiBike usage in 2022 shows a strong seasonal pattern, with the highest activity during the warmer months.

August recorded the most trips (3.58 million), followed closely by September and June, confirming peak ridership in late summer.
Usage gradually declined through the fall, with a sharp drop beginning in November and reaching the lowest levels in January (just over 1 million trips).
The trend reflects typical weather-dependent cycling behavior, where favorable temperatures and longer daylight hours significantly boost ridership.

Overall, the data suggests that summer and early autumn represent the core operating season for the CitiBike network, while winter months show reduced demand.

In [37]:
# Q3: What are the most popular trips between stations?
print("\n=== Q3: Top 10 Most Common Start→End Station Pairs ===")
popular_routes = (
    df2022.groupby(['start_station_name', 'end_station_name'])
    .size()
    .sort_values(ascending=False)
    .head(10)
)
display(popular_routes)


=== Q3: Top 10 Most Common Start→End Station Pairs ===


start_station_name                 end_station_name                 
Central Park S & 6 Ave             Central Park S & 6 Ave               10534
Roosevelt Island Tramway           Roosevelt Island Tramway              7313
7 Ave & Central Park South         7 Ave & Central Park South            7248
Grand Army Plaza & Central Park S  Grand Army Plaza & Central Park S     7026
W 21 St & 6 Ave                    9 Ave & W 22 St                       5736
Soissons Landing                   Soissons Landing                      5657
1 Ave & E 62 St                    1 Ave & E 68 St                       5196
5 Ave & E 72 St                    5 Ave & E 72 St                       5054
Broadway & W 58 St                 Broadway & W 58 St                    4845
Yankee Ferry Terminal              Yankee Ferry Terminal                 4787
dtype: int64

The top start-end pairs show that:

The most frequent trip patterns in 2022 reveal a strong preference for short, localized rides and loop trips that start and end at the same station.

The top pair, Central Park S & 6 Ave → Central Park S & 6 Ave, recorded over 10,500 trips, indicating high recreational use within or around Central Park.

Similar self-loop patterns appear at Roosevelt Island Tramway, 7 Ave & Central Park South, and Grand Army Plaza & Central Park S, all popular leisure or tourist zones.

A few short-distance commuter routes, such as W 21 St & 6 Ave → 9 Ave & W 22 St and 1 Ave & E 62 St → 1 Ave & E 68 St, suggest regular local commuting within Midtown and the Upper East Side.

Overall, the data highlights Central Park and Roosevelt Island as dominant recreational cycling areas, while Midtown Manhattan routes reflect consistent short-distance commuter activity.

In [38]:
# Q4: Are the existing stations evenly distributed?
# Check the geographic spread using latitude/longitude ranges
print("\n=== Q4: Station Geographic Distribution ===")
print("Start stations latitude range:", df2022['start_lat'].min(), "-", df2022['start_lat'].max())
print("Start stations longitude range:", df2022['start_lng'].min(), "-", df2022['start_lng'].max())

print("Number of unique start stations:", df2022['start_station_id'].nunique())
print("Number of unique end stations:", df2022['end_station_id'].nunique())


=== Q4: Station Geographic Distribution ===
Start stations latitude range: 40.62737 - 40.88398
Start stations longitude range: -74.028015 - -73.878586
Number of unique start stations: 3468
Number of unique end stations: 1904


Summary: Station Geographic Distribution (2022)

CitiBike stations in 2022 were widely distributed across New York City, covering a latitude range from 40.63° to 40.88° and a longitude range from –74.03° to –73.88°.

This area spans from southern Brooklyn and Jersey City up to northern Manhattan and the Bronx, reflecting the network’s broad urban coverage.

There were 3,468 unique start stations and 1,904 unique end stations, suggesting some newly installed or temporarily inactive stations may not yet appear in both categories.

Overall, the data shows that CitiBike’s infrastructure is densely concentrated in central areas but extends across multiple boroughs, supporting both commuter and recreational use across the city.

# 04 Obtain weather data from New York LaGuardia’s weather station for 2022 and export it to a CSV file.

In [39]:
# Define your NOAA token

Token = 'HLPKfjkSRPFNvFLMQBdRxcFpoojZoEwc'

In [40]:
# Get the API for La Guardia Airport Jan 1st - Dec 31st 2022

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00014732&startdate=2022-01-01&enddate=2022-12-31', headers={'token':Token})

In [41]:
# Load the api response as a json

d = json.loads(r.text)

In [42]:
d

{'metadata': {'resultset': {'offset': 1, 'count': 365, 'limit': 1000}},
 'results': [{'date': '2022-01-01T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 116},
  {'date': '2022-01-02T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 114},
  {'date': '2022-01-03T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 14},
  {'date': '2022-01-04T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': -27},
  {'date': '2022-01-05T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 32},
  {'date': '2022-01-06T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 49},
  {'date': '2022-01-07T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attribut

In [43]:
# Secure all items in the response that correspond to TAVG

avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']

In [44]:
# Get only the date field from all average temperature readings

dates_temp = [item['date'] for item in avg_temps]

In [45]:
# Get the temperature from all average temperature readings

temps = [item['value'] for item in avg_temps]

In [46]:
temps

[116,
 114,
 14,
 -27,
 32,
 49,
 7,
 -25,
 14,
 16,
 -54,
 -19,
 40,
 48,
 -67,
 -80,
 39,
 18,
 32,
 51,
 -60,
 -59,
 -7,
 -2,
 36,
 -23,
 -42,
 1,
 -48,
 -71,
 -34,
 -17,
 23,
 64,
 58,
 -28,
 -48,
 5,
 41,
 28,
 63,
 86,
 118,
 28,
 -43,
 -47,
 16,
 116,
 99,
 1,
 -26,
 41,
 56,
 144,
 11,
 8,
 -11,
 24,
 8,
 26,
 77,
 56,
 -19,
 31,
 96,
 182,
 87,
 32,
 40,
 72,
 51,
 -23,
 53,
 116,
 136,
 82,
 139,
 131,
 143,
 104,
 107,
 72,
 57,
 96,
 104,
 59,
 -15,
 -12,
 26,
 99,
 131,
 72,
 70,
 76,
 97,
 86,
 91,
 128,
 114,
 86,
 88,
 134,
 154,
 163,
 157,
 157,
 83,
 79,
 76,
 94,
 106,
 146,
 133,
 128,
 104,
 115,
 116,
 82,
 104,
 129,
 136,
 121,
 124,
 120,
 172,
 148,
 100,
 104,
 135,
 154,
 161,
 171,
 160,
 179,
 188,
 204,
 197,
 178,
 155,
 154,
 212,
 278,
 210,
 174,
 162,
 164,
 202,
 204,
 207,
 233,
 283,
 178,
 203,
 195,
 219,
 200,
 222,
 218,
 237,
 244,
 218,
 218,
 205,
 242,
 239,
 233,
 203,
 252,
 217,
 173,
 205,
 221,
 191,
 192,
 219,
 258,
 270,
 243,
 22

In [47]:
# Storing the results in a dataframe

df_temp = pd.DataFrame()

In [48]:
# Get only date and cast it to date time; convert temperature from tenths of Celsius to normal Celsius

df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temp['avgTemp'] = [float(v)/10.0 for v in temps]

In [49]:
df_temp.head()

Unnamed: 0,date,avgTemp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4
3,2022-01-04,-2.7
4,2022-01-05,3.2


In [50]:
# Save the cleaned temperature data to CSV
df_temp.to_csv('LaGuardia_Weather_2022.csv', index=False)

# 05 Merge bike trip and weather data

In [51]:
df2022.dtypes

ride_id                       object
rideable_type               category
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name          category
start_station_id            category
end_station_name            category
end_station_id              category
start_lat                    float32
start_lng                    float32
end_lat                      float32
end_lng                      float32
member_casual               category
month                         object
dtype: object

In [52]:
# Extract only the date part (YYYY-MM-DD) from 'started_at' into a new column

df2022['date'] = pd.to_datetime(df2022['started_at'], format='%Y-%m-%d').dt.date

In [53]:
# Convert 'date' from string to datetime

df2022['date'] = pd.to_datetime(df2022['date'])

In [54]:
df2022.dtypes

ride_id                       object
rideable_type               category
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name          category
start_station_id            category
end_station_name            category
end_station_id              category
start_lat                    float32
start_lng                    float32
end_lat                      float32
end_lng                      float32
member_casual               category
month                         object
date                  datetime64[ns]
dtype: object

In [55]:
# Merge dataframes

df_merged = df2022.merge(df_temp, how = 'left', on = 'date', indicator = True)

In [56]:
df_merged['_merge'].value_counts(dropna = False)

both          27408768
left_only            0
right_only           0
Name: _merge, dtype: int64

In [59]:
# Save the merged df to pickle
df_merged.to_pickle('df_merged.pkl')