### This script contains the below:
1. Import libraries
2. Importing bike trips data
3. Data Exploration
4. Get weather data using NOAA's API
5. Merge bike trip and weather data

# 01 Importing libraries

In [52]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

# 02 Importing data & merge

In [53]:
# The code below automates the process of importing and combining all monthly CitiBike CSV files for 2022.
# 1. First, it walks through the folder "Original Data" and creates a list (filepaths) containing the full paths 
#    of all CSV files in the folder and its subfolders.
# 2. Then, it uses a generator expression inside pd.concat() to read each CSV file into a temporary DataFrame.
# 3. All these monthly DataFrames are merged (stacked vertically) into one large DataFrame called 'df'.
# 4. The parameter 'ignore_index=True' resets the row index after concatenation, creating a clean continuous index.
# 5. 'low_memory=False' ensures that pandas reads large CSVs more reliably when columns have mixed data types.

In [54]:
# Create a list with all files in the folder using a list comprehension

folderpath = r"../02 Data/Original Data" 
filepaths = []
for root, dirs, files in os.walk(folderpath):
    for name in files:
        if name.endswith('.csv'):
            filepaths.append(os.path.join(root, name))

In [55]:
filepaths

['../02 Data/Original Data/JC-202203-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202201-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202209-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202211-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202207-citbike-tripdata.csv',
 '../02 Data/Original Data/JC-202205-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202202-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202208-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202206-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202210-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202204-citibike-tripdata.csv',
 '../02 Data/Original Data/JC-202212-citibike-tripdata.csv']

In [56]:
# merging the data
df = pd.concat((pd.read_csv(f, low_memory=False) for f in filepaths), ignore_index=True)

In [89]:
# Merging the data:
# This line uses a generator expression inside pd.concat() to combine all CSV files into one DataFrame.
# The generator (pd.read_csv(f) for f in filepaths) loads each file one by one instead of all at once,
# which makes the process more memory-efficient for large datasets.

In [57]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,3255D3E3F33CDC45,classic_bike,2022-03-18 15:38:17,2022-03-18 15:45:34,Mama Johnson Field - 4 St & Jackson St,HB404,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.74314,-74.040041,40.736982,-74.027781,casual
1,17FA5604A37338F9,electric_bike,2022-03-04 16:44:48,2022-03-04 16:50:45,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member
2,7DEC9ADDB8D6BBE1,electric_bike,2022-03-13 17:44:32,2022-03-13 17:54:44,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member
3,9D69F74EEF231A2E,classic_bike,2022-03-13 15:33:47,2022-03-13 15:41:22,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member
4,C84AE4A9D78A6347,classic_bike,2022-03-11 12:21:18,2022-03-11 12:33:24,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member


In [58]:
# Convert the columns to datetime (if not already)
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Get the earliest and latest dates for each column
earliest_start = df['started_at'].min()
latest_start = df['started_at'].max()

earliest_end = df['ended_at'].min()
latest_end = df['ended_at'].max()

# Print the results
print("Earliest start date:", earliest_start)
print("Latest start date:", latest_start)
print("Earliest end date:", earliest_end)
print("Latest end date:", latest_end)

Earliest start date: 2022-01-01 00:10:20
Latest start date: 2022-12-31 23:58:26
Earliest end date: 2022-01-01 00:14:06
Latest end date: 2023-01-02 08:33:33


In [59]:
df2022 = df[(df['started_at'] >= '2022-01-01') & (df['started_at'] <= '2022-12-31')].copy()

In [60]:
df2022.shape

(894502, 13)

In [91]:
df2022.duplicated().sum()

0

In [61]:
df2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 894502 entries, 0 to 895484
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ride_id             894502 non-null  object        
 1   rideable_type       894502 non-null  object        
 2   started_at          894502 non-null  datetime64[ns]
 3   ended_at            894502 non-null  datetime64[ns]
 4   start_station_name  894492 non-null  object        
 5   start_station_id    894492 non-null  object        
 6   end_station_name    891312 non-null  object        
 7   end_station_id      891312 non-null  object        
 8   start_lat           894502 non-null  float64       
 9   start_lng           894502 non-null  float64       
 10  end_lat             892539 non-null  float64       
 11  end_lng             892539 non-null  float64       
 12  member_casual       894502 non-null  object        
dtypes: datetime64[ns](2), float64

In [62]:
summary = pd.DataFrame({
    'dtype': df2022.dtypes,
    'non_nulls': df2022.notnull().sum(),
    'nulls': df2022.isnull().sum(),
    'unique': df2022.nunique()
})

summary

Unnamed: 0,dtype,non_nulls,nulls,unique
ride_id,object,894502,0,894502
rideable_type,object,894502,0,3
started_at,datetime64[ns],894502,0,869517
ended_at,datetime64[ns],894502,0,868910
start_station_name,object,894492,10,84
start_station_id,object,894492,10,86
end_station_name,object,891312,3190,320
end_station_id,object,891312,3190,319
start_lat,float64,894502,0,65648
start_lng,float64,894502,0,68314


In [63]:
# Save the rides for the 2022 year to CSV
df2022.to_csv('df2022.csv', index=False)

# 03 Data exploration

In [64]:
# === 1. GENERAL DESCRIPTIVE STATISTICS ===
print("\n--- Missing Values (%) ---")
print((df2022.isna().sum() / len(df2022) * 100).round(2))

print("\n--- Numeric Summary ---")
display(df2022.describe())

print("\n--- Categorical Summary ---")
display(df2022.describe(include='object'))


--- Missing Values (%) ---
ride_id               0.00
rideable_type         0.00
started_at            0.00
ended_at              0.00
start_station_name    0.00
start_station_id      0.00
end_station_name      0.36
end_station_id        0.36
start_lat             0.00
start_lng             0.00
end_lat               0.22
end_lng               0.22
member_casual         0.00
dtype: float64

--- Numeric Summary ---


Unnamed: 0,start_lat,start_lng,end_lat,end_lng
count,894502.0,894502.0,892539.0,892539.0
mean,40.731927,-74.040459,40.731868,-74.040203
std,0.012095,0.012219,0.012223,0.012177
min,40.706495,-74.088964,40.64,-74.14
25%,40.721124,-74.046305,40.721124,-74.045953
50%,40.73367,-74.037977,40.73367,-74.037977
75%,40.740973,-74.03097,40.740973,-74.03097
max,40.754992,-74.023587,40.872412,-73.888271



--- Categorical Summary ---


Unnamed: 0,ride_id,rideable_type,start_station_name,start_station_id,end_station_name,end_station_id,member_casual
count,894502,894502,894492,894492,891312,891312,894502
unique,894502,3,84,86,320,319,2
top,3255D3E3F33CDC45,classic_bike,Grove St PATH,JC005,Grove St PATH,JC005,member
freq,1,626367,42522,42389,44684,44562,588152


In [65]:
# Counts of bike types
df2022['rideable_type'].value_counts()

classic_bike     626367
electric_bike    260382
docked_bike        7753
Name: rideable_type, dtype: int64

In [66]:
# === 2. RESEARCH QUESTIONS ===

# Q1: What are the most popular stations in the city?
print("\n=== Q1: Top 10 Most Popular Start Stations ===")
popular_stations = df2022['start_station_name'].value_counts().head(10)
display(popular_stations)

print("\n=== Q1b: Top 10 Most Popular End Stations ===")
popular_end_stations = df2022['end_station_name'].value_counts().head(10)
display(popular_end_stations)


=== Q1: Top 10 Most Popular Start Stations ===


Grove St PATH                                   42522
South Waterfront Walkway - Sinatra Dr & 1 St    34219
Hoboken Terminal - River St & Hudson Pl         32985
Hoboken Terminal - Hudson St & Hudson Pl        30209
City Hall - Washington St & 1 St                23269
Newport Pkwy                                    21932
Hamilton Park                                   20467
Newport PATH                                    19942
Hoboken Ave at Monmouth St                      19481
Marin Light Rail                                19213
Name: start_station_name, dtype: int64


=== Q1b: Top 10 Most Popular End Stations ===


Grove St PATH                                   44684
South Waterfront Walkway - Sinatra Dr & 1 St    34725
Hoboken Terminal - River St & Hudson Pl         32380
Hoboken Terminal - Hudson St & Hudson Pl        30193
City Hall - Washington St & 1 St                23572
Newport Pkwy                                    21771
Hamilton Park                                   20693
Hoboken Ave at Monmouth St                      20149
Newport PATH                                    19829
Marin Light Rail                                18883
Name: end_station_name, dtype: int64

Q1. Most Popular Stations

The most frequently used stations are concentrated around major transit and commercial hubs in Hoboken and Jersey City.
Top stations for both start and end trips include:

Grove St PATH (≈ 42K–45K trips)
South Waterfront Walkway – Sinatra Dr & 1 St
Hoboken Terminal – River St & Hudson Pl / Hudson St
These are likely high-traffic commuter zones, close to train and ferry connections, indicating strong demand around transportation nodes.

In [67]:
# Q2: Which are the months with the most trips taken?
# Create a month column
df2022['month'] = df2022['started_at'].dt.month_name()

print("\n=== Q2: Trips by Month ===")
trips_by_month = df2022['month'].value_counts()
display(trips_by_month)


=== Q2: Trips by Month ===


August       115231
July         108502
September    104247
June         103299
October       89558
May           80482
November      72709
April         62528
March         51671
December      47602
February      31911
January       26762
Name: month, dtype: int64

Trip volumes peak during summer months (June–September) and drop sharply in winter (December–February).
Confirms the hypothesis: weather and season strongly influence ridership.


In [68]:
# Q3: What are the most popular trips between stations?
print("\n=== Q3: Top 10 Most Common Start→End Station Pairs ===")
popular_routes = (
    df2022.groupby(['start_station_name', 'end_station_name'])
    .size()
    .sort_values(ascending=False)
    .head(10)
)
display(popular_routes)


=== Q3: Top 10 Most Common Start→End Station Pairs ===


start_station_name                            end_station_name                            
Hoboken Terminal - Hudson St & Hudson Pl      Hoboken Ave at Monmouth St                      5562
South Waterfront Walkway - Sinatra Dr & 1 St  South Waterfront Walkway - Sinatra Dr & 1 St    5438
Marin Light Rail                              Grove St PATH                                   4111
Hoboken Ave at Monmouth St                    Hoboken Terminal - Hudson St & Hudson Pl        4079
Grove St PATH                                 Marin Light Rail                                3972
12 St & Sinatra Dr N                          South Waterfront Walkway - Sinatra Dr & 1 St    3961
Liberty Light Rail                            Liberty Light Rail                              3696
South Waterfront Walkway - Sinatra Dr & 1 St  12 St & Sinatra Dr N                            3495
Hamilton Park                                 Grove St PATH                                   3202
Newport Pkwy      

The top start-end pairs show that:

Many riders start and end at the same or nearby stations (e.g., South Waterfront Walkway – Sinatra Dr & 1 St → same station, Newport Pkwy → Newport Pkwy),
There’s heavy movement between Hoboken Terminal, Grove St PATH, and Marin Light Rail — commuter hotspots.

They may indicate areas of bike congestion or shortages, especially during rush hours.

In [69]:
# Q4: Are the existing stations evenly distributed?
# Check the geographic spread using latitude/longitude ranges
print("\n=== Q4: Station Geographic Distribution ===")
print("Start stations latitude range:", df2022['start_lat'].min(), "-", df2022['start_lat'].max())
print("Start stations longitude range:", df2022['start_lng'].min(), "-", df2022['start_lng'].max())

print("Number of unique start stations:", df2022['start_station_id'].nunique())
print("Number of unique end stations:", df2022['end_station_id'].nunique())


=== Q4: Station Geographic Distribution ===
Start stations latitude range: 40.706495047 - 40.754992366
Start stations longitude range: -74.08896387030836 - -74.023587108
Number of unique start stations: 86
Number of unique end stations: 319


The network is spatially concentrated — mostly within the Hoboken–Jersey City corridor.
End stations are more numerous than start stations, possibly reflecting different docking or expansion patterns, which could affect supply balance.

# 04 Obtain weather data from New York LaGuardia’s weather station for 2022 and export it to a CSV file.

In [70]:
# Define your NOAA token

Token = 'HLPKfjkSRPFNvFLMQBdRxcFpoojZoEwc'

In [71]:
# Get the API for La Guardia Airport Jan 1st - Dec 31st 2022

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00014732&startdate=2022-01-01&enddate=2022-12-31', headers={'token':Token})

In [72]:
# Load the api response as a json

d = json.loads(r.text)

In [73]:
d

{'metadata': {'resultset': {'offset': 1, 'count': 365, 'limit': 1000}},
 'results': [{'date': '2022-01-01T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 116},
  {'date': '2022-01-02T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 114},
  {'date': '2022-01-03T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 14},
  {'date': '2022-01-04T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': -27},
  {'date': '2022-01-05T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 32},
  {'date': '2022-01-06T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 49},
  {'date': '2022-01-07T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attribut

In [74]:
# Secure all items in the response that correspond to TAVG

avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']

In [75]:
# Get only the date field from all average temperature readings

dates_temp = [item['date'] for item in avg_temps]

In [76]:
# Get the temperature from all average temperature readings

temps = [item['value'] for item in avg_temps]

In [77]:
temps

[116,
 114,
 14,
 -27,
 32,
 49,
 7,
 -25,
 14,
 16,
 -54,
 -19,
 40,
 48,
 -67,
 -80,
 39,
 18,
 32,
 51,
 -60,
 -59,
 -7,
 -2,
 36,
 -23,
 -42,
 1,
 -48,
 -71,
 -34,
 -17,
 23,
 64,
 58,
 -28,
 -48,
 5,
 41,
 28,
 63,
 86,
 118,
 28,
 -43,
 -47,
 16,
 116,
 99,
 1,
 -26,
 41,
 56,
 144,
 11,
 8,
 -11,
 24,
 8,
 26,
 77,
 56,
 -19,
 31,
 96,
 182,
 87,
 32,
 40,
 72,
 51,
 -23,
 53,
 116,
 136,
 82,
 139,
 131,
 143,
 104,
 107,
 72,
 57,
 96,
 104,
 59,
 -15,
 -12,
 26,
 99,
 131,
 72,
 70,
 76,
 97,
 86,
 91,
 128,
 114,
 86,
 88,
 134,
 154,
 163,
 157,
 157,
 83,
 79,
 76,
 94,
 106,
 146,
 133,
 128,
 104,
 115,
 116,
 82,
 104,
 129,
 136,
 121,
 124,
 120,
 172,
 148,
 100,
 104,
 135,
 154,
 161,
 171,
 160,
 179,
 188,
 204,
 197,
 178,
 155,
 154,
 212,
 278,
 210,
 174,
 162,
 164,
 202,
 204,
 207,
 233,
 283,
 178,
 203,
 195,
 219,
 200,
 222,
 218,
 237,
 244,
 218,
 218,
 205,
 242,
 239,
 233,
 203,
 252,
 217,
 173,
 205,
 221,
 191,
 192,
 219,
 258,
 270,
 243,
 22

In [78]:
# Storing the results in a dataframe

df_temp = pd.DataFrame()

In [79]:
# Get only date and cast it to date time; convert temperature from tenths of Celsius to normal Celsius

df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temp['avgTemp'] = [float(v)/10.0 for v in temps]

In [80]:
df_temp.head()

Unnamed: 0,date,avgTemp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4
3,2022-01-04,-2.7
4,2022-01-05,3.2


In [81]:
# Save the cleaned temperature data to CSV
df_temp.to_csv('LaGuardia_Weather_2022.csv', index=False)

# 05 Merge bike trip and weather data

In [82]:
df2022.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
month                         object
dtype: object

In [83]:
# Extract only the date part (YYYY-MM-DD) from 'started_at' into a new column

df2022['date'] = pd.to_datetime(df2022['started_at'], format='%Y-%m-%d').dt.date

In [84]:
# Convert 'date' from string to datetime

df2022['date'] = pd.to_datetime(df2022['date'])

In [85]:
df2022.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
month                         object
date                  datetime64[ns]
dtype: object

In [86]:
# Merge dataframes

df_merged = df2022.merge(df_temp, how = 'left', on = 'date', indicator = True)

In [87]:
df_merged['_merge'].value_counts(dropna = False)

both          894502
left_only          0
right_only         0
Name: _merge, dtype: int64

In [88]:
# Save the merged df to CSV
df_merged.to_csv('df_merged.csv', index=False)