In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

In [3]:
# Folder holding the Citi Bike zip archives and the CSV files extracted from them.
# Set the CITIBIKE_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset or empty, the original local path is used.
folderpath = os.environ.get("CITIBIKE_DATA_DIR") or (
    r"C:\Users\msyeu\New-York-Citibike-Analysis-2022\2022-citibike-tripdata"
)

In [5]:
# Build the full path of every entry currently in the data folder.
filepaths = [
    os.path.join(folderpath, entry)
    for entry in os.listdir(folderpath)
]

In [7]:
# Print the collected paths to check what is in the data folder.
print(filepaths)

['C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202201-citibike-tripdata.zip', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202201-citibike-tripdata_1.csv', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202201-citibike-tripdata_2.csv', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202202-citibike-tripdata.zip', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202202-citibike-tripdata_1.csv', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202202-citibike-tripdata_2.csv', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202203-citibike-tripdata.zip', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202203-citibike-tripdata_1.csv', 'C:\\Users\\msyeu\\New-York-Citibike-Analysis-2022\\2022-citibike-tripdata\\202203-citibike-tripdata_2.csv', 'C:\\Users\\msyeu\\New-Y

- **`folderpath = r"..."`**: The `r` prefix makes the string a raw string, so the backslashes in the Windows path are treated literally rather than as escape sequences.

- **`filepaths = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]`**: This line creates a list of file paths by:
    1. Using `os.listdir(folderpath)` to list the names of all entries in the data folder.
    2. Joining the folder path with each file name using `os.path.join()`.

In [13]:
import zipfile

In [15]:
# Unpack every zip archive found in the data folder, writing its contents back
# into that same folder.
zip_names = [entry for entry in os.listdir(folderpath) if entry.endswith('.zip')]
for zip_name in zip_names:
    archive_path = os.path.join(folderpath, zip_name)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(folderpath)

In [19]:
# After extracting, collect the path of every CSV file in the data folder.
# os.listdir() returns names in arbitrary order, so sort them to keep the row
# order of the concatenated DataFrame reproducible across runs and machines.
filepaths = [
    os.path.join(folderpath, name)
    for name in sorted(os.listdir(folderpath))
    if name.endswith('.csv')
]

In [22]:
# Load every CSV and stack them into a single DataFrame with a fresh 0..n-1 index.
# The station id columns are identifiers, not numbers: read them as strings so
# they are not parsed as floats (which, for example, drops trailing zeros).
df_citibike = pd.concat(
    [
        pd.read_csv(
            f,
            dtype={'start_station_id': 'str', 'end_station_id': 'str'},
            low_memory=False,
        )
        for f in filepaths
    ],
    ignore_index=True,
)

In [24]:
# Preview the first rows of the combined trip data.
df_citibike.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831,member
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831,member


In [30]:
# Base URL of the NOAA Climate Data Online (CDO) v2 `data` endpoint. The query
# parameters and the auth header are defined in separate cells below.
base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?"

In [32]:
# Read the NOAA CDO API token from the environment instead of hard-coding it, so
# the secret is never stored in the notebook or committed to version control.
token = os.environ.get("NOAA_CDO_TOKEN")
if not token:
    # Fail here with a clear message rather than sending an unauthenticated request.
    raise RuntimeError(
        "NOAA_CDO_TOKEN is not set; export your NOAA CDO API token before running this notebook."
    )

In [34]:
# Query settings for the NOAA request: GHCND observations of average temperature
# (TAVG) at the LaGuardia Airport station, covering all of 2022.
params = dict(
    datasetid='GHCND',
    stationid='GHCND:USW00014732',  # LaGuardia station ID
    startdate='2022-01-01',
    enddate='2022-12-31',
    datatypeid='TAVG',  # Average temperature
    limit=1000,
)

In [36]:
# The API token is sent as a `token` request header.
headers = {'token': token}

In [38]:
# Query the NOAA CDO API. requests applies no timeout by default, so set one to
# keep the cell from hanging indefinitely if the service does not respond.
response = requests.get(base_url, headers=headers, params=params, timeout=30)
# Surface HTTP errors (e.g. a rejected token) here rather than as a confusing
# failure when the JSON body is unpacked later.
response.raise_for_status()

In [39]:
# Decode the JSON body and keep only the records stored under 'results'.
# NOTE(review): raises KeyError if the body has no 'results' key - presumably the
# case when the API returns an error or no data; confirm and handle if needed.
data = response.json()['results']

In [46]:
# Split the API records into two parallel lists: parsed timestamps and temperatures.
dates = []
temps = []
for record in data:
    dates.append(datetime.strptime(record['date'], "%Y-%m-%dT%H:%M:%S"))
    temps.append(record['value'] / 10)  # Convert temp from tenths of Celsius to Celsius

In [60]:
# Build a two-column DataFrame pairing each date with its average temperature
# in degrees Celsius.
df_temps = pd.DataFrame({'Date': dates, 'AvgTemp(C)': temps})

In [62]:
# Save the weather data to CSV (without the index) so it can be reloaded later
# without calling the API again.
df_temps.to_csv('laguardia_weather_2022.csv', index=False)

In [68]:
# Check the last rows to confirm the data runs through the end of 2022.
df_temps.tail()

Unnamed: 0,Date,AvgTemp(C)
360,2022-12-27,-0.7
361,2022-12-28,3.4
362,2022-12-29,6.4
363,2022-12-30,9.3
364,2022-12-31,8.2


In [46]:
# Reload the saved weather data from disk. No date parsing is requested here,
# so 'Date' is converted in a separate step.
df_temps = pd.read_csv('laguardia_weather_2022.csv')

In [48]:
# Convert 'Date' to Python date objects so it has the same type as the trip
# 'date' key it is merged against.
df_temps['Date'] = pd.to_datetime(df_temps['Date']).dt.date

In [50]:
# Derive the calendar date each ride started on (time of day dropped); this is
# the key used to join rides to the daily weather table.
df_citibike['date'] = pd.to_datetime(df_citibike['started_at']).dt.date

In [58]:
# Attach the daily average temperature to every ride by start date. The left join
# keeps all rides even when no weather row matches, and indicator=True adds a
# `_merge` column recording whether each row found a match.
df_merged = df_citibike.merge(df_temps, left_on='date', right_on='Date', how='left', indicator=True)

In [None]:
# Write the merged rides-plus-weather table to CSV, omitting the index.
df_merged.to_csv('citibike_weather_merged.csv', index=False)

In [60]:
# Count merge outcomes: `left_only` rows are rides whose start date has no
# matching weather record.
df_merged['_merge'].value_counts(dropna=False)

_merge
both          29838166
left_only          640
right_only           0
Name: count, dtype: int64