### This script contains the below:
1. Import libraries
2. Importing bike trips data to merge it
3. Get weather data using NOAA's API
4. Merge bike trip and weather data

# 01 Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

# 02 Importing data & merge

In [23]:
# The code below automates the process of importing and combining all monthly CitiBike CSV files for 2022.
# 1. First, it walks through the folder "Original Data" and creates a list (filepaths) containing the full paths 
#    of all CSV files in the folder and its subfolders.
# 2. Then, it uses a generator expression inside pd.concat() to read each CSV file into a temporary DataFrame.
# 3. All these monthly DataFrames are merged (stacked vertically) into one large DataFrame called 'df'.
# 4. The parameter 'ignore_index=True' resets the row index after concatenation, creating a clean continuous index.
# 5. 'low_memory=False' ensures that pandas reads large CSVs more reliably when columns have mixed data types.

In [16]:
# Create a list with all files in the folder using a list comprehension

folderpath = r"Original Data" 
filepaths = []
for root, dirs, files in os.walk(folderpath):
    for name in files:
        if name.endswith('.csv'):
            filepaths.append(os.path.join(root, name))

In [17]:
filepaths

['Original Data/202211-citibike-tripdata/202211-citibike-tripdata_3.csv',
 'Original Data/202211-citibike-tripdata/202211-citibike-tripdata_2.csv',
 'Original Data/202211-citibike-tripdata/202211-citibike-tripdata_1.csv',
 'Original Data/202208-citibike-tripdata/202208-citibike-tripdata_3.csv',
 'Original Data/202208-citibike-tripdata/202208-citibike-tripdata_2.csv',
 'Original Data/202208-citibike-tripdata/202208-citibike-tripdata_1.csv',
 'Original Data/202208-citibike-tripdata/202208-citibike-tripdata_4.csv',
 'Original Data/202210-citibike-tripdata/202210-citibike-tripdata_1.csv',
 'Original Data/202210-citibike-tripdata/202210-citibike-tripdata_2.csv',
 'Original Data/202210-citibike-tripdata/202210-citibike-tripdata_3.csv',
 'Original Data/202209-citibike-tripdata/202209-citibike-tripdata_4.csv',
 'Original Data/202209-citibike-tripdata/202209-citibike-tripdata_1.csv',
 'Original Data/202209-citibike-tripdata/202209-citibike-tripdata_2.csv',
 'Original Data/202209-citibike-tripda

In [20]:
# merging the data
df = pd.concat((pd.read_csv(f, low_memory=False) for f in filepaths), ignore_index=True)

In [24]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0CC1D7F53FA21F9A,electric_bike,2022-11-02 05:54:11.481,2022-11-02 06:08:27.471,Park Ave & E 162 St,8016.07,Jerome Ave & W 193 St,8619.02,40.825701,-73.915644,40.86659,-73.89794,casual
1,EDAECDAE6BF903DE,classic_bike,2022-11-02 18:20:15.611,2022-11-02 19:00:24.787,Broadway & W 61 St,7014.12,Leonard St & Church St,5359.11,40.77003,-73.981968,40.717571,-74.005549,casual
2,37C06FED49069B80,electric_bike,2022-11-04 18:39:39.873,2022-11-04 18:44:01.851,W 54 St & 11 Ave,6955.05,11 Ave & W 59 St,7059.01,40.768333,-73.992573,40.771497,-73.99046,member
3,63751973E9A95FB1,classic_bike,2022-11-09 18:02:29.616,2022-11-09 18:19:28.693,Broadway & W 41 St,6560.01,11 Ave & W 59 St,7059.01,40.755136,-73.98658,40.771497,-73.99046,member
4,F7410DEDF925FBA8,electric_bike,2022-11-12 10:23:11.805,2022-11-12 10:28:24.794,William St & Pine St,5065.12,Leonard St & Church St,5359.11,40.707317,-74.008854,40.717571,-74.005549,member


In [50]:
# Convert the columns to datetime (if not already)
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Get the earliest and latest dates for each column
earliest_start = df['started_at'].min()
latest_start = df['started_at'].max()

earliest_end = df['ended_at'].min()
latest_end = df['ended_at'].max()

# Print the results
print("Earliest start date:", earliest_start)
print("Latest start date:", latest_start)
print("Earliest end date:", earliest_end)
print("Latest end date:", latest_end)

Earliest start date: 2021-01-30 17:30:45.544000
Latest start date: 2022-12-31 23:58:19.206000
Earliest end date: 2022-01-01 00:00:09.459000
Latest end date: 2022-12-31 23:59:55.708000


In [51]:
df2022 = df[(df['started_at'] >= '2022-01-01') & (df['started_at'] <= '2022-12-31')]

In [52]:
df2022.shape

(29806700, 14)

In [53]:
df2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29806700 entries, 0 to 29838805
Data columns (total 14 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
 13  date                datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(7)
memory usage: 3.3+ GB


# 03 Obtain weather data from New York LaGuardia’s weather station for 2022 and export it to a CSV file.

In [28]:
# Define your NOAA token

Token = 'HLPKfjkSRPFNvFLMQBdRxcFpoojZoEwc'

In [29]:
# Get the API for La Guardia Airport Jan 1st - Dec 31st 2022

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00014732&startdate=2022-01-01&enddate=2022-12-31', headers={'token':Token})

In [30]:
# Load the api response as a json

d = json.loads(r.text)

In [31]:
d

{'metadata': {'resultset': {'offset': 1, 'count': 365, 'limit': 1000}},
 'results': [{'date': '2022-01-01T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 116},
  {'date': '2022-01-02T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 114},
  {'date': '2022-01-03T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 14},
  {'date': '2022-01-04T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': -27},
  {'date': '2022-01-05T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 32},
  {'date': '2022-01-06T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 49},
  {'date': '2022-01-07T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attribut

In [32]:
# Secure all items in the response that correspond to TAVG

avg_temps = [item for item in d['results'] if item['datatype']=='TAVG']

In [34]:
# Get only the date field from all average temperature readings

dates_temp = [item['date'] for item in avg_temps]

In [35]:
# Get the temperature from all average temperature readings

temps = [item['value'] for item in avg_temps]

In [36]:
temps

[116,
 114,
 14,
 -27,
 32,
 49,
 7,
 -25,
 14,
 16,
 -54,
 -19,
 40,
 48,
 -67,
 -80,
 39,
 18,
 32,
 51,
 -60,
 -59,
 -7,
 -2,
 36,
 -23,
 -42,
 1,
 -48,
 -71,
 -34,
 -17,
 23,
 64,
 58,
 -28,
 -48,
 5,
 41,
 28,
 63,
 86,
 118,
 28,
 -43,
 -47,
 16,
 116,
 99,
 1,
 -26,
 41,
 56,
 144,
 11,
 8,
 -11,
 24,
 8,
 26,
 77,
 56,
 -19,
 31,
 96,
 182,
 87,
 32,
 40,
 72,
 51,
 -23,
 53,
 116,
 136,
 82,
 139,
 131,
 143,
 104,
 107,
 72,
 57,
 96,
 104,
 59,
 -15,
 -12,
 26,
 99,
 131,
 72,
 70,
 76,
 97,
 86,
 91,
 128,
 114,
 86,
 88,
 134,
 154,
 163,
 157,
 157,
 83,
 79,
 76,
 94,
 106,
 146,
 133,
 128,
 104,
 115,
 116,
 82,
 104,
 129,
 136,
 121,
 124,
 120,
 172,
 148,
 100,
 104,
 135,
 154,
 161,
 171,
 160,
 179,
 188,
 204,
 197,
 178,
 155,
 154,
 212,
 278,
 210,
 174,
 162,
 164,
 202,
 204,
 207,
 233,
 283,
 178,
 203,
 195,
 219,
 200,
 222,
 218,
 237,
 244,
 218,
 218,
 205,
 242,
 239,
 233,
 203,
 252,
 217,
 173,
 205,
 221,
 191,
 192,
 219,
 258,
 270,
 243,
 22

In [37]:
# Storing the results in a dataframe

df_temp = pd.DataFrame()

In [38]:
# Get only date and cast it to date time; convert temperature from tenths of Celsius to normal Celsius

df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in dates_temp]
df_temp['avgTemp'] = [float(v)/10.0 for v in temps]

In [39]:
df_temp.head()

Unnamed: 0,date,avgTemp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4
3,2022-01-04,-2.7
4,2022-01-05,3.2


In [40]:
# Save the cleaned temperature data to CSV
df_temp.to_csv('LaGuardia_Weather_2022.csv', index=False)

# 04 Merge bike trip and weather data

In [54]:
df2022.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
date                  datetime64[ns]
dtype: object

In [55]:
# Convert 'started_at' from string to datetime (day comes first in the input format)

df2022['started_at'] = pd.to_datetime(df2022['started_at'], dayfirst = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2022['started_at'] = pd.to_datetime(df2022['started_at'], dayfirst = True)


In [45]:
# Extract only the date part (YYYY-MM-DD) from 'started_at' into a new column

df2022['date'] = pd.to_datetime(df2022['started_at'], format='%Y-%m-%d').dt.date

In [56]:
# Convert 'date' from string to datetime

df2022['date'] = pd.to_datetime(df2022['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2022['date'] = pd.to_datetime(df2022['date'])


In [57]:
# Merge dataframes

df_merged = df2022.merge(df_temp, how = 'left', on = 'date', indicator = True)

In [58]:
df_merged['_merge'].value_counts(dropna = False)

both          29806700
left_only            0
right_only           0
Name: _merge, dtype: int64