## 01. Import Libraries

In [32]:
# import libraries

import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

## 02. Import Data

In [75]:
# importing Bike Sharing Data from Each Quarter (2018)

# Create a list with all files in the folder using a list comprehension
# pulling the data from the Data file of our project folder

# WHY are we using a list comprehension:
# in this case we have 12 files, and loading/merging each file individually can be tedious and cumbersome
# in order to streamline, we are using a list comprehension.


# first, we are creating a folderpath variable and indicating that it is from our data folder (hence the r"Data")
# then we get into the list comprehension.. start by creating filepath (this will be our list and return the name of all files in the list)
# breaking down each part individually we have our loop:
    # for name in os.listdir(folderpath) if name.endswith(.csv') : this will loop through the folder and add it to the list if it meets the name reqs
    # os.path.join(folderpath, name) will join the folderpath with the file name, creating the path for our data :)


# after my initial attempt, a few other files populated, so decided to add the "if ends with .csv" to filter those out.

folderpath = r"Data"
filepath = [os.path.join(folderpath, name) for name in os.listdir(folderpath) if name.endswith('.csv')]

filepath

['Data/JC-202203-citibike-tripdata.csv',
 'Data/JC-202201-citibike-tripdata.csv',
 'Data/JC-202209-citibike-tripdata.csv',
 'Data/JC-202211-citibike-tripdata.csv',
 'Data/JC-202207-citbike-tripdata.csv',
 'Data/JC-202205-citibike-tripdata.csv',
 'Data/JC-202202-citibike-tripdata.csv',
 'Data/JC-202208-citibike-tripdata.csv',
 'Data/JC-202206-citibike-tripdata.csv',
 'Data/JC-202210-citibike-tripdata.csv',
 'Data/JC-202204-citibike-tripdata.csv',
 'Data/JC-202212-citibike-tripdata.csv']

In [46]:
# now that all data sets are in the list, time to read & concatenate the data sets:

df = pd.concat((pd.read_csv(f) for f in filepath), ignore_index = True)

df.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,3255D3E3F33CDC45,classic_bike,2022-03-18 15:38:17,2022-03-18 15:45:34,Mama Johnson Field - 4 St & Jackson St,HB404,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.74314,-74.040041,40.736982,-74.027781,casual
1,17FA5604A37338F9,electric_bike,2022-03-04 16:44:48,2022-03-04 16:50:45,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member
2,7DEC9ADDB8D6BBE1,electric_bike,2022-03-13 17:44:32,2022-03-13 17:54:44,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member
3,9D69F74EEF231A2E,classic_bike,2022-03-13 15:33:47,2022-03-13 15:41:22,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member
4,C84AE4A9D78A6347,classic_bike,2022-03-11 12:21:18,2022-03-11 12:33:24,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member


In [47]:
df.tail(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
895480,D438F1622839AC50,classic_bike,2022-12-06 15:43:38,2022-12-06 15:53:57,Dey St,JC065,Riverview Park,JC057,40.737828,-74.067083,40.744319,-74.043991,member
895481,747A63A8E782D171,electric_bike,2022-12-08 08:17:51,2022-12-08 08:23:33,9 St HBLR - Jackson St & 8 St,HB305,City Hall - Washington St & 1 St,HB105,40.747907,-74.038412,40.73736,-74.03097,casual
895482,AE090858CFDE6E82,electric_bike,2022-12-23 14:10:07,2022-12-23 14:14:18,Mama Johnson Field - 4 St & Jackson St,HB404,City Hall - Washington St & 1 St,HB105,40.74314,-74.040041,40.73736,-74.03097,member
895483,B3CC8E70AF4E259C,classic_bike,2022-12-02 04:43:25,2022-12-02 04:46:55,Mama Johnson Field - 4 St & Jackson St,HB404,City Hall - Washington St & 1 St,HB105,40.743135,-74.04008,40.73736,-74.03097,member
895484,176B601F21327350,classic_bike,2022-12-30 14:50:17,2022-12-30 14:55:37,14 St Ferry - 14 St & Shipyard Ln,HB202,City Hall - Washington St & 1 St,HB105,40.752747,-74.024035,40.73736,-74.03097,member


In [50]:
df.shape

(895485, 13)

In [129]:
# going to export this to .csv so we have it handy for future use:

df.to_csv('combined_bike_rides_2022.csv')

In [60]:
# import the weather data 
# noaa la guardia airport id: GHCND:USW00014732

In [56]:
# Define NOAA token:

Token = 'zfMjWUYOnNKcTKqvOfbLLpElqHIpOiRo'

In [58]:
# request for data with parameters:

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TAVG&limit=1000&stationid=GHCND:USW00014732&startdate=2022-01-01&enddate=2022-12-31', headers={'token':Token})

In [88]:
# load API response as JSON file:

d = json.loads(r.text)  

# double check to make sure this worked :)

d

{'metadata': {'resultset': {'offset': 1, 'count': 365, 'limit': 1000}},
 'results': [{'date': '2022-01-01T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 116},
  {'date': '2022-01-02T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 114},
  {'date': '2022-01-03T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 14},
  {'date': '2022-01-04T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': -27},
  {'date': '2022-01-05T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 32},
  {'date': '2022-01-06T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attributes': 'H,,S,',
   'value': 49},
  {'date': '2022-01-07T00:00:00',
   'datatype': 'TAVG',
   'station': 'GHCND:USW00014732',
   'attribut

## 03. Wrangle JSON data:

In [90]:
# for the purpose of this project, we are only interested in the average daily temp and the date
# when looking at d (our JSON data) we can see that we have our results, 'date' and 'TAVG' 

In [111]:
# for avg temps, use a list comprehension again:

avg_temp = [item for item in d['results'] if item['datatype']=='TAVG']

# this willstore/access the average temp in our d (JSON results) as long as the datatype within d is == TAVG (our average temp)

In [113]:
# now we want to get the date for each of these averages:

date_of_temp = [item['date'] for item in avg_temp]

# date_of_temp

In [115]:
# now to get the temperature from our avg_temp

temp = [item['value'] for item in avg_temp]

# now that we have a list of dates, and a list of temps, we can combine this with our data set (bike rides)

In [172]:
# before we can merge this to our data set, we will need to make a new dataframe and populate it with this temp and date_of_temp

df_temp = pd.DataFrame()

df_temp['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in date_of_temp]

# this will strip the date column of any attribute aside from date (hour, minute, second, etc)
# now to make sure its in the format we want!

df_temp['date'] = pd.to_datetime(df_temp['date'], format='%Y-%m-%d').dt.date

df_temp

Unnamed: 0,date
0,2022-01-01
1,2022-01-02
2,2022-01-03
3,2022-01-04
4,2022-01-05
...,...
360,2022-12-27
361,2022-12-28
362,2022-12-29
363,2022-12-30


In [174]:
# now lets add in our average temp in a new column!

df_temp['average_temp'] = [float(x)/10.0 for x in temp]

# this is going to make sure the temperature from the temp list created above is a float and rounded to the the nearest tenth of a degree

In [176]:
df_temp

Unnamed: 0,date,average_temp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4
3,2022-01-04,-2.7
4,2022-01-05,3.2
...,...,...
360,2022-12-27,-0.7
361,2022-12-28,3.4
362,2022-12-29,6.4
363,2022-12-30,9.3


In [180]:
# we have two columns and 365 days, meaning we did this correctly!
# going to export this to a .csv (just in case), then begin merging with our other dataframe

df_temp.to_csv(r"Data/average_daily_temps_2022.csv")

## 04. Merge Both Data Sets

In [138]:
# it's now time to merge the df_temp with our bike rides dataset (df)
# we will be using the date as the common key (column) with which to complete the merge!

In [162]:
# first we need to ensure that the date in the rides df is stripped of any time and ONLY represents the date
# following the same process as above

df['started_at'] = pd.to_datetime(df['started_at'], dayfirst = True)

# this is taking the 'started_at' column from our dataframe and converting it to date_time format

df['date'] = pd.to_datetime(df['started_at'], format='%Y-%m-%d').dt.date 

# this step is creating a new 'date' column within our dataframe and populating it with JUST the date from our 'started_at' column

df.head(5)



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date
0,3255D3E3F33CDC45,classic_bike,2022-03-18 15:38:17,2022-03-18 15:45:34,Mama Johnson Field - 4 St & Jackson St,HB404,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.74314,-74.040041,40.736982,-74.027781,casual,2022-03-18
1,17FA5604A37338F9,electric_bike,2022-03-04 16:44:48,2022-03-04 16:50:45,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-04
2,7DEC9ADDB8D6BBE1,electric_bike,2022-03-13 17:44:32,2022-03-13 17:54:44,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-13
3,9D69F74EEF231A2E,classic_bike,2022-03-13 15:33:47,2022-03-13 15:41:22,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-13
4,C84AE4A9D78A6347,classic_bike,2022-03-11 12:21:18,2022-03-11 12:33:24,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-11


In [182]:
# now that we have a correctly formatted 'date' column in each dataframe, we can complete the merge!

# creating a new dataframe for the merged data!

df_merged = pd.DataFrame()

# we are using a left merge here! 
# Why? Bc our 'df' dataframe has more rows and we want to ensure that each entry is assigned a temp that corresponds with the date!
# this means that there will be bike rides with the same avg temp ( as more than one ride was taken daily)

df_merged = df.merge(df_temp, how = 'left', on = 'date', indicator = True) 

# we are merging on the column 'date' as mentioned above!
# check to see if it worked!

df_merged

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,average_temp,_merge
0,3255D3E3F33CDC45,classic_bike,2022-03-18 15:38:17,2022-03-18 15:45:34,Mama Johnson Field - 4 St & Jackson St,HB404,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.743140,-74.040041,40.736982,-74.027781,casual,2022-03-18,13.9,both
1,17FA5604A37338F9,electric_bike,2022-03-04 16:44:48,2022-03-04 16:50:45,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-04,-1.9,both
2,7DEC9ADDB8D6BBE1,electric_bike,2022-03-13 17:44:32,2022-03-13 17:54:44,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-13,-2.3,both
3,9D69F74EEF231A2E,classic_bike,2022-03-13 15:33:47,2022-03-13 15:41:22,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-13,-2.3,both
4,C84AE4A9D78A6347,classic_bike,2022-03-11 12:21:18,2022-03-11 12:33:24,Baldwin at Montgomery,JC020,Grove St PATH,JC005,40.723659,-74.064194,40.719586,-74.043117,member,2022-03-11,7.2,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895480,D438F1622839AC50,classic_bike,2022-12-06 15:43:38,2022-12-06 15:53:57,Dey St,JC065,Riverview Park,JC057,40.737828,-74.067083,40.744319,-74.043991,member,2022-12-06,9.3,both
895481,747A63A8E782D171,electric_bike,2022-12-08 08:17:51,2022-12-08 08:23:33,9 St HBLR - Jackson St & 8 St,HB305,City Hall - Washington St & 1 St,HB105,40.747907,-74.038412,40.737360,-74.030970,casual,2022-12-08,11.6,both
895482,AE090858CFDE6E82,electric_bike,2022-12-23 14:10:07,2022-12-23 14:14:18,Mama Johnson Field - 4 St & Jackson St,HB404,City Hall - Washington St & 1 St,HB105,40.743140,-74.040041,40.737360,-74.030970,member,2022-12-23,7.5,both
895483,B3CC8E70AF4E259C,classic_bike,2022-12-02 04:43:25,2022-12-02 04:46:55,Mama Johnson Field - 4 St & Jackson St,HB404,City Hall - Washington St & 1 St,HB105,40.743135,-74.040080,40.737360,-74.030970,member,2022-12-02,3.4,both


In [184]:
# this appears to have worked! 
# we know from above that our df has 895485 rows, so lets check to make sure each row was assigned an average_temp

df_merged['_merge'].value_counts(dropna = False)

_merge
both          895485
left_only          0
right_only         0
Name: count, dtype: int64

In [188]:
# the _merge confirms that all rows have been matched on the 'date' column, so we are good to go and ready to download this as a .csv

df_merged.to_csv(r"Data/merged_temp_ride_data.csv")