This notebook totals the bike counts for the most predominant bike counters then pulls in weather data to be used as input features for modelling.
There two weather sources for data, though it could all be retrieved from Open Mateo (which was discovered later in the project)

In [96]:
# data
import pandas as pd
import numpy as np

# utils
from datetime import datetime, date, timedelta
import pickle
import time

# NOAA
import requests
import json

Token='' #### removed
today = date.today().strftime('%Y-%m-%d')


## import bike data

In [None]:
# pull biking data
data = pd.read_pickle('data/all_data.pkl')

# limit data to the below counters and after 2013
counters = ['Manhattan Br', 'Brooklyn Br', 'Williamsburg Br', 'Kent Ave', 'Queensboro Br', 'Prospect Park W']
data = data[data['name'].isin(counters)].copy()
data = data[data['date'].dt.year > 2013]

## import weather data from NOAA

In [193]:
# set params
dataset = 'GHCND'
station_id = 'GHCND:USW00014732' # LGA
data_types = ['TAVG', 'TMAX', 'TMIN', 'PRCP', 'AWND', 'SNOW', 'SNWD']

start_year = 2014
end_year = data['date'].dt.year.max()+1

results = []

# NOAA limits responses to the first 1,000 hence they must be performed year- and data_type-wise
for i in range(0, len(data_types) - 1, 2):
    types = data_types[i:i+2]
    data_request = ','.join(types)
    dt_results = []

    print('working on '+data_request)

    for year in range(start_year, end_year):
        year = str(year)

        # temps
        # make the api call
        r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid='+dataset+'&limit=1000&stationid='+station_id+'&datatypeid='+data_request+'&startdate='+year+'-01-01&enddate='+year+'-12-31&units=standard', headers={'token':Token})
        time.sleep(0.1)
        # load the api response as a json
        d = json.loads(r.text)
        results += d['results']

if len(data_types) % 2 == 1: # pairwise looping will miss the last element if it has an odd number
    data_request = data_types[-1]
    print('working on '+data_request)

    for year in range(start_year, end_year):
        year = str(year)

        # temps
        # make the api call
        r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid='+dataset+'&limit=1000&stationid='+station_id+'&datatypeid='+data_request+'&startdate='+year+'-01-01&enddate='+year+'-12-31&units=standard', headers={'token':Token})
        time.sleep(0.1)
        # load the api response as a json
        d = json.loads(r.text)
        results += d['results']



working on TAVG,TMAX
working on TMIN,PRCP
working on AWND,SNOW
working on SNWD,TSUN


In [194]:
# save
with open('data/raw_wthr.pkl', 'wb') as f:
    pickle.dump(results, f)

In [396]:
# get data types we want
wthr = pd.DataFrame(columns=['date', 'param', 'value'])

for dt in data_types:
    dt_data = [[x['date'], x['datatype'], x['value']] for x in results if x['datatype']==dt]
    dt_df = pd.DataFrame(dt_data, columns=['date', 'param', 'value'])
    wthr = pd.concat([wthr, dt_df])

  wthr = pd.concat([wthr, dt_df])
  wthr = pd.concat([wthr, dt_df])


In [397]:
# remove dupes then pivot so each data type has its own column
wthr = wthr.drop_duplicates()\
        .pivot(index='date', columns=['param'], values=['value']).reset_index()\
        .dropna()
wthr['date'] = pd.to_datetime(wthr['date'])
wthr.columns = ['date', 'AWND', 'PRCP', 'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN']
wthr.columns = [x.lower() for x in wthr.columns]

## pull weather data from Open Mateo

In [398]:
# discovered this other source with hourly data for many features
# requests are simple, no api key needed
url = f'https://archive-api.open-meteo.com/v1/archive?latitude=40.776&longitude=-73.8727&start_date={start_year}-01-01&end_date={today}&hourly=temperature_2m,dewpoint_2m,cloudcover_low,is_day,direct_radiation,precipitation,apparent_temperature,windspeed_10m&timezone=America%2FNew_York&temperature_unit=fahrenheit&precipitation_unit=inch&windspeed_unit=mph'
r = requests.get(url)
d = json.loads(r.text)

In [399]:
# save
with open('data/raw_wthr_2.pkl', 'wb') as f:
    pickle.dump(d, f)

In [400]:
# parse json to retrieve data and make df
wthr2 = pd.DataFrame({
    'date':d['hourly']['time'],
    'dew':d['hourly']['dewpoint_2m'],
    'is_day':d['hourly']['is_day'],
    'rad':d['hourly']['direct_radiation'],
    'precip':d['hourly']['precipitation'],
    'temp':d['hourly']['temperature_2m'],
    'real_feel':d['hourly']['apparent_temperature'],
    'wind':d['hourly']['windspeed_10m']
}).dropna()

In [401]:
# create daytime features by masking with is_day
wthr2['day_tmin'] = wthr2['is_day'] * wthr2['temp']
wthr2['day_precip'] = wthr2['is_day'] * wthr2['precip']
wthr2['day_real_feel'] = wthr2['is_day'] * wthr2['real_feel']
wthr2['day_dew'] = wthr2['is_day'] * wthr2['dew']
wthr2['day_wind'] = wthr2['is_day'] * wthr2['wind']

#day dew and day real feel will average zeroes unless you fill with na
wthr2.loc[wthr2['is_day']==0., 'day_dew'] = np.nan
wthr2.loc[wthr2['is_day']==0., 'day_real_feel'] = np.nan
wthr2.loc[wthr2['is_day']==0., 'day_wind'] = np.nan

wthr2['date'] = pd.to_datetime(wthr2['date'])

In [402]:
# group by the date
wthr2 = wthr2.groupby(wthr2['date'].dt.date).agg({
    'dew':'mean',
    'is_day':'sum',
    'rad':'sum',
    'precip':'sum',
    'day_tmin':'min',
    'day_dew':'mean',
    'day_precip':'sum',
    'real_feel':'mean',
    'day_real_feel':'mean',
    'wind':'mean',
    'day_wind':'mean'
    }).reset_index()

wthr2['date'] = pd.to_datetime(wthr2['date'])

## merge two weather datasets to count df

In [403]:
# merge new OM data with NOAA data
wthr = wthr.merge(wthr2, how='left', on='date')

In [406]:
# group data by date
df = data[['date', 'counts']]\
        .groupby([pd.Grouper(key='date', freq='D')])\
        .sum()\
        .reset_index()

df = df.merge(wthr, on='date')

In [407]:
# save
df.to_pickle('data/by_day_weather.pkl')
wthr.to_pickle('data/weather_data.pkl')