# Citibike Modeling: Incorporate Exogenous Variables

This notebook incorporates the NYC recovery index as an exogenous variable in an attempt to control for the COVID-19 pandemic's effect on ridership.

In [1]:
# Standard imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import geopandas as gpd
from scipy.fft import fft, fftfreq, fftshift
import scipy
import itertools
from ast import literal_eval

# Set style
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# releases removed the old names, so prefer the new name and fall back to the
# legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Import util file

from util import dickey_fuller, process_neighborhood, report_metrics, plot_results

In [3]:
# Modeling

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
import statsmodels.graphics.tsaplots as tsa

from pmdarima.utils import decomposed_plot
from pmdarima.arima import decompose

from sklearn import metrics

#### Ridership Data Import and Prep

In [5]:
# Load the cleaned ridership file
# (low_memory=False makes pandas read the file in one pass instead of chunks)
df_citibike = pd.read_csv(
    './../clean_data/final_clean_ridership.csv',
    low_memory=False,
)

In [6]:
# Parse the 'starttime' column into datetimes, then promote it to the index
# so the frame can be resampled by time.
parsed_starttime = pd.to_datetime(df_citibike['starttime'])
df_citibike['starttime'] = parsed_starttime
df_citibike.set_index('starttime', inplace=True)

In [7]:
# Create the daily file: total ride_count per calendar day
daily_ridership = df_citibike[['ride_count']].resample('1D').sum()

# Backfill dates with 0 rides (logic for this change in EDA notebook).
# Series.replace(..., method='bfill') relies on the `method` keyword, which
# pandas deprecated in 2.1 and removed in 3.0, so the same result is built
# explicitly: hide the zero days, pull the next non-zero value backwards,
# then restore any trailing zeros (they have no later value to copy from)
# and the original dtype.
ride_counts = daily_ridership['ride_count']
daily_ridership['ride_count'] = (
    ride_counts
    .mask(ride_counts.eq(0))
    .bfill()
    .fillna(0)
    .astype(ride_counts.dtype)
)

In [8]:
# Train / test boundary: index values on or after this date are flagged as
# the "future" (test) portion when the split is built further down.
test_start = '2020-11-01'

In [12]:
# Aggregate the daily totals into weeks ending on Sunday
weekly_ridership = daily_ridership.resample('W-SUN').sum()

# Natural-log transform of the weekly totals, applied to the whole column at once
weekly_ridership['ride_count_log'] = np.log(weekly_ridership['ride_count'])

In [14]:
# Flag each week as past (0) or future (1) relative to test_start
weekly_ridership['future'] = (weekly_ridership.index >= test_start).astype('int')


# Split the log-transformed series on that flag:
# weeks before test_start form the training set, the rest form the test set
train_weekly = weekly_ridership.loc[weekly_ridership['future'] == 0, 'ride_count_log']
test_weekly = weekly_ridership.loc[weekly_ridership['future'] == 1, 'ride_count_log']

#### COVID data import and prep

In [15]:
# Load the COVID data export (CSV) from the covid_data folder
covid_df = pd.read_csv(
    './../covid_data/data-uIenF.csv',
)

In [16]:
# Parse 'Date' as datetimes, make it the index, and add an "overall_index"
# column holding the row-wise sum of all remaining columns.
covid_df = (
    covid_df
    .assign(Date=pd.to_datetime(covid_df['Date']))
    .set_index('Date')
    .assign(overall_index=lambda frame: frame.sum(axis=1))
)

In [17]:
# Preview the first five rows of the prepared COVID frame
covid_df.head(n=5)

Unnamed: 0_level_0,Covid-19 Hospitalizations Index,Unemployment Claims Index,Home Sales Index,Rental Inventory Index,Subway Mobility Index,Restaurant Reservations Index,overall_index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-04,16.7,16.8,14.6,16.1,15.8,16.7,96.7
2020-01-11,16.7,15.5,19.4,16.1,16.9,16.7,101.3
2020-01-18,16.7,15.5,19.0,16.2,16.7,16.7,100.8
2020-01-25,16.7,16.2,17.9,16.6,17.3,16.7,101.4
2020-02-01,16.7,18.4,18.2,16.3,17.2,16.7,103.5
