In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from sklearn.model_selection import train_test_split

from statsmodels.tsa.arima.model import ARIMA


In [2]:
# Load the cleaned turnstile records from CSV (one row per datetime / turnstile,
# judging by the displayed columns) and show the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/df_cleaned_year.csv')
df

Unnamed: 0,datetime,c/a,unit,scp,station,linename,entries_abs,exits_abs,weekday
0,2022-04-16 04:00:00,H007,R248,00-00-00,1 AV,L,18,192,Saturday
1,2022-04-16 08:00:00,H007,R248,00-00-00,1 AV,L,8,101,Saturday
2,2022-04-16 12:00:00,H007,R248,00-00-00,1 AV,L,62,273,Saturday
3,2022-04-16 16:00:00,H007,R248,00-00-00,1 AV,L,91,473,Saturday
4,2022-04-16 20:00:00,H007,R248,00-00-00,1 AV,L,115,484,Saturday
...,...,...,...,...,...,...,...,...,...
5486199,2022-10-14 04:00:00,R419,R326,00-05-01,ZEREGA AV,6,0,0,Friday
5486200,2022-10-14 08:00:00,R419,R326,00-05-01,ZEREGA AV,6,0,0,Friday
5486201,2022-10-14 12:00:00,R419,R326,00-05-01,ZEREGA AV,6,0,0,Friday
5486202,2022-10-14 16:00:00,R419,R326,00-05-01,ZEREGA AV,6,0,0,Friday


In [6]:
def get_station_dataframe(df, station, start='2022-04-16 04:00:00', end='2022-10-14 20:00:00'):
    '''Builds a per-timestamp entries/exits/traffic frame for a single station.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'station', 'datetime', 'entries_abs'
        and 'exits_abs'. It is not modified.
    station : str
        Value of the 'station' column to keep.
    start, end : str or datetime-like, optional
        Inclusive bounds of the returned window. The defaults reproduce the
        original hard-coded half-year range.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'datetime' (sorted, 4-hour frequency) with the numeric
        columns summed per timestamp, 'entries_abs'/'exits_abs' renamed to
        'entries'/'exits', and a 'traffic' column equal to entries + exits.

    Raises
    ------
    ValueError
        If the station's timestamps do not form a regular 4-hour grid (raised
        by pandas when the index frequency is assigned).
    '''
    station_df = df[df['station'] == station].copy()

    # Parse only this station's rows; converting the whole input column on
    # every call is wasted work and only lines up through index alignment.
    station_df['datetime'] = pd.to_datetime(station_df['datetime'])

    # Add up entries and exits across all rows sharing a timestamp.
    # numeric_only=True drops the string columns instead of trying to "sum"
    # them (the default changed to False in pandas 2.0).
    station_df = (
        station_df.groupby('datetime')
        .sum(numeric_only=True)
        .rename(columns={'entries_abs': 'entries', 'exits_abs': 'exits'})
        .sort_index()
    )

    # An offset object avoids the 'H' string alias deprecated in newer pandas.
    station_df.index.freq = pd.offsets.Hour(4)

    # Total traffic is entries plus exits
    station_df['traffic'] = station_df['entries'] + station_df['exits']

    # Restrict to the requested window (half a year by default)
    return station_df.loc[start:end]

In [7]:
def create_save_model(df, col, station, p_value, d_value, q_value, test_size=0.05):
    '''Fits an ARIMA model on one column of a station frame and pickles it.

    Parameters
    ----------
    df : pandas.DataFrame
        Station frame containing the column `col`.
    col : str
        Name of the column to model.
    station : str
        Station name; used only to build the output file name.
    p_value, d_value, q_value : int
        ARIMA order (p, d, q).
    test_size : float or int, optional
        Share (or count) of the most recent observations held out of the fit.
        Defaults to 0.05, the value that was previously hard-coded.

    Returns
    -------
    The fitted statsmodels results object, which is also saved to
    '../models/{station}_{col}_model.pkl'.
    '''
    # shuffle=False keeps the rows in their original order, so the held-out
    # part is the tail of the series. Only the training part is needed here.
    y_train, _ = train_test_split(df[col], test_size=test_size, shuffle=False)

    # An offset object avoids the 'H' string alias deprecated in newer pandas.
    arima = ARIMA(
        endog=y_train,
        order=(p_value, d_value, q_value),
        freq=pd.offsets.Hour(4),
    )

    model = arima.fit()

    # NOTE: the '../models' directory must already exist.
    model.save(f'../models/{station}_{col}_model.pkl')

    return model

In [8]:
# Stations for which entries/exits models are built.
# NOTE(review): '14 ST-UNON SQ' looks like a misspelling of UNION — confirm it
# matches the value actually present in the 'station' column.
stations = [
    '34 ST-PENN STA',
    'GRD CNTRL-42 ST',
    '34 ST-HERALD SQ',
    'TIMES SQ-42 ST',
    '42 ST-PORT AUTH',
    '23 ST',
    '86 ST',
    'FULTON ST',
    '125 ST',
    '14 ST-UNON SQ',
    'CANAL ST',
]

# Build each station's frame once, then fit and save one model per column.
for station in stations:
    station_df = get_station_dataframe(df, station)
    for col in ('entries', 'exits'):
        print(f'Creating {col} model for station: {station}')
        # ARIMA order (p, d, q) = (19, 0, 0) for every model
        create_save_model(station_df, col, station, p_value=19, d_value=0, q_value=0)

print("Finished! All models created!")

Creating entries model for station: 34 ST-PENN STA
Creating exits model for station: 34 ST-PENN STA
Creating entries model for station: GRD CNTRL-42 ST
Creating exits model for station: GRD CNTRL-42 ST
Creating entries model for station: 34 ST-HERALD SQ
Creating exits model for station: 34 ST-HERALD SQ
Creating entries model for station: TIMES SQ-42 ST
Creating exits model for station: TIMES SQ-42 ST
Creating entries model for station: 42 ST-PORT AUTH
Creating exits model for station: 42 ST-PORT AUTH
Creating entries model for station: 23 ST
Creating exits model for station: 23 ST
Creating entries model for station: 86 ST
Creating exits model for station: 86 ST
Creating entries model for station: FULTON ST
Creating exits model for station: FULTON ST
Creating entries model for station: 125 ST
Creating exits model for station: 125 ST
Creating entries model for station: 14 ST-UNON SQ
Creating exits model for station: 14 ST-UNON SQ
Creating entries model for station: CANAL ST
Creating exit