In [46]:
import pandas as pd
import numpy as np 
import os
import plotly.express as px
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from matplotlib import pyplot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_percentage_error
import random

# For investigating timeseries data
from sklearn import preprocessing
from sklearn.model_selection import ParameterGrid
from prophet import Prophet
from statsmodels.tsa.seasonal import seasonal_decompose

# For modeling
from neuralforecast import NeuralForecast
from neuralforecast.models import LSTM, NHITS, RNN
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost
from prophet import Prophet

### Reading Data

In [47]:
# Reading Data
base_path =  os.getcwd()
file_name = 'Traffic_Data.xlsx'
total_path = base_path + '//Data//' 
df = pd.read_excel(total_path + file_name, sheet_name='Sheet1')
df.head(10)


Unnamed: 0,State,Region,STATIONS,CMILES,PMILES,Month,Month_2,Year,Date
0,Connecticut,Northeast,14,2546,2432,November,11,2023,2023-11-01
1,Maine,Northeast,130,1177,1148,November,11,2023,2023-11-01
2,Massachusetts,Northeast,227,5148,5013,November,11,2023,2023-11-01
3,New Hampshire,Northeast,150,1062,1034,November,11,2023,2023-11-01
4,New Jersey,Northeast,73,6569,6339,November,11,2023,2023-11-01
5,New York,Northeast,110,9144,8825,November,11,2023,2023-11-01
6,Pennsylvania,Northeast,57,8610,8408,November,11,2023,2023-11-01
7,Rhode Island,Northeast,26,661,659,November,11,2023,2023-11-01
8,Vermont,Northeast,35,543,531,November,11,2023,2023-11-01
9,Delaware,South Atlantic,0,923,900,November,11,2023,2023-11-01


### Filtering for States

In [48]:
states_of_interest = ['Oregon', 'Washington', 'California']

# Filtering for states of interest
df = df[df['State'].isin(states_of_interest)]
df = df.sort_values(by = ['Date']).reset_index()
df.head(10)

for val in states_of_interest:
    # Replacing outlier January 2023 value
    mean_val = df[(df['Month'] == 'January') &
              (df['Year'] != 2023) &
              (df['State'] == val)]['CMILES'].mean()
    df['CMILES'] = np.where((df['Month'] == 'January') & (df['Year'] == 2023) & (df['State'] == val), mean_val, df['CMILES'])

### Plotting Data

In [49]:
fig = px.scatter(df.reindex(), x="Date", y="CMILES", color = 'State',
                  trendline="lowess",trendline_options=dict(frac=0.1),
                  title = 'Miles Driven by Time - Oregon')
fig.show()

## Fitting Prophet Model

Now we'll fit prophet models.

In [50]:
def evaluate_prophet(df, states_of_interest, window = 1, split = 0.5):

    '''
    Trains prophet model using input dataframe for each of the specified factor levels
    and forecasts out a number of timesteps defined by the window input.
    '''
    total_observations = len(df[df['State'] == states_of_interest[0]])
    train_len = int(total_observations * split)
    models = {}
    for val in states_of_interest:

        # performing stepforward validation on each series
        predictions = []
        actuals = []
        for i in range(train_len, total_observations):
            # Generating series for 1 factor level and creating time
            series_of_interest = df[df['State'] == val].rename(columns= {'Date': 'ds', 'CMILES': 'y'})
            series_of_interest = series_of_interest.sort_values(by = ['ds'], ascending = True).reset_index()[['ds', 'y']]
            split_series = series_of_interest[:i]

            # Fitting model for series
            my_model = Prophet()
            my_model.fit(split_series)

            # Generating future dataset and makign predictions
            future_dates = my_model.make_future_dataframe(periods = 12)
            forecast = my_model.predict(future_dates)

            # Returning prediction
            yhat = forecast['yhat'][0]
            predictions.append(yhat)
            test = series_of_interest['y'].to_list()
            actuals.append(test[i])

            # Plotting results
            #my_model.plot(forecast, uncertainty=True)

        # saving model
        models[val] = [predictions, actuals]
    return models


def evaluate_neural_forecast(df, states_of_interest, window = 1, split = 0.5):

    '''
    Trains nn model using input dataframe for each of the specified factor levels
    and forecasts out a number of timesteps defined by the window input.
    '''
    total_observations = len(df[df['State'] == states_of_interest[0]])
    train_len = int(total_observations * split)
    models = {}
    for val in states_of_interest:

        # performing stepforward validation on each series
        predictions_lstm = []
        predictions_nhits = []
        actuals = []
        for i in range(train_len, total_observations):
            # Generating series for 1 factor level and creating time
            series_of_interest = df[df['State'] == val].rename(columns= {'Date': 'ds', 'CMILES': 'y'}).reset_index()[['ds', 'y']]
            series_of_interest['unique_id'] = val
            split_series = series_of_interest[:i]
            

            # Fitting model for series
            model = LSTM(h=window,                    # Forecast horizon
                        max_steps=100,                # Number of steps to train
                        scaler_type='standard',       # Type of scaler to normalize data
                        encoder_hidden_size=32,       # Defines the size of the hidden state of the LSTM
                        decoder_hidden_size=32,)
            nf = NeuralForecast(models=model, freq='M')
            nf.fit(df=split_series)

            # Making predictions
            Y_hat_df = nf.predict()

            # Returning prediction
            predictions_lstm.append(Y_hat_df['LSTM'][0])
            test = series_of_interest['y'].to_list()
            actuals.append(test[i])

            # Plotting results
            #my_model.plot(forecast, uncertainty=True)

        # saving model
        models[val] = [predictions_lstm, predictions_nhits, actuals]
    return models


def format_and_graph(input_dict, dates,  model_name):

    '''Takes input dictionary containing predicted versus actual values, 
    generates a timeseries graph and calculates MAPE'''


    colors = {'Washington':'royalblue',
             'Oregon': 'firebrick',
              'California': 'green' }
    fig = go.Figure()

    states = list(input_dict.keys())
    mapes = []
    for state in states:
        predicted = input_dict[state][0]
        actual = input_dict[state][1]
        mape_calc = mean_absolute_percentage_error(predicted, actual)
        mapes.append(mape_calc)

        fig.add_trace(go.Scatter(x=dates, y=actual,
                    mode='lines',
                    name='{state} actual'.format(state = state),
                    line = dict(color=colors[state])))
        
        fig.add_trace(go.Scatter(x=dates, y=predicted,
                    mode='lines',
                    name='{state} predicted, mape: {mape}'.format(state = state, mape = mape_calc),
                    line = dict(color=colors[state], dash = 'dash')))
        # Edit the layout
    fig.update_layout(title='Predicted Versus Actual, {model}, mape: {mape}'.format(model = model_name, mape = sum(mapes)/len(mapes)),
                   xaxis_title='Month',
                   yaxis_title='Temperature (degrees F)')
    fig.show()



    

In [51]:
input_dict_prophet = evaluate_prophet(df, states_of_interest,  1, 0.5)


18:51:14 - cmdstanpy - INFO - Chain [1] start processing
18:51:15 - cmdstanpy - INFO - Chain [1] done processing
18:51:15 - cmdstanpy - INFO - Chain [1] start processing
18:51:15 - cmdstanpy - INFO - Chain [1] done processing
18:51:15 - cmdstanpy - INFO - Chain [1] start processing
18:51:16 - cmdstanpy - INFO - Chain [1] done processing
18:51:16 - cmdstanpy - INFO - Chain [1] start processing
18:51:16 - cmdstanpy - INFO - Chain [1] done processing
18:51:16 - cmdstanpy - INFO - Chain [1] start processing
18:51:17 - cmdstanpy - INFO - Chain [1] done processing
18:51:17 - cmdstanpy - INFO - Chain [1] start processing
18:51:17 - cmdstanpy - INFO - Chain [1] done processing
18:51:17 - cmdstanpy - INFO - Chain [1] start processing
18:51:18 - cmdstanpy - INFO - Chain [1] done processing
18:51:18 - cmdstanpy - INFO - Chain [1] start processing
18:51:18 - cmdstanpy - INFO - Chain [1] done processing
18:51:18 - cmdstanpy - INFO - Chain [1] start processing
18:51:19 - cmdstanpy - INFO - Chain [1]

In [52]:
# Creating input variables
total_observations = len(df[df['State'] == states_of_interest[0]])
train_len = int(total_observations * 0.5)
dates = df[df['State'] == states_of_interest[0]]['Date'].to_list()
model_name = 'Prophet'

# Now evaluating prophet performance
format_and_graph(input_dict_prophet, dates,  model_name)

### Now Evaluating NN Forecast

In [53]:
evaluate_neural_forecast(df, states_of_interest,1, 0.5)

Seed set to 1
Seed set to 1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type          | Params | Mode 
----------------------------------------------------------
0 | loss            | MAE           | 0      | train
1 | padder          | ConstantPad1d | 0      | train
2 | scaler          | TemporalNorm  | 0      | train
3 | hist_encoder    | LSTM          | 12.9 K | train
4 | context_adapter | Linear        | 330    | train
5 | mlp_decoder     | MLP           | 385    | train
----------------------------------------------------------
13.6 K    Trainable params
0         Non-trainable params
13.6 K    Total params
0.055     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.

val_check_steps is greater than max_steps, setting val_check_steps to max_steps.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MAE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 2.4 M  | train
-------------------------------------------------------
2.4 M     Trainable params
0         Non-trainable params
2.4 M     Total params
9.488     Total estimated model params size (MB)
34        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=50` reached.

'M' is deprecated and will be removed in a future version, please use 'ME' instead.


'M' is deprecated and will be removed in a future version, please use 'ME' instead.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]




Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

Seed set to 1
Seed set to 1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type          | Params | Mode 
----------------------------------------------------------
0 | loss            | MAE           | 0      | train
1 | padder          | ConstantPad1d | 0      | train
2 | scaler          | TemporalNorm  | 0      | train
3 | hist_encoder    | LSTM          | 12.9 K | train
4 | context_adapter | Linear        | 330    | train
5 | mlp_decoder     | MLP           | 385  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.

val_check_steps is greater than max_steps, setting val_check_steps to max_steps.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MAE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 2.4 M  | train
-------------------------------------------------------
2.4 M     Trainable params
0         Non-trainable params
2.4 M     Total params
9.488     Total estimated model params size (MB)
34        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=50` reached.

'M' is deprecated and will be removed in a future version, please use 'ME' instead.


'M' is deprecated and will be removed in a future version, please use 'ME' instead.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]




Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

Seed set to 1
Seed set to 1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type          | Params | Mode 
----------------------------------------------------------
0 | loss            | MAE           | 0      | train
1 | padder          | ConstantPad1d | 0      | train
2 | scaler          | TemporalNorm  | 0      | train
3 | hist_encoder    | LSTM          | 12.9 K | train
4 | context_adapter | Linear        | 330    | train
5 | mlp_decoder     | MLP           | 385  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.

val_check_steps is greater than max_steps, setting val_check_steps to max_steps.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MAE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 2.4 M  | train
-------------------------------------------------------
2.4 M     Trainable params
0         Non-trainable params
2.4 M     Total params
9.488     Total estimated model params size (MB)
34        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=50` reached.

'M' is deprecated and will be removed in a future version, please use 'ME' instead.


'M' is deprecated and will be removed in a future version, please use 'ME' instead.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]




Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

Seed set to 1
Seed set to 1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type          | Params | Mode 
----------------------------------------------------------
0 | loss            | MAE           | 0      | train
1 | padder          | ConstantPad1d | 0      | train
2 | scaler          | TemporalNorm  | 0      | train
3 | hist_encoder    | LSTM          | 12.9 K | train
4 | context_adapter | Linear        | 330    | train
5 | mlp_decoder     | MLP           | 385  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.

val_check_steps is greater than max_steps, setting val_check_steps to max_steps.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MAE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 2.4 M  | train
-------------------------------------------------------
2.4 M     Trainable params
0         Non-trainable params
2.4 M     Total params
9.488     Total estimated model params size (MB)
34        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined