In [94]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from pygam import LinearGAM, s
import pickle

In [95]:
# Load the training data and index it by its Timestamp column.
# NOTE(review): the path is an absolute, machine-specific location.
train_path = r"C:\Python\GitHub\BusinessForecasting\Assignments\econ8310-assignment1\assignment_data_train.csv"
df_train = pd.read_csv(train_path, parse_dates=['Timestamp'])
# Coerce Timestamp to datetime explicitly, then promote it to the index.
df_train = (
    df_train
    .assign(Timestamp=pd.to_datetime(df_train['Timestamp']))
    .set_index('Timestamp')
)

In [111]:
# Re-express the training frame on an hourly ('h') frequency, then preview it.
df_train = df_train.asfreq(freq='h')
df_train.head(n=5)

Unnamed: 0_level_0,year,month,day,hour,trips
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:00:00,2018,1,1,0,16714
2018-01-01 01:00:00,2018,1,1,1,19041
2018-01-01 02:00:00,2018,1,1,2,16590
2018-01-01 03:00:00,2018,1,1,3,12626
2018-01-01 04:00:00,2018,1,1,4,8739


In [112]:
# Order the training rows by their Timestamp index, then preview the result.
df_train = df_train.sort_index()
df_train.head(n=5)

Unnamed: 0_level_0,year,month,day,hour,trips
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:00:00,2018,1,1,0,16714
2018-01-01 01:00:00,2018,1,1,1,19041
2018-01-01 02:00:00,2018,1,1,2,16590
2018-01-01 03:00:00,2018,1,1,3,12626
2018-01-01 04:00:00,2018,1,1,4,8739


In [98]:
# Inputs for the GAM: calendar columns as features, 'trips' as the target.
# Both are pulled out as plain NumPy arrays; the feature columns keep the
# order listed in exog_vars (year, month, day, hour).
exog_vars = ['year', 'month', 'day', 'hour']
X_train = df_train.loc[:, exog_vars].to_numpy()
y_train = df_train.loc[:, 'trips'].to_numpy()

In [99]:
# Additive GAM with one smoothing spline per exogenous variable.
# Term index -> column of X_train:
#   s(0) year | s(1) month | s(2) day | s(3) hour
spline_terms = s(0) + s(1) + s(2) + s(3)
model = LinearGAM(spline_terms)
# The result of fitting on the training arrays is what later cells use.
modelFit = model.fit(X_train, y_train)

In [100]:
# Serialize the fitted model to model.pkl (relative to the working directory).
with open("model.pkl", "wb") as model_file:
    pickle.dump(modelFit, model_file)

In [101]:
# In-sample predictions: run the fitted model over the training features.
y_pred_train = modelFit.predict(X=X_train)

In [102]:
# 95% confidence intervals for the training predictions.
# Column 0 is taken as the lower bound and column 1 as the upper bound.
ci_train = modelFit.confidence_intervals(X=X_train, width=0.95)
lower_ci_train = ci_train[:, 0]
upper_ci_train = ci_train[:, 1]

In [103]:
# Load the test data and index it by its Timestamp column.
# NOTE(review): the path is an absolute, machine-specific location.
test_path = r"C:\Python\GitHub\BusinessForecasting\Assignments\econ8310-assignment1\assignment_data_test.csv"
df_test = pd.read_csv(test_path, parse_dates=['Timestamp'])
# Coerce Timestamp to datetime explicitly, then promote it to the index.
df_test = (
    df_test
    .assign(Timestamp=pd.to_datetime(df_test['Timestamp']))
    .set_index('Timestamp')
)

In [104]:
# Give the test frame the same hourly ('h') frequency used for the training frame.
df_test = df_test.asfreq(freq='h')

In [105]:
# Test feature matrix: the exog_vars columns of the test frame as a NumPy array.
X_test = df_test.loc[:, exog_vars].to_numpy()

In [106]:
# Out-of-sample predictions: run the fitted model over the test features.
y_pred_test = modelFit.predict(X=X_test)

In [107]:
# 95% confidence intervals for the test predictions.
# Column 0 is taken as the lower bound and column 1 as the upper bound.
ci_test = modelFit.confidence_intervals(X=X_test, width=0.95)
lower_ci_test = ci_test[:, 0]
upper_ci_test = ci_test[:, 1]

In [110]:
# Plot results using Plotly: training actuals, the in-sample GAM fit, and the
# test-period predictions on one time axis.
#
# The confidence-interval traces used to be disabled by wrapping them in bare
# triple-quoted strings. They are now real code behind an explicit switch.
# The default (False) draws exactly the same three traces as before; set it to
# True to overlay the lower/upper interval bounds for each split.
SHOW_CONFIDENCE_INTERVALS = False

fig = go.Figure()

# Scatter plot for actual data (train)
fig.add_trace(go.Scatter(
    x=df_train.index, y=y_train, mode='markers',
    name='Actual (Train)', marker=dict(color='blue')
))

# Line plot for predictions (train)
fig.add_trace(go.Scatter(
    x=df_train.index, y=y_pred_train, mode='lines',
    name='GAM Fit (Train)', line=dict(color='red')
))

# Dotted gray lines for the confidence-interval bounds (train)
if SHOW_CONFIDENCE_INTERVALS:
    for bound_label, bound_values in (
        ('Lower CI (Train)', lower_ci_train),
        ('Upper CI (Train)', upper_ci_train),
    ):
        fig.add_trace(go.Scatter(
            x=df_train.index, y=bound_values, mode='lines',
            name=bound_label, line=dict(dash='dot', color='gray')
        ))

# Line plot for predictions (test)
fig.add_trace(go.Scatter(
    x=df_test.index, y=y_pred_test, mode='lines',
    name='Predicted (Test)', line=dict(color='green')
))

# Dotted black lines for the confidence-interval bounds (test)
if SHOW_CONFIDENCE_INTERVALS:
    for bound_label, bound_values in (
        ('Lower CI (Test)', lower_ci_test),
        ('Upper CI (Test)', upper_ci_test),
    ):
        fig.add_trace(go.Scatter(
            x=df_test.index, y=bound_values, mode='lines',
            name=bound_label, line=dict(dash='dot', color='black')
        ))

# Update layout
fig.update_layout(
    title="GAM Model Forecast for Taxi Trips",
    xaxis_title="Time",
    yaxis_title="Number of Trips",
    template="plotly_white"
)

# Show plot
fig.show()

In [None]:
# Attach the test predictions to the test frame as 'predicted_trips', then
# write only that column to predictions_gam.csv.
df_test = df_test.assign(predicted_trips=y_pred_test)
df_test.loc[:, ['predicted_trips']].to_csv("predictions_gam.csv")