## Modelling

**Table of Contents:**

0. Packages and Data Loading
1. ARIMA Forecasting
2. Linear Regression
3. Random Forest

#### 0. Packages and Data Loading

In [2]:
pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [3]:
# Import packages
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pmdarima import auto_arima
import plotly.express as px
import plotly.figure_factory as ff
import os
from scipy import stats
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Display configuration: render up to 200 columns when showing a DataFrame
pd.options.display.max_columns = 200

# Mount Google Drive (Colab-only) so the project folder is reachable from the kernel
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Working directory of the project on the mounted drive, and its data sub-folder
PROJECT_PATH = '/content/gdrive/MyDrive/94879_OpAI/final-project'
DATA_PATH = os.path.join(PROJECT_PATH, 'data')

In [None]:
# Read the cleaned dataset from the data folder, then preview its first three rows
clean_csv_path = os.path.join(DATA_PATH, 'df_clean.csv')
df_clean = pd.read_csv(clean_csv_path)
df_clean.head(3)

In [None]:
# Columns to total within each (date, state) group; every one of them is summed
state_level_aggregations = {
    'deaths': 'sum',
    # People with at least one Dose by State of Residence
    'administered_dose1_recip': 'sum',
    # Total number of people who have completed a primary series (have second dose of a
    # two-dose vaccine or one dose of a single-dose vaccine) based on the jurisdiction
    # and county where vaccine recipient lives
    'series_complete_yes': 'sum',
    # population from the census (obtained from the CDC website)
    'census2019': 'sum',
    # population data from JHU
    'population': 'sum',
}

# One row per (date, recip_state); reset_index turns the group keys back into columns
rows_by_date_and_state = df_clean.groupby(['date', 'recip_state'])
df_aggregated_state = rows_by_date_and_state.agg(state_level_aggregations).reset_index()

In [None]:
# Parse the 'date' column into pandas datetimes so rows can be filtered by date
df_aggregated_state = df_aggregated_state.assign(
    date=pd.to_datetime(df_aggregated_state['date'])
)

In [None]:
# Earliest date present in the state-level aggregate
df_aggregated_state['date'].min()

In [None]:
# Latest date present in the state-level aggregate
df_aggregated_state['date'].max()

In [None]:
# Preview the first rows of the state-level aggregate
df_aggregated_state.head()

We will be making predictions for April, May, and June 2022. The training set is therefore all of the data dated before April 2022.

In [None]:
# Training set: every row dated strictly before 1 April 2022
is_before_april_2022 = df_aggregated_state['date'] < '2022-04-01'
training_df = df_aggregated_state[is_before_april_2022]

The test set is the data for the months we want to predict, i.e. April, May, and June 2022:

In [None]:
# Test set: rows of the state-level aggregate dated from 1 April 2022 to 30 June 2022,
# both endpoints included
in_forecast_window = df_aggregated_state['date'].between('2022-04-01', '2022-06-30')
test_df = df_aggregated_state[in_forecast_window]

We will use dictionaries keyed by state to hold the training data, the test data, the predictions, and the results:

In [None]:
# Empty per-state containers. training_data and test_data are filled with one frame
# per `recip_state` value; predicted_data and results are presumably meant for the
# forecasts and evaluation output (not populated in the cells shown here).
training_data = dict()
test_data = dict()
predicted_data = dict()
results = dict()

In [None]:
# Split the training and test frames by state.
# Each per-state frame is stored as an independent copy: boolean-mask selections of
# training_df / test_df would otherwise be modified later (sorting, re-indexing,
# column assignment), which raises SettingWithCopyWarning.
for state in df_aggregated_state['recip_state'].unique():
  training_data[state] = training_df[training_df['recip_state'] == state].copy()
  test_data[state] = test_df[test_df['recip_state'] == state].copy()
  # NOTE(review): this break stops after the first state, so only one state is loaded.
  # Later cells read the loop variable `state` left bound here; removing the break
  # would change which state they model.
  break

#### 1. ARIMA Forecasting

AutoRegressive Integrated Moving Average (ARIMA) is a widely used method for time series forecasting.

In [None]:
# NOTE(review): placeholder — presumably the ARIMA steps in the following cells are
# meant to be wrapped in this loop over all states; confirm or remove.
# for state in training_data.keys():


In [None]:
# Select the training frame for `state`. `state` is not set in this cell: it is the
# value left bound by the earlier `for state in ...` loop.
specific_training_df = training_data[state]

In [None]:
# Build the date-indexed deaths series for `state`.
# Derived from training_data[state] with non-inplace operations so that (a) the frame
# stored in training_data is not mutated (it keeps its 'date' column) and (b) this cell
# can be re-run: the in-place version fails on a second run because 'date' has already
# been moved into the index.
specific_training_df = training_data[state].sort_values('date').set_index('date')
specific_training_series = specific_training_df['deaths']

In [None]:
# Inspect the 'date' column of the test frame for `state`
test_data[state]['date']

In [None]:
# Ensure the test frame's 'date' column is datetime. The stored frame is replaced with
# a new one instead of assigning a column into it, which avoids SettingWithCopyWarning
# when the stored frame is a slice of another DataFrame.
test_data[state] = test_data[state].assign(
    date=pd.to_datetime(test_data[state]['date'])
)

# Plot the training time series (deaths indexed by date)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(specific_training_series, linestyle='-', color='b', label='Training Data')
ax.set_title('COVID Deaths Data')
ax.set_xlabel('Date')
ax.set_ylabel('Deaths')
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# Let auto_arima search for the best non-seasonal order on the training series
auto_model = auto_arima(specific_training_series, suppress_warnings=True, seasonal=False)
order = auto_model.order

# Refit a statsmodels ARIMA using the selected order
model = ARIMA(specific_training_series, order=order)
result = model.fit()

# Summary of the fitted model
print(result.summary())

# Forecast window: every day from 1 April to 30 June 2022 (30 + 31 + 30 = 91 days).
# The number of steps is derived from the index so the two can never disagree.
forecast_index = pd.date_range(start='2022-04-01', end='2022-06-30', freq='D')
forecast_steps = len(forecast_index)

# NOTE(review): get_forecast continues from the end of the training series; it lines up
# with forecast_index only if that series is daily and ends on 2022-03-31 — confirm.
forecast = result.get_forecast(steps=forecast_steps)
forecast_mean = forecast.predicted_mean


In [None]:
# Plot the training series, the ARIMA forecast, and the actual test data on one axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(specific_training_series, label='Training Data', linestyle='-', color='b')
# The forecast values are drawn against the explicit daily forecast_index
ax.plot(forecast_index, forecast_mean, color='red', label='Forecast')
# Actual observations for the forecast window (dashed, same colour as the training data)
ax.plot(test_data[state]['date'], test_data[state]['deaths'], linestyle='--', color='b', label='Actual Test Data')
ax.set_title('AutoARIMA Forecast and Actual Test Data for April, May, and June 2022')
# Axis labels added for consistency with the training-data plot
ax.set_xlabel('Date')
ax.set_ylabel('Deaths')
ax.legend()
plt.show()