# ARIMA Model - Training (1997-2020), Testing (2021-2023)

# Packages & Data Loading

## Packages

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from math import sqrt

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import gdown

In [None]:
from pandas.core.nanops import nanmean as pd_nanmean
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa import stattools
from sklearn.linear_model import LinearRegression

In [None]:
# Show graphs
%matplotlib inline

## Data Loading

In [None]:
# Google Drive ID of the cleaned dataset
file_id = '1RpF3UtvdnvwGilvP5KjQ2tA-hHDNKsnc'

# Build the direct-download URL for that file ID
download_url = f'https://drive.google.com/uc?export=download&id={file_id}'

# Download the file into the working directory (quiet=False shows progress)
output_path = 'cleaned_data.csv'
gdown.download(download_url, output_path, quiet=False)

# Load the CSV, passing every 'year' and 'month' value through np.int32
df = pd.read_csv(output_path, converters={'year': np.int32, 'month': np.int32})

# Report the frame's dimensions and per-column dtypes
print('Shape of the dataset:', df.shape)
print(df.dtypes)

In [None]:
# Keep only the date fields and the unemployment series. The .copy() makes
# df_var an independent frame, so later edits to it do not touch df.
selected_columns = ['ref_date', 'month', 'year', 'value_unemployment']
df_var = df.loc[:, selected_columns].copy()

print(df_var.head())

In [None]:
# Collapse df_var to one row per 'ref_date':
#   - 'year' and 'month' take the first value seen for that date
#   - 'median_unemployment' is the median of 'value_unemployment' for that date
aggregations = {
    'year': ('year', 'first'),
    'month': ('month', 'first'),
    'median_unemployment': ('value_unemployment', 'median'),
}
df_arima = df_var.groupby('ref_date').agg(**aggregations).reset_index()

print(df_arima)


In [None]:
# Inspect the aggregated frame before re-indexing it
print('Shape of the dataset:', df_arima.shape)
print(df_arima.dtypes)

# Rebuild 'ref_date' as a real datetime from the year and month columns.
# The '%Y-%m' format carries no day, so every row lands on the first of its month.
year_text = df_arima['year'].astype(str)
month_text = df_arima['month'].astype(str)
df_arima['ref_date'] = pd.to_datetime(year_text + '-' + month_text, format='%Y-%m')

# Use the datetime column as the index so the frame can be sliced by date
df_arima.set_index('ref_date', inplace=True)

# Distinct years covered by the data (displayed as the cell output)
df_arima['year'].unique()

In [None]:
# Print the shape and per-column dtypes of df_arima again, to compare with the
# values printed before 'ref_date' was converted and moved into the index
print('Shape of the dataset:', df_arima.shape)
print(df_arima.dtypes)

# Stationarity

In [None]:
# Plot the dataset as a time series
def ts_plot(X, xlabel='Time', ylabel='Observed Quantity', title='Time Series', height=8, width=15):
    """Draw one or more time series on a single set of axes and show the figure.

    Parameters
    ----------
    X : iterable
        The series to draw. Each element must provide a pandas-style
        ``.plot(ax=..., linestyle=..., marker=..., color=...)`` method.
    xlabel, ylabel, title : str
        Axis labels and figure title.
    height, width : float
        Figure size in inches (passed to matplotlib as ``(width, height)``).

    Notes
    -----
    Marker and colour styles are cycled, so any number of series can be drawn;
    indexing the style lists directly would raise IndexError from the fifth
    series onwards. A legend is added only when more than one series is drawn.
    """
    markers = ['.', '*', '.', 'o', '^']
    colors = ['b', 'r', 'g', 'y']

    # Materialise once so generators work and the series can be counted below.
    series_list = list(X)

    fig, ax = plt.subplots(figsize=(width, height))
    for i, xi in enumerate(series_list):
        xi.plot(
            ax=ax,
            linestyle='-',
            marker=markers[i % len(markers)],
            color=colors[i % len(colors)],
        )

    # Set title and labels for axes
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)
    if len(series_list) > 1:  # more than one series
        ax.legend()
    plt.show()

# Plot the whole aggregated median unemployment series as a single line
ts_plot([df_arima.median_unemployment], xlabel='Year-Month', ylabel='Median Unemployment Rate', title='Median Unemployment Rate (1997-2023)' )

## ACF & PACF

Autocorrelation Function (ACF) & Partial Autocorrelation Function (PACF)

In [None]:
# Autocorrelations
def acf_pacf_plotter(Xt, nlag=30, fig_size=(13, 5), alpha=0.05, title='Median Unemployment Rate'):
    """Show the ACF and PACF of a series side by side.

    Parameters
    ----------
    Xt : pandas.Series
        Series to analyse; missing values are dropped before plotting.
    nlag : int
        Number of lags passed to both plots.
    fig_size : tuple
        Figure size in inches as ``(width, height)``.
    alpha : float
        Significance level passed to both plots for the confidence bands.
    title : str
        Overall figure title. Defaults to the title this helper previously
        hard-coded, so existing calls render exactly as before.
    """
    # Drop missing values once and reuse the result for both plots.
    observed = Xt.dropna()

    # One row, two columns: ACF on the left, PACF on the right.
    _, (ax_acf, ax_pacf) = plt.subplots(1, 2, figsize=fig_size)
    plot_acf(observed, alpha=alpha, lags=nlag, ax=ax_acf)
    plot_pacf(observed, alpha=alpha, lags=nlag, ax=ax_pacf)

    # Add an overall title for the figure
    plt.suptitle(title, fontsize=16)

    plt.tight_layout()
    plt.show()  # Ensures the plots are displayed

# ACF/PACF of the median unemployment series, limited to the first 15 lags
acf_pacf_plotter(df_arima.median_unemployment, nlag=15)

### Value Difference - Median Unemployment

In [None]:
# Plot ACF and PACF of the first-differenced series to determine p and q.
# Use a 'value_diff' column if one exists; otherwise difference the median
# series here. Falling back to the raw series would show undifferenced data
# under a "Value Difference" title. The result stays in a local variable so
# df_arima keeps its original columns for the cells that follow.
if 'value_diff' in df_arima:
    diff_series = df_arima['value_diff'].dropna()
else:
    # The first value of a differenced series is always NaN, hence dropna()
    diff_series = df_arima['median_unemployment'].diff().dropna()

plt.figure(figsize=(13, 5))

# Add an overall title for the figure
plt.suptitle('Value Difference - Median Unemployment Rate', fontsize=16)

plt.subplot(121)
plot_acf(diff_series, ax=plt.gca(), lags=30)
plt.title("ACF Plot")
plt.subplot(122)
plot_pacf(diff_series, ax=plt.gca(), lags=15)
plt.title("PACF Plot")
plt.show()

## ADF - Unemployment 

In [None]:
# Augmented Dickey-Fuller test on the median unemployment series.
# The first two entries of the result are the test statistic and its p-value;
# a p-value below 0.05 is reported here as "stationary".
adf_test = adfuller(df_arima['median_unemployment'])
adf_statistic, adf_p_value = adf_test[0], adf_test[1]

print("\nADF Test Results:")
print(f"ADF Statistic: {adf_statistic}")
print(f"p-value: {adf_p_value}")

is_stationary = adf_p_value < 0.05
print(
    "The data is stationary."
    if is_stationary
    else "The data is not stationary. Differencing may be required."
)

# Data Preparation

## Index

## Assigning Training & Testing Sets

In [None]:
# Chronological split on the datetime index (label slices include both ends):
# January 1997 - December 2020 for training, January 2021 - December 2023 for testing
train = df_arima.loc['1997-01-01':'2020-12-01']
test = df_arima.loc['2021-01-01':'2023-12-01']

# Show the first and last training rows
for preview in (train.head(), train.tail()):
    print(preview)

# Visualize the Training Data

In [None]:
# Resample the training frame to month-end and year-end means. Both frames keep
# every column of `train`, so only 'median_unemployment' is plotted below.
# Plotting the whole frame would also draw 'year' and 'month' as lines on an
# axis labelled as an unemployment rate and flatten the real series.
df_monthly = train.resample('ME').mean()
df_annual = train.resample('YE').mean()

# (printed heading, series to draw, chart title, x-axis label) for each chart
training_charts = [
    (
        '\nMedian Monthly Unemployment Rate (Training Set)',
        df_monthly['median_unemployment'],
        'Monthly Median  Unemployment Rate Time Series (Training Set)',
        'Months (January 1997 - December 2020)',
    ),
    (
        '\n\nMedian Annual Unemployment Rate (Training Set)',
        df_annual['median_unemployment'],
        'Yearly Median  Unemployment Rate Time Series (Training Set)',
        'Year (1997-2020)',
    ),
]

# Plot the data
for heading, rate, chart_title, x_label in training_charts:
    print(heading)
    plt.figure(figsize=(12, 6))
    plt.plot(rate, color='blue', linewidth=1)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Median Unemployment Rate')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Apply Transformation (log(x + 1) to stabilize variance).
# Only the unemployment column is transformed: `train` also holds 'year' and
# 'month', and the ACF/PACF plots below need a single 1-D series.
median_unemployment_transformed = np.log(train['median_unemployment'] + 1)

# Drop missing values once and reuse the result for both plots
transformed_observed = median_unemployment_transformed.dropna()

# Re-check ACF/PACF on Transformed Data
plt.figure(figsize=(12, 5))
# Add an overall title for the figure
plt.suptitle('Median Unemployment Rate (Transformed)', fontsize=16)
plt.subplot(1, 2, 1)
plot_acf(transformed_observed, lags=40, ax=plt.gca())
plt.title("ACF Plot (Transformed)")
plt.subplot(1, 2, 2)
plot_pacf(transformed_observed, lags=40, ax=plt.gca())
plt.title("PACF Plot (Transformed)")
plt.tight_layout()
plt.show()

## Cross Validation

In [None]:
# Define the cross-validation procedure: a 5-split TimeSeriesSplit, whose
# validation indices always come after the training indices of the same split
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# Perform cross-validation of an ARIMA(1, 1, 1) on the training period.
# The split is named `train` (there is no `train_data`), and ARIMA needs the
# single target series, not the whole frame with 'year' and 'month'.
cv_series = train['median_unemployment']

cv_mse_scores = []
for train_index, val_index in tscv.split(cv_series):
    train_cv, val_cv = cv_series.iloc[train_index], cv_series.iloc[val_index]

    # Fit the ARIMA model on this split's training window
    model = ARIMA(train_cv, order=(1, 1, 1))
    model_fit = model.fit()

    # Forecast as many steps as the validation window holds
    forecast = model_fit.forecast(steps=len(val_cv))

    # Calculate Mean Squared Error for this split
    mse = mean_squared_error(val_cv, forecast)
    cv_mse_scores.append(mse)

print('Cross-Validation Mean Squared Error:', np.mean(cv_mse_scores))

In [None]:
# Index the raw frame `df` (not the aggregated `df_arima`) by 'ref_date'.
# NOTE(review): set_index removes 'ref_date' from the columns, so running this
# cell a second time raises KeyError on the first line.
df['ref_date'] = pd.to_datetime(df['ref_date'])  # Convert date column to datetime
df.set_index('ref_date', inplace=True)       # Set date as the index