<a href="https://colab.research.google.com/github/lvscious/Com-Aided-Case-Study/blob/main/Draftt_ARIMA_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install packages for time series analysis and plotting.
# Only packages that are not already available are installed, so re-running
# this cell does not re-download and rebuild everything each time.
required_packages <- c("forecast", "tseries", "ggplot2")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}


In [None]:

# Load libraries
library(forecast)
library(tseries)
library(ggplot2)

# Load and clean data
data <- read.csv("/content/full_cleaned_dataset (1).csv")
# GDP is parsed from text: commas (presumably thousands separators) are
# stripped first so as.numeric() does not turn the values into NA.
data$GDP <- as.numeric(gsub(",", "", data$GDP))
# NOTE(review): na.omit() drops a row when ANY column is NA. The ts() call
# below does not read dates from the file; it assumes the remaining rows are
# consecutive quarters starting in 2000 Q1. Confirm no interior rows are lost.
data <- na.omit(data)
if (nrow(data) == 0 || any(data$GDP <= 0)) {
  stop("GDP must contain strictly positive values for the log transform.",
       call. = FALSE)
}
gdp_ts <- ts(data$GDP, start = c(2000, 1), frequency = 4)

# Step 1: Stationarity and Transformation (no outlier adjustment)
# Model quarterly log changes: log stabilises the variance, the first
# difference removes the trend.
log_gdp_ts <- log(gdp_ts)
diff_log_gdp_ts <- diff(log_gdp_ts)
adf_test <- adf.test(diff_log_gdp_ts)
cat("ADF p-value:", adf_test$p.value, "\n")
if (adf_test$p.value > 0.05) {
  cat("Series may need more differencing.\n")
  # Do not difference diff_log_gdp_ts again here. Every later back-transform
  # rebuilds log levels with cumsum() and is only valid for a plain first
  # difference. auto.arima() selects any further regular/seasonal differencing
  # internally and still forecasts on the scale of the series it was given.
}

# Step 2: Train-Test Split
# Split by observation position rather than by year/quarter arithmetic. The
# differenced series does not start in Q1, so counting quarters from Q1 of the
# first year put the boundary one observation early (train_size - 1 training
# points).
train_prop <- 0.8
n <- length(diff_log_gdp_ts)
train_size <- floor(train_prop * n)
if (train_size < 1 || train_size >= n) {
  stop("Not enough observations for the requested train/test split.",
       call. = FALSE)
}
obs_times <- time(diff_log_gdp_ts)
last_train_time <- obs_times[train_size]
train_ts <- window(diff_log_gdp_ts, end = last_train_time)
test_ts <- window(diff_log_gdp_ts, start = obs_times[train_size + 1])

# Step 3: Model Selection (fit once) - Only ARIMA
arima_model <- auto.arima(train_ts, seasonal = TRUE)

# Step 4: Diagnostics
# checkresiduals() prints a Ljung-Box test and plots the residuals and their
# ACF; the explicit Box.test() below reports a p-value at a fixed lag of 10.
checkresiduals(arima_model)
lb_test <- Box.test(residuals(arima_model), lag = 10, type = "Ljung-Box")
cat("Ljung-Box p-value:", lb_test$p.value, "\n")

# Step 5: Forecasting and Evaluation
arima_forecast <- forecast(arima_model, h = length(test_ts))

# Back-transform to original scale for test set metrics.
# A differenced value stamped at time t is log level(t) - log level(t - 1), so
# the log level at the time of the last training observation is the base that
# the test-period changes are accumulated onto. It is looked up by time so it
# always agrees with the split above.
last_log_gdp_before_test <- as.numeric(
  tail(window(log_gdp_ts, end = last_train_time), 1)
)
test_set_start_time <- start(test_ts)

# cumsum() turns log changes back into log levels; [-1] drops the base value.
test_actual_log_levels <- cumsum(c(last_log_gdp_before_test, test_ts))[-1]
arima_forecast_log_levels <- cumsum(c(last_log_gdp_before_test, arima_forecast$mean))[-1]

test_actual_original <- ts(exp(test_actual_log_levels), start = test_set_start_time, frequency = frequency(log_gdp_ts))
test_forecast_arima <- ts(exp(arima_forecast_log_levels), start = test_set_start_time, frequency = frequency(log_gdp_ts))

# Forecast accuracy metrics.
#
# actual:    observed values.
# predicted: forecasts to compare against `actual`.
#
# Returns a named numeric vector with RMSE, MAE and MAPE (MAPE in percent).
# Missing values in the element-wise errors are ignored.
calc_metrics <- function(actual, predicted) {
  err <- actual - predicted
  c(
    RMSE = sqrt(mean(err^2, na.rm = TRUE)),
    MAE = mean(abs(err), na.rm = TRUE),
    MAPE = mean(100 * abs(err / actual), na.rm = TRUE)
  )
}

# Test-set accuracy on the original GDP scale.
arima_metrics <- calc_metrics(test_actual_original, test_forecast_arima)
# cat() drops vector names, so label each value explicitly; otherwise the
# output is three bare numbers with no indication of which metric is which.
cat(
  "ARIMA Metrics:",
  paste(names(arima_metrics), signif(arima_metrics, 6), sep = " = ", collapse = ", "),
  "\n"
)

# Step 6: Final Forecasts - Only ARIMA
best_model <- "ARIMA" # Force ARIMA
cat("Best Model:", best_model, "\n")

# Refit on the full differenced-log series and forecast 8 quarters ahead.
forecast_horizon <- 8
full_model <- auto.arima(diff_log_gdp_ts, seasonal = TRUE)
future_forecast <- forecast(full_model, h = forecast_horizon)

# Back-transform future forecasts to original scale as ts objects
last_log_gdp_full <- log_gdp_ts[length(log_gdp_ts)]

# The forecast starts in the period right after the last GDP observation;
# roll over to period 1 of the next year when the series ends in the last
# period of a year.
last_year_gdp <- end(gdp_ts)[1]
last_quarter_gdp <- end(gdp_ts)[2]

if (last_quarter_gdp == frequency(gdp_ts)) {
  forecast_start_year <- last_year_gdp + 1
  forecast_start_quarter <- 1
} else {
  forecast_start_year <- last_year_gdp
  forecast_start_quarter <- last_quarter_gdp + 1
}
future_start_c <- c(forecast_start_year, forecast_start_quarter)

# Point forecast of the log level: last observed log level plus the
# accumulated forecast changes ([-1] drops the base value).
future_mean_log_levels <- cumsum(c(last_log_gdp_full, future_forecast$mean))[-1]

# Interval for the log level. Accumulating the per-step lower/upper bounds of
# the changes is not a 95% interval for the accumulated level: the bounds of a
# sum are not the sum of the bounds. Instead, simulate future paths of the
# changes from the fitted model, accumulate each path, and take the 2.5% and
# 97.5% quantiles per horizon. A fixed seed per path keeps the table
# reproducible between runs.
n_sim_paths <- 1000
sim_log_changes <- vapply(
  seq_len(n_sim_paths),
  function(i) as.numeric(simulate(full_model, nsim = forecast_horizon, seed = i)),
  numeric(forecast_horizon)
)
# Rows are horizons, columns are simulated paths.
sim_log_levels <- last_log_gdp_full + apply(sim_log_changes, 2, cumsum)
level_bounds <- apply(
  sim_log_levels, 1, quantile,
  probs = c(0.025, 0.975), names = FALSE
)
future_lower_log_levels <- level_bounds[1, ]
future_upper_log_levels <- level_bounds[2, ]

# Convert to original scale and create ts objects
future_original <- ts(exp(future_mean_log_levels), start = future_start_c, frequency = frequency(gdp_ts))
future_lower <- ts(exp(future_lower_log_levels), start = future_start_c, frequency = frequency(gdp_ts))
future_upper <- ts(exp(future_upper_log_levels), start = future_start_c, frequency = frequency(gdp_ts))

# Create table (no output)
forecast_table <- data.frame(Point_Forecast = as.numeric(future_original),
                             Lower_95 = as.numeric(future_lower),
                             Upper_95 = as.numeric(future_upper))

# Step 7: Plots
# Put the test-set forecast mean on the same time axis as test_ts so both
# layers line up in the plot.
arima_forecast_mean_ts <- ts(arima_forecast$mean, start = start(test_ts), frequency = frequency(test_ts))

autoplot(test_ts) +
  autolayer(arima_forecast_mean_ts, series = "ARIMA") +
  ggtitle("Test Set Forecast (Differenced Scale)") +
  theme_minimal()

# Observed series on the original scale. The data itself is plotted; rebuilding
# it from the differenced logs only reproduces gdp_ts with rounding error.
full_ts_original <- gdp_ts

# One-step-ahead fitted values on the original scale.
# The model's fitted values are predicted log changes. The fitted level for a
# quarter is the previous quarter's ACTUAL log level plus that predicted
# change. Accumulating the fitted changes with cumsum() would instead carry
# every past residual forward, so the "Fitted" line would drift away from the
# data. stats::lag(x, -1) shifts the series so that time t holds the value
# from t - 1; ts arithmetic then aligns both series on their common time span.
fitted_diff <- fitted(full_model)
previous_log_levels <- stats::lag(log(gdp_ts), -1)
fitted_log_levels_values <- previous_log_levels + fitted_diff
fitted_original <- exp(fitted_log_levels_values)

autoplot(full_ts_original) +
  autolayer(fitted_original, series = "Fitted") +
  autolayer(future_original, series = "Forecast") +
  autolayer(future_lower, series = "Lower 95%", linetype = "dashed") +
  autolayer(future_upper, series = "Upper 95%", linetype = "dashed") +
  ggtitle("GDP Forecast (Original Scale)") +
  xlab("Year") +
  ylab("GDP (Billions PHP)") +
  theme_minimal()