In [None]:
# Install and load necessary libraries
install.packages(c("keras", "tidyverse", "haven", "quantmod", "timetk"))
library(keras)
library(tidyverse)
library(haven)
library(quantmod)
library(timetk)

Installing packages into ‘/home/codespace/R/x86_64-pc-linux-gnu-library/3.6’
(as ‘lib’ is unspecified)

“dependencies ‘cpp11’, ‘slider’, ‘gtable’ are not available”
also installing the dependencies ‘lava’, ‘prodlim’, ‘tfautograph’, ‘gargle’, ‘ids’, ‘timechange’, ‘systemfonts’, ‘textshaping’, ‘vroom’, ‘tzdb’, ‘rmarkdown’, ‘clock’, ‘ipred’, ‘htmlwidgets’, ‘tseries’, ‘tensorflow’, ‘tfruns’, ‘broom’, ‘dbplyr’, ‘dplyr’, ‘dtplyr’, ‘forcats’, ‘ggplot2’, ‘googledrive’, ‘googlesheets4’, ‘httr’, ‘lubridate’, ‘modelr’, ‘ragg’, ‘readr’, ‘readxl’, ‘reprex’, ‘rvest’, ‘tidyr’, ‘TTR’, ‘recipes’, ‘rsample’, ‘plotly’, ‘padr’, ‘forecast’, ‘tsfeatures’





## Preprocess the Data

In [None]:
# Read the Stata file and derive calendar fields from the `time` string:
# characters 1-4 are taken as the year and characters 5-6 as the quarter
# label (e.g. "Q1").
data <- read_dta("50_LSTM_Model/data2.dta") %>%
  mutate(year = as.numeric(substr(time, 1, 4))) %>%
  mutate(q_str = substr(time, 5, 6)) %>%
  mutate(quarter = as.numeric(gsub("Q", "", q_str))) %>%
  # First month of the quarter: 1 -> 1, 2 -> 4, 3 -> 7, 4 -> 10
  mutate(date = as.Date(paste(year, (quarter - 1) * 3 + 1, "01", sep = "-")))

### Add GDP/capita from FRED (Federal Reserve Economic Data)

In [None]:
# Download GDP and population from FRED. getSymbols() assigns each series
# into the workspace under its symbol name (`GDP`, `POP`).
getSymbols("GDP", src = "FRED")  # Quarterly GDP (in billions)
getSymbols("POP", src = "FRED")  # Total U.S. population

# Ratio of the two series, dropping dates where the ratio is missing.
# NOTE(review): the result is in GDP units per POP unit -- confirm the units
# of both FRED series before reading it as dollars per person.
GDP_per_capita <- na.omit(GDP / POP)

# Convert the xts object to a tibble. coredata() returns a one-column matrix
# that still carries the source column name, and data.frame() would use that
# name in place of the argument name. as.numeric() strips it so the column is
# really called `GDP_per_capita`.
GDP_per_capita_df <- tibble(
  date = as.Date(index(GDP_per_capita)),
  GDP_per_capita = as.numeric(coredata(GDP_per_capita))
)

# Merge GDP per capita into the dataset (rows without a matching date get NA)
data <- data %>%
  left_join(GDP_per_capita_df, by = "date")

## Clean Data

In [None]:


# Drop every column with more than 20 missing values, then drop every row
# that still has a missing value in the remaining columns.
na_counts <- colSums(is.na(data))
columns_to_remove <- names(na_counts[na_counts > 20])
data <- data %>% select(-all_of(columns_to_remove))

data_nonNA <- na.omit(data)

# Remove the raw period label and the derived date column
data_nonNA <- data_nonNA %>% select(-time, -date)

# Flatten `deposits` if it arrived as a list column. is.list() is used instead
# of class(x) == "list" because class() can return more than one value, which
# turns the comparison into a length > 1 condition inside if().
if (is.list(data_nonNA$deposits)) {
  data_nonNA$deposits <- as.numeric(unlist(data_nonNA$deposits))
}

## Create Lagged Features

In [None]:
# Create lagged copies of `deposits` (deposits_lag1 ... deposits_lag10).
# Lags follow the current row order.
# NOTE(review): assumes data_nonNA is sorted chronologically -- confirm.
lagged_data <- tk_augment_lags(data_nonNA, .value = deposits, .lags = 1:10)

# Remove rows with NAs (due to lags)
lagged_data <- na.omit(lagged_data)

# Names of the lag columns used as model inputs
lagged_cols <- paste0("deposits_lag", 1:10)

# Coerce every lag column to a plain numeric vector. unlist() flattens list
# columns and returns atomic vectors unchanged, so one code path covers both
# cases and no class(x) == "list" test (a length > 1 condition for
# multi-class columns) is needed.
lagged_data[lagged_cols] <- lapply(
  lagged_data[lagged_cols],
  function(lag_column) as.numeric(unlist(lag_column))
)

# Rebuild the quarter-start date from the year and quarter columns
lagged_data <- lagged_data %>%
  mutate(date = as.Date(paste(year, (quarter - 1) * 3 + 1, "01", sep = "-")))

# First 80% of the rows for training, the remainder for testing
train_size <- floor(0.8 * nrow(lagged_data))
train <- lagged_data[1:train_size, ]
test <- lagged_data[(train_size + 1):nrow(lagged_data), ]

# Inputs are the lag columns; the target is the contemporaneous deposits value
x_train <- train[, lagged_cols]
x_test <- test[, lagged_cols]
y_train <- train$deposits
y_test <- test$deposits

In [None]:
# Print the structure of the training split, then of the test split
invisible(lapply(list(train, test), str))

In [None]:
# Quick sanity checks on the training inputs and target
summary(x_train)
summary(y_train)

dim(x_train)
length(y_train)

# One logical per row: TRUE when the row contains at least one missing value.
# complete.cases() works on the data frame directly, whereas apply() would
# first coerce it to a matrix.
!complete.cases(x_train)

## Normalize Data

In [None]:
# Convert the lag columns to matrices (one row per sample, one column per lag)
x_train_matrix <- as.matrix(x_train)
x_test_matrix <- as.matrix(x_test)

# Targets as plain numeric vectors
y_train <- as.numeric(train$deposits)
y_test <- as.numeric(test$deposits)

# Standardise each training column by its own mean and sd, and keep those
# statistics so other data can be put through the same transform
x_train_scaled <- scale(x_train_matrix)
x_mean <- attr(x_train_scaled, "scaled:center")
x_sd <- attr(x_train_scaled, "scaled:scale")

# Scale test data using training set statistics
x_test_scaled <- scale(x_test_matrix, center = x_mean, scale = x_sd)

# Clipping is left disabled; enable only if absolutely necessary
# x_train_scaled <- pmin(pmax(x_train_scaled, -3), 3)
# x_test_scaled <- pmin(pmax(x_test_scaled, -3), 3)

# Standardise the target using training-set statistics only
y_mean <- mean(y_train)
y_sd <- sd(y_train)
y_train_normalized <- (y_train - y_mean) / y_sd
y_test_normalized <- (y_test - y_mean) / y_sd

# Confirm normalization by summarising the scaled values. Summarising the
# unscaled x_train / y_train here would not show whether scaling worked.
summary(as.vector(x_train_scaled))
summary(as.vector(x_test_scaled))
summary(y_train_normalized)
summary(y_test_normalized)

# Reshape input data to 3D arrays for LSTM [samples, time steps, features]:
# each lag column becomes one time step carrying a single feature
x_train_array <- array(
  x_train_scaled,
  dim = c(nrow(x_train_scaled), ncol(x_train_scaled), 1)
)
x_test_array <- array(
  x_test_scaled,
  dim = c(nrow(x_test_scaled), ncol(x_test_scaled), 1)
)

Benefits of Normalization:
Reduces the MSE to a manageable scale, allowing for better interpretation.
Improves model convergence by ensuring that inputs and outputs are within similar ranges.

## Define and Train LSTM Model

In [None]:
# Stacked LSTM regressor: two LSTM layers with dropout, one ReLU dense layer
# and a single linear output unit. (The R Markdown code fence that used to
# open this cell was removed: it is not valid R and was never closed.)
model <- keras_model_sequential() %>%
  # First LSTM layer: one time step per lag column, one feature per step.
  # return_sequences = TRUE passes the whole sequence on to the next LSTM.
  layer_lstm(units = 64, input_shape = c(ncol(x_train_scaled), 1), 
             activation = "tanh", return_sequences = TRUE) %>%
  layer_dropout(rate = 0.2) %>%
  # Second LSTM layer returns only its final output
  layer_lstm(units = 32, activation = "tanh", return_sequences = FALSE) %>%
  layer_dropout(rate = 0.2) %>%
  # Dense layer with a non-linear (ReLU) activation
  layer_dense(units = 16, activation = "relu") %>%
  # Final output layer stays linear (no activation) to predict continuous values
  layer_dense(units = 1)

# Compile the model
model %>% compile(
  optimizer = optimizer_adam(learning_rate = 0.001),  # Use Adam optimizer
  loss = "mse",                                      # Minimize Mean Squared Error
  metrics = c("mae")                                 # Evaluate Mean Absolute Error
)

# Train on the scaled lag windows against the standardised target
history <- model %>% fit(
  x_train_array, y_train_normalized,                 # Input data
  epochs = 50,                                       # Number of training epochs
  batch_size = 32,                                   # Batch size
  validation_split = 0.2                             # Use 20% of training data for validation
)


In [None]:
# Training curves from the fit history, with a custom title and axis labels
plot(history) +
  ggtitle("Training and Validation Metrics Over Epochs") +
  xlab("Epoch") +
  ylab("Metrics")

## Make Predictions and Calculate MSE

In [None]:
# Predict on the test windows and undo the target standardisation.
# as.vector() flattens the prediction array so it combines cleanly with the
# plain numeric target vector below.
predictions <- model %>% predict(x_test_array)
predictions_rescaled <- as.vector(predictions) * y_sd + y_mean

# Rescale y_test back to original scale
y_test_rescaled <- y_test_normalized * y_sd + y_mean

# Align lengths. head() is used instead of x[1:n] so that n == 0 yields an
# empty vector rather than indexing with c(1, 0).
common_length <- min(length(y_test_rescaled), length(predictions_rescaled))
y_test_rescaled <- head(y_test_rescaled, common_length)
predictions_rescaled <- head(predictions_rescaled, common_length)

# Mean Squared Error
mse <- mean((y_test_rescaled - predictions_rescaled)^2)
print(paste("Mean Squared Error:", mse))

# RMSE, and RMSE relative to the mean of the actual values (in percent)
rmse <- sqrt(mse)
mean_actual <- mean(y_test_rescaled)
relative_error <- (rmse / mean_actual) * 100
print(paste("RMSE:", rmse))
print(paste("Mean Actual Value:", mean_actual))
print(paste("Relative Error (%):", relative_error))

# MASE: model MAE divided by the MAE of a one-step naive forecast.
# NOTE(review): the naive MAE here is computed on the test series itself;
# the usual definition scales by the in-sample (training) naive MAE.
mae <- mean(abs(y_test_rescaled - predictions_rescaled))
naive_mae <- mean(abs(diff(y_test_rescaled)))
# Stored as `mase_value` rather than `mase`: a function called `mase` is
# defined later in this notebook, and re-running this cell afterwards would
# otherwise overwrite that function with a number.
mase_value <- mae / naive_mae
print(paste("Mean Absolute Scaled Error (MASE):", mase_value))

In [None]:
# Mean Absolute Scaled Error against a multi-step naive baseline.
#
# The numerator is the mean absolute error of `y_preds` against `y_test`.
# The denominator averages, over every start index i in the training series,
# the mean absolute error made by repeating y_train[i] as the forecast for the
# following m = length(y_test) observations.
#
# Args:
#   y_train: numeric vector of training observations; must be longer than
#            y_test so that at least one m-length window exists.
#   y_test:  numeric vector of held-out observations.
#   y_preds: numeric predictions with the same length as y_test.
#
# Returns:
#   A single numeric value (numerator / denominator).
#
# Raises:
#   An error if y_test is empty, if y_preds and y_test differ in length, or
#   if y_train is not longer than y_test. Previously these cases produced
#   NA/NaN through backwards 1:(n - m) indexing or silent recycling.
mase <- function(y_train, y_test, y_preds) {
  n <- length(y_train)
  m <- length(y_test)

  if (m < 1L || length(y_preds) != m) {
    stop("`y_test` and `y_preds` must be non-empty and of equal length.",
         call. = FALSE)
  }
  if (n <= m) {
    stop("`y_train` must be longer than `y_test`.", call. = FALSE)
  }

  # Naive error for each window: hold y_train[i] flat over the next m points
  window_errors <- vapply(
    seq_len(n - m),
    function(i) mean(abs(y_train[(i + 1):(i + m)] - y_train[i])),
    numeric(1)
  )
  denom <- mean(window_errors)

  # Mean absolute error of the predictions
  num <- mean(abs(y_test - y_preds))

  num / denom
}

In [None]:
# Put the training target back on its original scale, then score the test
# predictions with mase()
y_train_rescaled <- y_mean + y_sd * y_train_normalized

mase(
  y_train = y_train_rescaled,
  y_test = y_test_rescaled,
  y_preds = predictions_rescaled
)



### Mean Squared Error (MSE): 346,875,766.038345
MSE measures the average squared difference between the predicted and actual values. Since the target variable (deposits) has large values (mean of ~320,275), a high MSE is expected due to the squaring of errors. MSE is useful for optimization during training but can be less interpretable in isolation due to scaling.

### Root Mean Squared Error (RMSE): 18,624.60
RMSE is the square root of MSE, providing an error metric in the same units as the target variable. An RMSE of ~18,625 against a mean deposit value of ~320,275 represents the average magnitude of prediction errors. This suggests that, on average, the model's predictions are off by approximately 18,625 units of deposits.

### Mean Actual Value: 320,275.44
This is the average of the actual deposits values in your test set. It provides context for interpreting RMSE and relative error.

### Relative Error (%): 5.815%
Relative Error is calculated as (RMSE / Mean Actual Value) * 100%.
The model's predictions deviate from the actual values by approximately 5.82% on average.
In many financial forecasting contexts, a relative error below 10% is considered acceptable.

### Mean Absolute Scaled Error (MASE): 0.3368904
A Mean Absolute Scaled Error (MASE) of 0.3368904 indicates that the model’s predictions are significantly better than a naïve baseline, with prediction errors being only about 34% as large as those of the baseline. This result confirms that your LSTM model is effectively capturing patterns in the data and leveraging temporal dependencies, making it a suitable and accurate choice for the task.


### Recommendations
To decide whether to use this model, consider the following factors:

Business Context: If the goal is to achieve predictions within a roughly 5.8% error margin and absolute errors around 18,625 units (the RMSE reported above) are acceptable for decision-making, this model might suffice. The MASE of about 0.34 indicates that the model's errors are roughly one third the size of the naïve baseline's, so the model already improves substantially on that baseline, although there is still room for improvement.

Potential Refinements: To reduce the error further, explore adding more lagged features, seasonal indicators, or external factors (e.g., interest rates, economic indicators) that might affect bank deposit levels. Experimenting with model architecture (e.g., more LSTM layers, bidirectional LSTMs) or hyperparameters could also help capture temporal patterns more effectively.

Comparison with Simpler Models: Although this model outperforms the naïve baseline (as indicated by a MASE below 1), it is still worth benchmarking it against simpler models like ARIMA, exponential smoothing, or linear regression with lagged variables. These models are easier to implement and may perform comparably or better on data with strong autocorrelation.

### Actual vs Predicted

In [None]:
# Coerce the date column to class Date before plotting
data[["date"]] <- as.Date(data[["date"]])

# Blue line chart of deposits against date with a centred, bold title
ggplot(data, aes(x = date, y = deposits)) +
  geom_line(size = 1, color = "blue") +
  ggtitle("Deposits Over Time") +
  xlab("Date") +
  ylab("Deposits (in $)") +
  theme_classic() +
  theme(
    axis.title.y = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
  )

In [None]:
future_steps <- 10

# Seed window for the first out-of-sample quarter.
# The model inputs are ordered deposits_lag1, ..., deposits_lag10, i.e. the
# smallest lag first. The window for the period after the last row is
# therefore the last observed `deposits` value followed by that row's
# lag1 ... lag9. Re-using the last row's own lag columns would only
# re-predict the last observed period.
# NOTE(review): assumes lagged_data is in chronological order, so that lag1 is
# the most recent value -- confirm.
last_row <- tail(lagged_data, 1)
n_lags <- length(lagged_cols)
last_row_lags <- as.numeric(unlist(last_row[lagged_cols]))
current_lagged_values <- c(as.numeric(last_row$deposits), last_row_lags[-n_lags])

# Apply the same per-column standardisation used for the training inputs
current_lagged_scaled <- (current_lagged_values - x_mean) / x_sd

# Initialize vector for future predictions
future_predictions_rescaled <- numeric(future_steps)

# Iteratively predict future steps, feeding each prediction back as an input
for (i in seq_len(future_steps)) {
  # Reshape for LSTM input: [samples, timesteps, features]
  current_lagged_array <- array(current_lagged_scaled, dim = c(1, n_lags, 1))

  # Predict the next value and put it back on the original scale
  pred <- model %>% predict(current_lagged_array)
  pred_rescaled <- as.numeric(pred) * y_sd + y_mean

  future_predictions_rescaled[i] <- pred_rescaled

  # Slide the window: the new prediction becomes lag1 (first position) and the
  # largest lag (last position) falls off. Dropping the first element and
  # appending at the end would discard the newest value instead.
  current_lagged_values <- c(pred_rescaled, current_lagged_values[-n_lags])

  # Re-scale for next iteration
  current_lagged_scaled <- (current_lagged_values - x_mean) / x_sd
}

# Use the existing test$date for actual values
test_dates <- test$date

# Future quarterly dates after the last test date. The sequence starts at the
# last test date, so one extra element is generated and the first is dropped.
# This needs only base R (no lubridate `%m+%`).
last_test_date <- max(test_dates)
future_dates <- seq(
  last_test_date,
  by = "3 months",
  length.out = future_steps + 1
)[-1]

# Combine actual and predicted values; actuals are unknown (NA) for the future
actual_values <- c(y_test_rescaled, rep(NA, future_steps))
predicted_values <- c(predictions_rescaled, future_predictions_rescaled)

comparison_data <- data.frame(
  Date = c(test_dates, future_dates),
  Actual = actual_values,
  Predicted = predicted_values
)

In [None]:
# Coerce the Date column to class Date before plotting
comparison_data[["Date"]] <- as.Date(comparison_data[["Date"]])

# Red line chart of the predicted series with a centred, bold title
ggplot(comparison_data, aes(x = Date, y = Predicted)) +
  geom_line(size = 1, color = "red") +
  ggtitle("Predicted Deposits Over Time") +
  xlab("Date") +
  ylab("Predicted Deposits (in $)") +
  theme_classic() +
  theme(
    axis.title.y = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
  )

In [None]:
# Overlay the actual and predicted series. Colour is mapped to a constant
# label per layer so that both lines get a legend entry.
ggplot(comparison_data, aes(x = Date)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_line(aes(y = Predicted, color = "Predicted")) +
  ggtitle("Actual vs Predicted Deposit Levels") +
  xlab("Date") +
  ylab("Deposit Level") +
  labs(color = "Legend") +
  theme_minimal()

In [None]:
# Approximate 95% band around the predictions. The half-width is 1.96 times
# the standard deviation of the residuals (Actual - Predicted) over the rows
# where an actual value exists. The spread of the predicted *levels*, used
# previously, does not measure forecast error.
# NOTE(review): this assumes roughly normal, constant-variance errors and does
# not widen with the forecast horizon.
residual_sd <- sd(comparison_data$Actual - comparison_data$Predicted, na.rm = TRUE)
comparison_data <- comparison_data %>%
  mutate(
    yhat_lower = Predicted - 1.96 * residual_sd,
    yhat_upper = Predicted + 1.96 * residual_sd
  )

# Ensure Date is in the correct format
comparison_data$Date <- as.Date(comparison_data$Date)

# Combined plot: actual points, predicted line and the band around it
ggplot() +
  # Add historical data points (Actual values)
  geom_point(data = comparison_data, aes(x = Date, y = Actual, color = "Historical Data"), size = 1.5, alpha = 0.8) +
  # Add the predicted trend line
  geom_line(data = comparison_data, aes(x = Date, y = Predicted, color = "Predicted Trend"), size = 1.5) +
  # Add uncertainty intervals for predictions
  geom_ribbon(
    data = comparison_data,
    aes(x = Date, ymin = yhat_lower, ymax = yhat_upper, fill = "Uncertainty Interval"),
    alpha = 0.2
  ) +
  # Add labels and a title
  labs(
    title = "Historical Data and Prediction with LSTM",
    x = "Date",
    y = "Deposits (in $)",
    color = "Legend",
    fill = "Legend"
  ) +
  # Customize colors for the legend
  scale_color_manual(
    values = c("Historical Data" = "black", "Predicted Trend" = "red")
  ) +
  scale_fill_manual(
    values = c("Uncertainty Interval" = "blue")
  ) +
  # Add a classic theme
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

In [None]:
# Select only numeric columns
numeric_data <- data %>% select(where(is.numeric))

# Ensure 'deposits' is included in the numeric data
if (!"deposits" %in% colnames(numeric_data)) {
  stop("Variable 'deposits' is not numeric or missing from the dataset.")
}

# Remove the 'deposits' column from predictors
predictors <- numeric_data %>% select(-deposits)

# Correlation of every predictor with deposits. cor() returns a one-column
# matrix whose row names are the predictor names; convert it to a named
# vector before sorting, because sort() on the matrix returns bare values
# with no variable names attached.
cor_with_deposits <- cor(predictors, numeric_data$deposits, use = "complete.obs")
cor_analysis <- setNames(as.vector(cor_with_deposits), rownames(cor_with_deposits))

# Sort from most positive to most negative and print
cor_analysis <- sort(cor_analysis, decreasing = TRUE)
print(cor_analysis)

In [None]:
# Calculate correlations for numeric columns
numeric_data <- data %>% select(where(is.numeric))

# Ensure 'deposits' is numeric
if (!"deposits" %in% colnames(numeric_data)) {
  stop("Variable 'deposits' is not numeric or missing from the dataset.")
}

# Remove the 'deposits' column from predictors
predictors <- numeric_data %>% select(-deposits)

# One-column matrix: correlation of each predictor with deposits
cor_matrix <- cor(predictors, numeric_data$deposits, use = "complete.obs")

# Convert to data frame for visualization, sorted from most positive down
cor_df <- data.frame(Variable = rownames(cor_matrix), Correlation = cor_matrix[, 1])
cor_df <- cor_df %>% arrange(desc(Correlation))

# Select the top 20 variables by absolute correlation, so that strongly
# negative relationships are ranked alongside strongly positive ones
top_20_cor <- cor_df %>%
  arrange(desc(abs(Correlation))) %>%
  head(20)

# Visualize the top 20 correlations (bars ordered by signed correlation)
ggplot(top_20_cor, aes(x = reorder(Variable, Correlation), y = Correlation)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 20 Variables Correlated with Deposits",
    x = "Variables",
    y = "Correlation"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text.y = element_text(size = 8),
    axis.text = element_text(size = 10)
  )

In [None]:
#library(car)
# Hand-picked list of the 20 predictors to keep (taken from the correlation
# plot, per the original note)
top_variables <- c(
  "goodwill", "loans_constr", "loans_cc", "loans_hel",
  "deposits_noninterest", "securities", "reserve_losses", "sec_held_mat",
  "inc_comprehensive", "deposits_foreign", "bank_eq_cap", "loan_commit",
  "loans_multi", "stock_common", "surplus", "deposits_domestic",
  "loans_nonres", "assets_earning", "stock_pref", "fv_mtg_serv"
)

# Keep those columns and put `deposits` in front as the first column
data_top <- cbind(
  deposits = data$deposits,
  select(data, all_of(top_variables))
)

In [None]:
# Calculate correlation matrix for predictors
cor_matrix <- cor(data_top %>% select(-deposits), use = "complete.obs")

# Find pairs of highly correlated variables (|correlation| > 0.9).
# Only cells above the diagonal are considered: this excludes
# self-correlations and reports each pair once rather than as both (i, j) and
# (j, i). which(arr.ind = TRUE) always returns a matrix, so nrow() is safe
# even with zero or one match.
is_high <- abs(cor_matrix) > 0.9 & upper.tri(cor_matrix)
high_corr_pairs <- which(is_high, arr.ind = TRUE)

# Print high-correlation pairs with variable names and the correlation value
if (nrow(high_corr_pairs) > 0) {
  print(data.frame(
    var1 = rownames(cor_matrix)[high_corr_pairs[, "row"]],
    var2 = colnames(cor_matrix)[high_corr_pairs[, "col"]],
    correlation = cor_matrix[high_corr_pairs],
    row.names = NULL
  ))
} else {
  print("No highly correlated variables found.")
}

## Add Features

In [None]:
# Reload the Stata file and rebuild the calendar fields from `time`.
# NOTE(review): this path ("data2.dta") differs from the
# "50_LSTM_Model/data2.dta" path used when the data is first loaded -- confirm
# which working directory is intended.
data2 <- read_dta("data2.dta") %>%
  mutate(year = as.numeric(substr(time, 1, 4))) %>%
  mutate(q_str = substr(time, 5, 6)) %>%
  mutate(quarter = as.numeric(gsub("Q", "", q_str))) %>%
  mutate(date = as.Date(paste(year, (quarter - 1) * 3 + 1, "01", sep = "-")))

# Keep the date, the target and the chosen predictors
selected_vars <- c(
  "date", "deposits", "goodwill", "loans_constr", "deposits_noninterest",
  "securities", "bank_eq_cap", "assets_earning", "surplus"
)
data2 <- select(data2, all_of(selected_vars))

# Drop columns with more than 20 missing values, then drop any row that
# still contains a missing value
na_counts <- colSums(is.na(data2))
columns_to_remove <- names(na_counts)[na_counts > 20]
data2 <- na.omit(select(data2, -all_of(columns_to_remove)))

In [None]:
# Add lags 1 through 10 of the target and of every selected predictor,
# one variable at a time
lag_vars <- c(
  "deposits", "goodwill", "loans_constr", "deposits_noninterest",
  "securities", "bank_eq_cap", "assets_earning", "surplus"
)
for (lag_var in lag_vars) {
  data2 <- tk_augment_lags(data2, .value = !!sym(lag_var), .lags = 1:10)
}

In [None]:
# Remove rows with NAs (caused by lagging): na.omit() drops every row of
# data2 that has a missing value in any column.
data2 <- na.omit(data2)

In [None]:
# Names of all columns in data2 that contain "_lag" followed by digits
lagged_cols <- grep("_lag[0-9]+", names(data2), value = TRUE)

In [None]:
# Features are the lag columns; the target is the contemporaneous deposits
x_data <- data2[, lagged_cols]
y_data <- data2$deposits

# First 80% of the rows for training, the remainder for testing
n_obs <- nrow(data2)
train_size <- floor(0.8 * n_obs)

# With fewer than two rows, 1:train_size and (train_size + 1):n_obs would
# count backwards and silently reuse rows in both splits, so fail early.
if (train_size < 1 || train_size >= n_obs) {
  stop("Not enough rows in data2 to form a train/test split.", call. = FALSE)
}
train_idx <- seq_len(train_size)
test_idx <- seq(train_size + 1, n_obs)

x_train <- x_data[train_idx, ]
x_test <- x_data[test_idx, ]

y_train <- y_data[train_idx]
y_test <- y_data[test_idx]

# Confirm the dimensions of the training and testing sets
print(dim(x_train))  # Check rows and columns of x_train
print(length(y_train))  # Check rows of y_train
print(dim(x_test))  # Check rows and columns of x_test
print(length(y_test))  # Check rows of y_test

In [None]:
# Trim features and targets to a common row count within each split
min_train_rows <- min(nrow(x_train), length(y_train))
min_test_rows <- min(nrow(x_test), length(y_test))

x_train <- x_train[1:min_train_rows, ]
y_train <- y_train[1:min_train_rows]
x_test <- x_test[1:min_test_rows, ]
y_test <- y_test[1:min_test_rows]

# Report the sizes: the row count of each x should equal the length of its y
print(dim(x_train))
print(length(y_train))
print(dim(x_test))
print(length(y_test))

# Matrix versions of the feature tables
x_train_matrix <- as.matrix(x_train)
x_test_matrix <- as.matrix(x_test)

# Standardise every training column by its own mean and sd and keep the
# statistics that scale() stored as attributes
x_train_scaled <- scale(x_train_matrix)
x_mean <- attributes(x_train_scaled)[["scaled:center"]]
x_sd <- attributes(x_train_scaled)[["scaled:scale"]]

# The test features go through the training-set transform
x_test_scaled <- scale(x_test_matrix, center = x_mean, scale = x_sd)

# Standardise both targets with the training mean and sd
y_mean <- mean(y_train)
y_sd <- sd(y_train)
y_train_normalized <- (y_train - y_mean) / y_sd
y_test_normalized <- (y_test - y_mean) / y_sd

# Per-column summaries of the scaled features
summary(x_train_scaled)
summary(x_test_scaled)

In [None]:
# Turn each scaled feature matrix into a 3D array by appending a trailing
# dimension of size 1: [samples, columns, 1].
# NOTE(review): every column becomes its own time step with a single feature;
# if the matrix holds lags of several variables, confirm this layout is the
# intended one.
x_train_array <- array(x_train_scaled, dim = c(dim(x_train_scaled), 1))
x_test_array <- array(x_test_scaled, dim = c(dim(x_test_scaled), 1))

# Show the resulting shapes
print(dim(x_train_array))
print(dim(x_test_array))

In [None]:
#library(keras)

# Rebuild the network layer by layer: two L2-regularised LSTM layers, each
# followed by 30% dropout, then a ReLU dense layer and a single linear output.
model <- keras_model_sequential()
model <- layer_lstm(
  model,
  units = 128,
  input_shape = c(dim(x_train_array)[2], 1),
  activation = "tanh",
  return_sequences = TRUE,
  kernel_regularizer = regularizer_l2(0.001)
)
model <- layer_dropout(model, rate = 0.3)
model <- layer_lstm(
  model,
  units = 64,
  activation = "tanh",
  return_sequences = FALSE,
  kernel_regularizer = regularizer_l2(0.001)
)
model <- layer_dropout(model, rate = 0.3)
model <- layer_dense(model, units = 32, activation = "relu")
model <- layer_dense(model, units = 1)

# Adam optimiser, mean squared error loss, mean absolute error as a metric
compile(
  model,
  optimizer = optimizer_adam(learning_rate = 0.001),
  loss = "mse",
  metrics = "mae"
)

In [None]:
# Fit for 50 epochs in batches of 32, holding out 20% of the training
# samples for validation; keep the per-epoch history
history <- fit(
  model,
  x_train_array,
  y_train_normalized,
  validation_split = 0.2,
  batch_size = 32,
  epochs = 50
)

# Summarise and plot the training history
summary(history)
plot(history)

In [None]:
# Predict the standardised target for the test inputs
y_pred_normalized <- predict(model, x_test_array)

# Undo the standardisation using the training mean and sd
y_pred <- y_mean + y_sd * y_pred_normalized

# Side-by-side table of observed and predicted values; print the first rows
results <- data.frame(Actual = y_test, Predicted = as.vector(y_pred))
results %>%
  head() %>%
  print()

In [None]:
# Test-set residuals (observed minus predicted) plotted against their index,
# with a red reference line at zero
residuals <- y_test - y_pred
plot(residuals, xlab = "Index", ylab = "Residuals", main = "Residuals")
abline(col = "red", h = 0)

In [None]:
# Build the lagged feature table for the whole `data` set (described as
# covering 1984 to 2024): keep the date, the target and the chosen
# predictors, add lags 1-10 of each, and drop incomplete rows.
lag_vars <- c(
  "deposits", "goodwill", "loans_constr", "deposits_noninterest",
  "securities", "bank_eq_cap", "assets_earning", "surplus"
)

data_1984_2024 <- select(data, date, all_of(lag_vars))

for (lag_var in lag_vars) {
  data_1984_2024 <- tk_augment_lags(
    data_1984_2024,
    .value = !!sym(lag_var),
    .lags = 1:10
  )
}

data_1984_2024 <- na.omit(data_1984_2024)

In [None]:
# Scale the full-sample lag columns with the statistics stored in x_mean and
# x_sd so the inputs go through the same transform as the training features.
# NOTE(review): this sample includes the rows the model was trained on, so
# the fit shown for those rows is in-sample.
x_1984_2024 <- scale(
  as.matrix(data_1984_2024[, lagged_cols]),
  center = x_mean, scale = x_sd
)

# Reshape to [samples, columns, 1]. This array was previously built twice
# with identical code; it is built once here.
x_1984_2024_array <- array(
  x_1984_2024,
  dim = c(nrow(x_1984_2024), ncol(x_1984_2024), 1)
)

y_pred_normalized <- model %>% predict(x_1984_2024_array)

# Reverse normalization
y_pred <- (y_pred_normalized * y_sd) + y_mean

In [None]:
# Table of date, observed deposits and model prediction for the full sample;
# print the first rows
results_1984_2024 <- data.frame(
  Date = data_1984_2024[["date"]],
  Actual = data_1984_2024[["deposits"]],
  Predicted = as.vector(y_pred)
)
results_1984_2024 %>%
  head() %>%
  print()

In [None]:
# Actual (solid) and predicted (dashed) deposits over the full sample.
# The year span in the title is derived from the plotted dates so the label
# always matches the data; the previous hard-coded "2000–2024" disagreed with
# the 1984-2024 naming of this data set.
ggplot(results_1984_2024, aes(x = Date)) +
  geom_line(aes(y = Actual, color = "Actual"), size = 1) +
  geom_line(aes(y = Predicted, color = "Predicted"), size = 1, linetype = "dashed") +
  labs(
    title = paste0(
      "Deposits: Predicted vs. Actual (",
      format(min(results_1984_2024$Date, na.rm = TRUE), "%Y"),
      "–",
      format(max(results_1984_2024$Date, na.rm = TRUE), "%Y"),
      ")"
    ),
    x = "Date",
    y = "Deposits",
    color = "Legend"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_blank(),
    text = element_text(size = 12)
  )

In [None]:
# Draw a narrow band of +/- 0.15 standard deviations of the predicted series
# around the predictions.
# NOTE(review): this band is a visual aid only; it is not derived from
# forecast errors and should not be read as a statistical prediction interval.
comparison_data <- comparison_data %>%
  mutate(
    yhat_lower = Predicted - 0.15 * sd(Predicted, na.rm = TRUE), # Adjusted factor for narrower interval
    yhat_upper = Predicted + 0.15 * sd(Predicted, na.rm = TRUE)
  )

# Ensure Date is in the correct format
comparison_data$Date <- as.Date(comparison_data$Date)

# Create a larger, zoomed-in plot
ggplot() +
  # Add historical data points (Actual values)
  geom_point(data = comparison_data, aes(x = Date, y = Actual, color = "Historical Data"), size = 0.8, alpha = 0.8) +
  # Add the predicted trend line
  geom_line(data = comparison_data, aes(x = Date, y = Predicted, color = "Predicted Trend"), size = 0.8) +
  # Add narrower uncertainty intervals for predictions
  geom_ribbon(
    data = comparison_data,
    aes(x = Date, ymin = yhat_lower, ymax = yhat_upper, fill = "Uncertainty Interval"),
    alpha = 0.2
  ) +
  # Add labels and a title
  labs(
    title = "Deposits: Actual vs. Predicted",
    subtitle = "0.15 SD Narrowed Prediction Intervals",
    x = "Date",
    y = "Deposits (in Million $)",
    color = "Legend",
    fill = "Legend"
  ) +
  # Customize colors for the legend
  scale_color_manual(
    values = c("Historical Data" = "black", "Predicted Trend" = "red")
  ) +
  scale_fill_manual(
    values = c("Uncertainty Interval" = "blue")
  ) +
  # Add a classic theme with larger text
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    legend.title = element_text(size = 14, face = "bold"),
    legend.text = element_text(size = 12),
    axis.text = element_text(size = 12),
    plot.margin = margin(10, 10, 10, 10, "pt")  # 10pt margin on every side
  ) +
  # Fit the y-axis to everything that is drawn. Taking the minimum from the
  # actuals and the maximum from the predictions alone could clip whichever
  # series extends further in the other direction.
  coord_cartesian(
    ylim = range(
      c(comparison_data$Actual, comparison_data$yhat_lower, comparison_data$yhat_upper),
      na.rm = TRUE
    )
  )