# Feature Selection in XGBoost
This document outlines the step-by-step process for performing feature selection during the development of a machine learning model using **XGBoost** in R.

## 1. Initial Setup and Environment Cleanup

Before beginning feature selection or model building, it's good practice to start with a clean R environment. This avoids issues caused by residual objects or plots from previous sessions.


In [None]:
# Start from an empty workspace so objects left over from earlier runs
# cannot leak into this one
rm(list = ls())

# Emit a form-feed character; consoles that honour it clear their output
cat("\014")

# Run garbage collection so memory held by the removed objects is released
gc()

## 2. Set Working Directory

In [None]:
# Directory to use as the working directory (update this before running)
working_dir <- ""

# Fail early with an actionable message instead of the generic setwd() error
# that the empty placeholder (or a mistyped path) would otherwise produce
if (!dir.exists(working_dir)) {
  stop(
    "Set `working_dir` to an existing directory before running this notebook.",
    call. = FALSE
  )
}

# Set the working directory
setwd(working_dir)

## 3. Import Libraries

In [None]:
library(coreUtils)
library(TUISG)
library(titanng)
library(comet)
library(pleiadis)
library(dia)
library(tidyverse)
library(reshape2)
library(mlr)
library(xgboost)
library(fst)
library(gt)
library(plotly)
library(data.table)
library(dummies)
library(caret)
library(magrittr)
library(dplyr)
library(openxlsx)

## 4. Define Functions

This section defines three core functions to evaluate binary classification models:
- `eval_func`: Calculates **Gini coefficient** and **KS statistic**
- `auc_func`: Computes the **AUC (Area Under the ROC Curve)** using `pROC`
- `ksTable`: Produces a detailed decile-wise KS table with cumulative gains and Gini values

### 4.1 Gini and KS Evaluation Function

In [None]:
 
#' Gini coefficient and KS statistic from score buckets
#'
#' Buckets the observations by quantiles of `pred_prob` (up to ten buckets),
#' builds the cumulative event / non-event shares per bucket, and derives
#' both statistics from those cumulative shares.
#'
#' @param actuals Vector of observed outcomes; values equal to 1 are counted
#'   as events and values equal to 0 as non-events.
#' @param pred_prob Numeric vector of predicted probabilities, same length as
#'   `actuals`.
#' @return An unnamed list of two numbers rounded to 4 decimals: element 1 is
#'   the Gini coefficient, element 2 is the KS statistic (as a fraction).
eval_func <- function(actuals, pred_prob) {
  # 11 quantile edges -> 10 buckets; duplicated edges are collapsed because
  # cut() requires unique breaks
  q <- quantile(x = pred_prob, probs = seq(0, 1, length.out = 11), type = 8)
  if (length(unique(q)) < 11) {
    q <- unique(q)
    cat(crayon::red("\nNote that quantiles are not unique. Showing only unique quantiles...\n"))
  }

  # Event / non-event counts per score bucket, ordered from lowest to highest
  # score bucket
  bucket_counts <- data.frame(prob = pred_prob, actuals) %>%
    mutate(pBucket = cut(prob, breaks = q, include.lowest = TRUE)) %>%
    group_by(pBucket) %>%
    summarise(nNonEvent = sum(actuals == 0),
              nEvent = sum(actuals == 1))

  # Cumulative shares, plus the same shares shifted down one bucket with 0
  # as the starting point
  cum_non_event <- cumsum(bucket_counts$nNonEvent) / sum(bucket_counts$nNonEvent)
  cum_event <- cumsum(bucket_counts$nEvent) / sum(bucket_counts$nEvent)
  prev_non_event <- c(0, head(cum_non_event, -1))
  prev_event <- c(0, head(cum_event, -1))

  # Trapezoid area under the curve of cumulative non-events (height) against
  # cumulative events (width); Gini is twice the area in excess of 0.5
  area <- sum((cum_event - prev_event) * (cum_non_event + prev_non_event) * 0.5)
  gini <- (area - 0.5) * 2

  # KS is the largest gap between the two cumulative distributions
  ks <- max(abs(cum_non_event - cum_event))

  list(round(gini, 4), round(ks, 4))
}


### 4.2 AUC Calculation Function

In [None]:
#' Area under the ROC curve for a scored data set
#'
#' @param df Data frame with an `actual` column (observed class) and a `prob`
#'   column (predicted score).
#' @return The AUC as a plain numeric value rounded to 4 decimals.
auc_func <- function(df) {
  # Namespace-qualified calls replace the former library(pROC) inside the
  # function body, which attached the package to the search path on every call.
  # NOTE(review): roc() is left on its default settings, as before; confirm
  # that its automatic choice of comparison direction is intended.
  roc_curve <- pROC::roc(df$actual, df$prob, quiet = TRUE)
  auc_value <- as.numeric(pROC::auc(roc_curve))
  round(auc_value, 4)
}

### 4.3 KS Table for Decile Analysis

In [None]:

#' Bucket-level KS table
#'
#' Buckets the observations by quantiles of `pred_prob` (up to ten buckets)
#' and reports counts, rates, cumulative shares, KS and a per-bucket area
#' term for each bucket.
#'
#' @param actuals Vector of observed outcomes; values equal to 1 are counted
#'   as events and values equal to 0 as non-events.
#' @param pred_prob Numeric vector of predicted probabilities, same length as
#'   `actuals`.
#' @param view Not used by the function body; kept so existing calls that
#'   pass it continue to work.
#' @return A tibble with one row per bucket and the columns `Decile`,
#'   `pBucket`, `nNonEvent`, `nEvent`, `nTotal`, `pct.Pop`, `Event.rate`,
#'   `cum.NonEvent`, `cum.Event`, `pct.cum.NonEvent`, `pct.cum.Event`,
#'   `ks` (percentage points, 2 decimals) and `Gini`.
ksTable <- function(actuals, pred_prob, view = TRUE) {
  # 11 quantile edges -> 10 buckets; duplicated edges are collapsed because
  # cut() requires unique breaks
  q <- quantile(x = pred_prob, probs = seq(0, 1, length.out = 11), type = 8)
  if (length(unique(q)) < 11) {
    q <- unique(q)
    cat(crayon::red("\nNote that quantiles are not unique. Showing only unique quantiles...\n"))
  }

  scored <- data.frame(prob = pred_prob, actuals)

  ks_tbl <- scored %>%
    mutate(pBucket = cut(prob, breaks = q, include.lowest = TRUE)) %>%
    group_by(pBucket) %>%
    summarise(nNonEvent = sum(actuals == 0),
              nEvent = sum(actuals == 1)) %>%
    mutate(Decile = row_number(),
           nTotal = nNonEvent + nEvent,
           pct.Pop = nTotal / sum(nTotal),
           Event.rate = nEvent / nTotal,
           cum.NonEvent = cumsum(nNonEvent),
           cum.Event = cumsum(nEvent),
           pct.cum.NonEvent = cum.NonEvent / sum(nNonEvent),
           pct.cum.Event = cum.Event / sum(nEvent),
           ks = round(abs(pct.cum.NonEvent - pct.cum.Event) * 100, 2),
           # Previous bucket's cumulative shares, starting from 0, so the first
           # bucket needs no separate patch-up afterwards
           prev.cum.NonEvent = c(0, head(pct.cum.NonEvent, -1)),
           prev.cum.Event = c(0, head(pct.cum.Event, -1)),
           # Per-bucket trapezoid: (sum of current and previous cumulative
           # event share) x (increase in cumulative non-event share) x 0.5
           Gini = (pct.cum.Event + prev.cum.Event) *
             (pct.cum.NonEvent - prev.cum.NonEvent) * 0.5)

  req_cols <- c("Decile", "pBucket", "nNonEvent", "nEvent", "nTotal", "pct.Pop",
                "Event.rate", "cum.NonEvent", "cum.Event", "pct.cum.NonEvent",
                "pct.cum.Event", "ks", "Gini")

  ks_tbl[, req_cols]
}


## 5. Setting up File paths and Model Parameters

In [None]:
# Locations of the training, test, and out-of-time (OOT) samples
train_data_path <- ""
test_data_path <- ""
oot_data_path <- ""

# Read the variable dictionary and lower-case the values in its Attribute column
variable_dict <- ""
dictionary <- fread(variable_dict)
dictionary[, Attribute := tolower(Attribute)]

# Name of the dependent variable the model predicts
target <- ""

# Output directory, plus base name and version tag of the saved model
model_saving_dir <- ""
model_name <- ""
model_version <- ""

# Base name of the feature-importance export
Feat_imp_file <- ""

# Base name of the export listing the variables kept after the
# IV (Information Value) and feature-importance filter
vars_after_IV_FI <- ""

# Assemble the full output paths as <dir><base>_<version>.<extension>
model_name <- sprintf("%s%s_%s.rds", model_saving_dir, model_name, model_version)
Feat_imp_file_name <- sprintf("%s%s_%s.csv", model_saving_dir, Feat_imp_file, model_version)

# RDS file in which the binning object for the features is stored
bin_obj <- paste0(model_saving_dir, "all_vars_binning_object.rds")

## 6. Creating Target

In [None]:
## Creating target

# Read one sample from `path` and recode its `dpd_18_60_ind` column to a
# numeric flag: "04_Bad" -> 1, "03_Good" -> 0, any other value -> NA.
# Shared by all three samples so the recoding rule lives in one place.
load_with_target <- function(path) {
  read_csv(path) %>%
    mutate(
      dpd_18_60_ind = case_when(
        dpd_18_60_ind == "04_Bad" ~ 1,
        dpd_18_60_ind == "03_Good" ~ 0,
        TRUE ~ NA_real_
      )
    )
}

train_data <- load_with_target(train_data_path)
test_data <- load_with_target(test_data_path)
oot_data <- load_with_target(oot_data_path)


## 7. Removing Blacklisted Features, Zero-Variance Features, and the Target

In [None]:
black_df_1 <- read_csv("/prd/momarmi/lists/Blacklist.csv")

# Attributes carrying any of these blacklist codes are excluded
black_df_2 <- black_df_1 %>%
  filter(blacklist %in% c(1, 2, 3, 4, 6))

blacklist1 <- as.vector(black_df_2$Attribute)
# The recoded target column must never be used as a predictor
blacklist2 <- c("dpd_18_60_ind")

blacklist <- union(blacklist1, blacklist2)
all_vars <- as.vector(black_df_1$Attribute)
vars_list <- setdiff(all_vars, blacklist)

# Dropping non numeric and blacklist variables
candidate_cols <- train_data[, vars_list]
numeric_flags <- vapply(candidate_cols, is.numeric, logical(1))
predictors <- setdiff(names(candidate_cols)[numeric_flags], blacklist)

# Dropping zero variance variables.
# A column is treated as constant when it holds at most one distinct value,
# with a missing value counting as a value of its own. This replaces a
# `var(x) == 0` test: var() returns NA for any column that contains missing
# values, and missing values are only imputed later in the notebook, so that
# comparison could not be evaluated for such columns.
is_constant <- vapply(
  candidate_cols[predictors],
  function(column) length(unique(column)) <= 1,
  logical(1)
)
zeroVariance <- predictors[is_constant]

model_var <- setdiff(predictors, zeroVariance)
length(model_var)

## 8. Join dictionary to get description

In [None]:
# Attach the dictionary description to every candidate variable
df <- data.frame("Attribute" = as.character(model_var))
df <- left_join(df, dictionary, by = c("Attribute"))
description <- df$Description

# Descriptions containing "M =" or " at month" are removed from the candidates
feat_rm <- data.frame("Description" = as.character(description[grepl("M =", description)]))
feat_rm1 <- data.frame("Description" = as.character(description[grepl(" at month", description)]))
df1 <- anti_join(df, feat_rm, by = c("Description"))
df2 <- anti_join(df1, feat_rm1, by = c("Description"))

# Removing the balance variables.
# A variable is dropped when its description contains any of these fragments.
# The strings are matched exactly as written (including spacing and
# spelling), so they must not be "corrected" without checking the dictionary.
balance_patterns <- c(
  "Total past due amount",
  "at month",
  "Aggregate amount past due",
  "Average total open-to-buy",
  "Total scheduled monthly payment",
  "Limit of overdraft of",
  "Total monthly obligation",
  "Aggregate financial non-mortgage actual payment",
  "Aggregate financial non-mortgage amount past due",
  "Aggregate bankcard term at month",
  "Aggregate bankcard Minimum Payment Amount",
  "Aggregate unsecured loan actual payment amount",
  "Aggregate  actual payment amount at month",
  "Aggregate spend at month",
  "Aggregate revolving spend at month",
  "Aggregate retail spend at month",
  "Total payment amount",
  "Min total open-to-buy for",
  "Aggregate spend over the",
  "Peak monthly spend over the",
  "Aggregate retail spend over the",
  "Delinquent Ammount",
  "Total open to buy of",
  "Aggregate unsecured loan amount past due",
  "Aggregate excess payment",
  "Total past due amount of",
  "Average balance of",
  "Total balance of",
  "Max total open-to-buy",
  "Total credit line of",
  "Aggregate balance at",
  # "Average aggregate excess payment",
  "Aggregate unsecured loan balance",
  "Aggregate credit line at",
  "Aggregate financial non-mortgage balance",
  "Aggregate bankcard amount past",
  "Highest balance of",
  "Maximum balance owed"
  # "Aaggregate excess payment"
)
# None of the fragments contains a regex metacharacter, so joining them with
# "|" matches exactly the same descriptions as testing each one in turn
balance_regex <- paste(balance_patterns, collapse = "|")

balVars <- tibble(Attribute = predictors) %>%
  left_join(dictionary, by = "Attribute") %>%
  filter(grepl(balance_regex, Description)) %>%
  pull(Attribute)

# Final candidates: variables that survived the description filters, minus
# the balance variables
model_var <- setdiff(df2$Attribute, balVars)



## 9. Convert Model Variables to Numeric and Impute Missing Values with -99

### 9.1 Convert Variables to Numeric type

In [None]:
# Coerce every column named in vars_list to numeric, in each of the three samples
train_data <- train_data %>% mutate(across(all_of(vars_list), as.numeric))
test_data <- test_data %>% mutate(across(all_of(vars_list), as.numeric))
oot_data <- oot_data %>% mutate(across(all_of(vars_list), as.numeric))


### 9.2 Impute null values with -99

In [None]:
## Impute null values; can drop them if there is sufficient data

# Replace missing values in the model variables with the sentinel -99
impute_model_vars <- function(data) {
  mutate(data, across(all_of(model_var), ~ replace_na(.x, -99)))
}

train_data <- impute_model_vars(train_data)
test_data <- impute_model_vars(test_data)
oot_data <- impute_model_vars(oot_data)

### 9.3 Preparing model variables

In [None]:
## Model vars

# Feature frames restricted to the selected model variables
x_train <- select(train_data, all_of(model_var))
x_test <- select(test_data, all_of(model_var))
x_oot <- select(oot_data, all_of(model_var))

# Response vectors, looked up by the configured target name
y_train <- train_data[[target]]
y_test <- test_data[[target]]
y_oot <- oot_data[[target]]

## 10. Convert dataframe to XGB DMatrix object

In [None]:
# Build the xgboost inputs for each sample.
# Weights are passed through the named `weight` argument and read from the
# full data frames: the x_* frames are subset to `model_var`, so they are not
# a reliable source for the weight column, and every sample must use its own
# weights (the OOT matrix previously reused the training weights).
# NOTE(review): assumes test_data and oot_data carry a "weight" column like
# train_data does - confirm.
xgb_train <- xgb.DMatrix(as.matrix(x_train), label = y_train, weight = train_data[["weight"]])
xgb_test <- xgb.DMatrix(as.matrix(x_test), label = y_test, weight = test_data[["weight"]])
xgb_oot <- xgb.DMatrix(as.matrix(x_oot), label = y_oot, weight = oot_data[["weight"]])

## 11. Base model Training

In [None]:
## Training model with depth 1
param <- list(
  max_depth = 1,
  eta = 0.1,
  nthread = -1,
  objective = "binary:logistic",
  eval_metric = "auc"
)

# Samples whose AUC is reported while training runs
watchlist <- list(eval = xgb_test, train = xgb_train)

# 1500 boosting rounds, printing the evaluation metric every 100 rounds
xgb <- xgb.train(
  param,
  xgb_train,
  watchlist,
  nrounds = 1500,
  verbose = 1,
  print_every_n = 100
)

# Persist the fitted model, then reload it from disk
saveRDS(xgb, file = model_name)

xgb <- readRDS(model_name)

## 12. Base model validation

In [None]:
## Plot the evaluation metric recorded for each watchlist sample per iteration
eval_log <- as.data.frame(xgb$evaluation_log)
library(reshape)
library(ggplot2)
eval_log_long <- melt(eval_log, id.vars = "iter", variable_name = "auc")
ggplot(eval_log_long, aes(iter, value)) +
  geom_line(aes(colour = auc)) +
  scale_x_continuous(breaks = seq(0, 1500, by = 100))

# Performance of Base Model

# Score one sample with the model and collect its predictions together with
# AUC, Gini and KS. Shared by the train, test and OOT samples so the three
# are always evaluated the same way.
score_sample <- function(model, features, labels) {
  scored <- data.frame(actual = labels, prob = predict(model, as.matrix(features)))
  gini_ks <- eval_func(scored$actual, scored$prob)
  list(
    out = scored,
    auc = auc_func(scored),
    gini = unlist(gini_ks[1]),
    ks = unlist(gini_ks[2])
  )
}

train_perf <- score_sample(xgb, x_train, y_train)
train_out <- train_perf$out
auc_train <- train_perf$auc
gini_train <- train_perf$gini
ks_train <- train_perf$ks

test_perf <- score_sample(xgb, x_test, y_test)
test_out <- test_perf$out
auc_test <- test_perf$auc
gini_test <- test_perf$gini
ks_test <- test_perf$ks

oot_perf <- score_sample(xgb, x_oot, y_oot)
oot_out <- oot_perf$out
auc_oot <- oot_perf$auc
gini_oot <- oot_perf$gini
ks_oot <- oot_perf$ks

# Base Model Performance: AUC, Gini, KS for train, test, OOT, followed by the
# relative KS drop train -> test and test -> OOT
cat(auc_train, gini_train, ks_train, auc_test, gini_test, ks_test, auc_oot, gini_oot, ks_oot, (ks_train - ks_test) / ks_train, (ks_test - ks_oot) / ks_test)
print("Model has trained!")

# Features importance of Base Model, with dictionary columns joined on
var_imp_all_vars <- xgb.importance(xgb$feature_names, model = xgb)
Feature_imp <- as.data.frame(var_imp_all_vars)
colnames(Feature_imp)[1] <- "Attribute"
Feature_imp <- Feature_imp %>% left_join(dictionary, by = "Attribute")
write.csv(Feature_imp, Feat_imp_file_name)


#######################################################################################
# Summary table: one row per sample; the matrix is filled column by column,
# so each group of three values below forms one column. "KS Drop" is measured
# relative to the training KS.
model_results <- matrix(
  c(
    auc_train, auc_test, auc_oot,
    gini_train, gini_test, gini_oot,
    ks_train, ks_test, ks_oot,
    (ks_train - ks_train) / ks_train, (ks_train - ks_test) / ks_train, (ks_train - ks_oot) / ks_train
  ),
  nrow = 3, ncol = 4
)

model_results <- data.frame(model_results)
colnames(model_results) <- c("AUC", "Gini", "KS", "KS Drop")
rownames(model_results) <- c("Train", "Test", "OOT")

# Bucket-level KS tables for the report
train_ks_table <- ksTable(train_out$actual, train_out$prob)
test_ks_table <- ksTable(test_out$actual, test_out$prob)
oot_ks_table <- ksTable(oot_out$actual, oot_out$prob)

## 13. KS Table & Feature Importance File Creation

In [None]:
### Define the Excel format style
wb <- createWorkbook()

headerStyle <- createStyle(textDecoration = "bold", border = "TopBottomLeftRight", halign = "center", valign = "center", fgFill = "#B4C6E7")
borderStyle <- createStyle(border = "TopBottomLeftRight")
BoldStyle <- createStyle(textDecoration = "bold", halign = "center", valign = "center")
NumStyle <- createStyle(border = "TopBottomLeftRight", halign = "center", valign = "center")
PercentageStyle <- createStyle(border = "TopBottomLeftRight", halign = "center", valign = "center", numFmt = "PERCENTAGE")

# Write one titled KS table to `sheet`: a bold title at `title_row`, the table
# header two rows below it, and the data rows underneath. Share-type columns
# get the percentage format; all other cells get the bordered, centred style.
write_ks_section <- function(wb, sheet, title, ks_table, title_row) {
  data_rows <- title_row + 2 + seq_len(nrow(ks_table))
  writeData(wb, sheet, x = title, startRow = title_row, startCol = 2)
  addStyle(wb, sheet, BoldStyle, rows = title_row, cols = 2, gridExpand = TRUE)
  writeData(wb, sheet, ks_table, startRow = title_row + 2, startCol = 2, colNames = TRUE, headerStyle = headerStyle, rowNames = FALSE)
  addStyle(wb, sheet, NumStyle, rows = data_rows, cols = 2:14, gridExpand = TRUE)
  # Applied after NumStyle so the percentage format wins on these columns
  for (pct_cols in list(7:8, 11:12, 14)) {
    addStyle(wb, sheet, PercentageStyle, rows = data_rows, cols = pct_cols, gridExpand = TRUE)
  }
  invisible(wb)
}

#########################################################################################
### Adding the KS-Table workbook
addWorksheet(wb, "KS-Table")

# Summary metrics block (row names in column 2, metrics in columns 3-6)
writeData(wb, "KS-Table", model_results, startRow = 2, startCol = 2, colNames = TRUE, headerStyle = headerStyle, rowNames = TRUE)
addStyle(wb, "KS-Table", headerStyle, rows = 2, cols = 2:6, gridExpand = TRUE)
addStyle(wb, "KS-Table", headerStyle, rows = 3:5, cols = 2, gridExpand = TRUE)
addStyle(wb, "KS-Table", NumStyle, rows = 3:5, cols = 3:6, gridExpand = TRUE)
addStyle(wb, "KS-Table", PercentageStyle, rows = 3:5, cols = 6, gridExpand = TRUE)
writeData(wb, "KS-Table", x = "", startRow = 6, startCol = 2)

write_ks_section(wb, "KS-Table", "TRAIN", train_ks_table, title_row = 9)
writeData(wb, "KS-Table", x = "", startRow = 22, startCol = 2)

write_ks_section(wb, "KS-Table", "TEST", test_ks_table, title_row = 25)
writeData(wb, "KS-Table", x = "", startRow = 38, startCol = 2)

write_ks_section(wb, "KS-Table", "OOT", oot_ks_table, title_row = 41)


######### Adding Feature Importance
addWorksheet(wb, "Feature_imp")
writeData(wb, "Feature_imp", Feature_imp, startRow = 1, startCol = 1, colNames = TRUE, headerStyle = headerStyle, rowNames = FALSE)
addStyle(wb, "Feature_imp", headerStyle, rows = 1, cols = 1:5, gridExpand = TRUE)

# Data occupies sheet rows 2 .. nrow + 1. The earlier `2:nrow(Feature_imp)+1`
# parsed as `(2:nrow(Feature_imp)) + 1`, because `:` binds tighter than `+`,
# which left the first data row unstyled.
feature_rows <- 1 + seq_len(nrow(Feature_imp))
addStyle(wb, "Feature_imp", borderStyle, rows = feature_rows, cols = 1:5, gridExpand = TRUE)
addStyle(wb, "Feature_imp", NumStyle, rows = feature_rows, cols = 2:4, gridExpand = TRUE)


saveWorkbook(wb, paste0(model_saving_dir, "model_report_", model_version, ".xlsx"), overwrite = TRUE)


## 14. Variable Selection

### 14.1 Binning

In [None]:
# Frame used for binning: model variables, the target, and the weight column
binning_cols <- c(model_var, target, "weight")
# (optional down-sampling, currently disabled: sample_n(size = 140700))
x_train <- train_data %>%
  select(all_of(binning_cols)) %>%
  # Missing and negative values of the model variables are set to -99
  mutate(across(all_of(model_var), ~ ifelse(is.na(.x) | .x < 0, -99, .x)))

# Default Binning
initParallel(seq)
bins <- newBinning(
  data = x_train,
  target = target,
  weights = "weight",
  varsToInclude = model_var
)

# Keep the binning object on disk for later reuse
saveRDS(bins, bin_obj)

### 14.2 Wellbinned

In [None]:
# Variables whose reject code is 0
# (presumably 0 means the binning accepted the variable - TODO confirm)
reject_codes <- bins$rejectCodes
wellBinned <- names(reject_codes[reject_codes == 0])

# Total IV per variable: the sum of the IV column of its bin table
iv <- lapply(bins$bins, function(bin_table) sum(bin_table$IV))

### 14.3 Features with IV > 0.1

In [None]:
## Get variable list with IV > 0.1

# One row per variable, highest IV first, without variables whose IV is Inf
iv_values <- unlist(iv)
iv <- data.frame(Attribute = names(iv_values), IV = iv_values)
iv <- arrange(iv, desc(IV))
iv <- filter(iv, IV != Inf)

# Add the dictionary columns, then keep the variables above the IV cut-off
iv <- left_join(iv, dictionary, by = "Attribute")
iv_top_vars_df <- filter(iv, IV > 0.1)

### 14.4 Selecting Well-Binned Variables with High Information Value (IV) or Feature Importance

In [None]:
Feature_imp_vars <- as.vector(Feature_imp$Attribute)
iv_top_vars <- as.vector(iv_top_vars_df$Attribute)

# Keep well-binned variables that pass the IV cut-off or appear in the
# feature-importance table of the base model
well_binned_high_iv <- intersect(wellBinned, iv_top_vars)
well_binned_important <- intersect(wellBinned, Feature_imp_vars)
model_vars1 <- union(well_binned_high_iv, well_binned_important)

# Joining description to the Features, ordered by descending IV
var_df <- data.frame("Attribute" = model_vars1)
var_df1 <- var_df %>%
  inner_join(iv, by = "Attribute") %>%
  arrange(desc(IV))

# Turn the configured base name into the full output path, then export
vars_after_IV_FI <- paste0(model_saving_dir, vars_after_IV_FI, "_", model_version, ".csv")

write.csv(var_df1, vars_after_IV_FI)

## 15. Correlation Check

In [None]:
# Frame for the correlation check: model variables, the target, and the
# weight column, with missing and negative model-variable values set to -99
correlation_cols <- c(model_var, target, "weight")
x_train <- train_data %>%
  select(all_of(correlation_cols)) %>%
  mutate(across(all_of(model_var), ~ ifelse(is.na(.x) | .x < 0, -99, .x)))
#' Greedy correlation filter
#'
#' Goes through the columns of `data` in the order given. For the current
#' column, every other remaining column whose absolute correlation with it is
#' at least `cor_thr` is removed; the procedure then moves on to the next
#' remaining column. Pass the variables in descending order by IV so the
#' higher-IV variable of a correlated pair is the one that is kept.
#'
#' @param data Data frame (or matrix) of numeric columns.
#' @param cor_thr Absolute-correlation threshold at or above which a column is
#'   removed.
#' @return Character vector with the names of the retained columns. The
#'   maximum absolute correlation before and after filtering is printed.
run_correlation_check <- function(data, cor_thr = 0.4) {
  cor_mat <- cor(data)
  # A column's correlation with itself must not count as a hit
  diag(cor_mat) <- 0
  cat("\n", "Max Correlation:", max(abs(cor_mat)))
  i <- 1
  # The threshold applies to |r| both when dropping columns and when deciding
  # whether to continue, so strong negative correlations are not left behind
  while (any(abs(cor_mat) >= cor_thr)) {
    idx <- which(abs(cor_mat[i, ]) >= cor_thr)
    if (length(idx) == 0) {
      i <- i + 1
    } else {
      # drop = FALSE keeps a 1 x 1 matrix (and its row name) when only one
      # column survives
      cor_mat <- cor_mat[-idx, -idx, drop = FALSE]
    }
  }
  cat("\n", "Updated Max Correlation:", max(abs(cor_mat)), "\n")
  rownames(cor_mat)
}
 
## Select the lowest correlation threshold where we get sufficient number of variables

# `final_model_vars` is the shortlist to screen, in descending order by IV.
# If it has not been defined, fall back to the shortlist in `var_df1`, which
# is already sorted by descending IV.
# NOTE(review): confirm this default matches the intended shortlist.
if (!exists("final_model_vars")) {
  final_model_vars <- as.character(var_df1$Attribute)
}

corr_input <- x_train[, final_model_vars]

low_corr_cols_03 <- run_correlation_check(corr_input, 0.3)
low_corr_cols_04 <- run_correlation_check(corr_input, 0.4)
low_corr_cols_05 <- run_correlation_check(corr_input, 0.5)
low_corr_cols_06 <- run_correlation_check(corr_input, 0.6)

# Save the retained variable names for each threshold
saveRDS(low_corr_cols_03, paste0(model_saving_dir, "low_corr_cols_with_threshold_03.rds"))
saveRDS(low_corr_cols_04, paste0(model_saving_dir, "low_corr_cols_with_threshold_04.rds"))
saveRDS(low_corr_cols_05, paste0(model_saving_dir, "low_corr_cols_with_threshold_05.rds"))
saveRDS(low_corr_cols_06, paste0(model_saving_dir, "low_corr_cols_with_threshold_06.rds"))