## 1. Initial Setup and Environment Cleanup

Before beginning feature selection or model building, it's good practice to start with a clean R environment. This avoids issues caused by residual objects or plots from previous sessions.


In [None]:
# Start from an empty workspace: drop every object currently defined
rm(list = ls())

# Send a form-feed character, which clears the console in RStudio
cat("\014")

# Run the garbage collector so memory held by removed objects is released
gc()

# Fix the random seed so any stochastic step below is repeatable
set.seed(42)

## 2. Set Working Directory

In [None]:
# Define your working directory path (update this before running)
working_dir <- ""

# Change directory only when a path was supplied. setwd("") raises
# "cannot change working directory", so an unset placeholder keeps the
# current directory and reports it instead of aborting the run.
if (nzchar(working_dir)) {
  setwd(working_dir)
} else {
  message("working_dir is empty; staying in ", getwd())
}

## 3. Import Libraries

In [None]:
library(titanng)
library(comet)
library(pleiadis)
library(dia)
library(tidyverse)
library(reshape2)
library(mlr)
library(xgboost)
library(fst)
library(gt)
library(plotly)
library(data.table)
library(dummies)
library(caret)
library(magrittr)
library(dplyr)
library(openxlsx)

## 4. Define Functions

This section defines three core functions to evaluate binary classification models:
- `eval_func`: Calculates **Gini coefficient** and **KS statistic**
- `auc_func`: Computes the **AUC (Area Under the ROC Curve)** using `pROC`
- `ksTable`: Produces a detailed decile-wise KS table with cumulative gains and Gini values

### 4.1 Gini and KS Evaluation Function

In [None]:
#' Gini coefficient and KS statistic from bucketed scores
#'
#' Scores are binned with `cut()` using `q`; buckets are walked in ascending
#' score order and cumulative event / non-event shares are accumulated.
#' KS is the largest absolute gap between the two cumulative shares, and Gini
#' is `2 * area - 1`, where the area is obtained with the trapezoid rule.
#'
#' @param actuals Vector of observed outcomes; values equal to 1 are counted
#'   as events and values equal to 0 as non-events.
#' @param pred_prob Numeric vector of predicted scores, same length as
#'   `actuals`.
#' @param q Numeric vector of cut points for `cut()`. Duplicate values are
#'   dropped, because `cut()` refuses non-unique breaks (this happens when
#'   quantiles of heavily tied scores coincide).
#' @return An unnamed list of length two: Gini, then KS, each rounded to
#'   4 decimals.
eval_func <- function(actuals, pred_prob, q) {
  stopifnot(length(actuals) == length(pred_prob))

  bucket <- cut(pred_prob, breaks = unique(q), include.lowest = TRUE)
  # Scores outside the cut points form one extra trailing bucket rather than
  # being silently dropped from the totals.
  bucket <- addNA(bucket, ifany = TRUE)

  # Empty buckets are dropped; they would contribute nothing to either metric.
  by_bucket <- split(actuals, bucket, drop = TRUE)
  n_non_event <- vapply(by_bucket, function(a) sum(a == 0), numeric(1))
  n_event <- vapply(by_bucket, function(a) sum(a == 1), numeric(1))

  cum_non_event <- cumsum(n_non_event) / sum(n_non_event)
  cum_event <- cumsum(n_event) / sum(n_event)

  # Previous-bucket cumulative shares, starting from 0 for the first bucket
  prev_non_event <- c(0, head(cum_non_event, -1))
  prev_event <- c(0, head(cum_event, -1))

  # Trapezoid rule: width is the step in cumulative events, height is the
  # average of the current and previous cumulative non-event share.
  area <- sum((cum_event - prev_event) * (cum_non_event + prev_non_event) * 0.5)
  gini <- (area - 0.5) * 2
  ks <- max(abs(cum_non_event - cum_event))

  list(round(gini, 4), round(ks, 4))
}

### 4.2 AUC Calculation Function

In [None]:
# Area under the ROC curve, rounded to 4 decimals.
#
# df: data frame with an `actual` column (observed outcome) and a `prob`
#     column (predicted score).
# Attaches pROC on first use, then fits the ROC curve quietly.
auc_func <- function(df) {
  library(pROC)
  fitted_roc <- roc(df$actual, df$prob, quiet = TRUE)
  round(as.numeric(auc(fitted_roc)), 4)
}

### 4.3 KS Table for Decile Analysis

In [None]:
#' Bucket-level KS table
#'
#' Scores are binned with `cut()` using `q`. Buckets are numbered in ascending
#' score order (`Decile`) and then listed from the highest bucket down, so all
#' cumulative columns accumulate from the highest-score bucket.
#'
#' @param actuals Vector of observed outcomes; 1 is counted as an event and
#'   0 as a non-event.
#' @param pred_prob Numeric vector of predicted scores, same length as
#'   `actuals`.
#' @param q Numeric vector of cut points for `cut()`. Duplicate values are
#'   dropped, because `cut()` refuses non-unique breaks.
#' @param view Unused; kept so existing calls keep working.
#' @return A table with one row per non-empty bucket and the columns Decile,
#'   pBucket, nNonEvent, nEvent, nTotal, pct.Pop, Event.rate, cum.NonEvent,
#'   cum.Event, pct.cum.NonEvent, pct.cum.Event, ks (in percent, 2 decimals)
#'   and Gini. `Gini` holds each bucket's trapezoid area under the
#'   cumulative-event vs cumulative-non-event curve.
ksTable <- function(actuals, pred_prob, q, view = TRUE) {
  scored <- data.frame(prob = pred_prob, actuals = actuals)

  ks_tbl <- scored %>%
    mutate(pBucket = cut(prob, breaks = unique(q), include.lowest = TRUE)) %>%
    group_by(pBucket) %>%
    summarise(nNonEvent = sum(actuals == 0),
              nEvent = sum(actuals == 1)) %>%
    mutate(Decile = row_number()) %>%
    arrange(desc(Decile)) %>%
    mutate(nTotal = nNonEvent + nEvent,
           pct.Pop = nTotal / sum(nTotal),
           Event.rate = nEvent / nTotal,
           cum.NonEvent = cumsum(nNonEvent),
           cum.Event = cumsum(nEvent),
           pct.cum.NonEvent = cum.NonEvent / sum(nNonEvent),
           pct.cum.Event = cum.Event / sum(nEvent),
           ks = round(abs(pct.cum.NonEvent - pct.cum.Event) * 100, 2),
           # Trapezoid rule: width is the step in cumulative non-events,
           # height is the average of current and previous cumulative events.
           # default = 0 makes the first bucket start from the origin.
           Gini = (pct.cum.Event + dplyr::lag(pct.cum.Event, default = 0)) *
             (pct.cum.NonEvent - dplyr::lag(pct.cum.NonEvent, default = 0)) *
             0.5)

  req_cols <- c("Decile", "pBucket", "nNonEvent", "nEvent", "nTotal",
                "pct.Pop", "Event.rate", "cum.NonEvent", "cum.Event",
                "pct.cum.NonEvent", "pct.cum.Event", "ks", "Gini")

  ks_tbl[, req_cols]
}

## 5. Setting up File paths

In [None]:
# Input files: development sample, hold-out sample and out-of-time sample
train_data_path <- "~/ybl_pl_seg3_etc_train.csv"
test_data_path <- "~/ybl_pl_seg3_etc_test.csv"
oot_data_path <- "~/ybl_pl_seg3_etc_oot.csv"

In [None]:
# Load the variable dictionary and lowercase the values of its Attribute
# column, so they line up with the lowercase model variable names in joins
variable_dict <- "~/cv_algo_descriptions.csv"
dictionary <- fread(variable_dict)
dictionary[, Attribute := tolower(Attribute)]

In [None]:
# Define the target variable (dependent variable for prediction)
target <- "target_flag"

# Reading data
train_data <- read_csv(train_data_path)
test_data <- read_csv(test_data_path)
oot_data <- read_csv(oot_data_path)


## 6. Data converted to numeric on model variables and imputed with -99 for null & -ve values 

In [None]:
# Define Model Vars
model_var <- c("g407s","g310s","g300s","g533s","rvlr29","g305s","g512s","s004s","fi34s","dm001s","rev255","at103s","s061s","ul_trd","inst_trd","g500s","cgd04s","g250a","g519s","agg907","balmag01","rvlr32","us51a","tw34s","g331s","aggs911","at27s","g242b","ret224","ri101s","ba20s")

# Attach the dictionary descriptions to the model variables. This has to run
# after model_var is defined; running it in the dictionary cell fails with
# "object 'model_var' not found" on a clean top-to-bottom run.
vars <- data.frame(Attribute = model_var)
vars <- vars %>% left_join(dictionary, by = c("Attribute"))

In [None]:
## Model vars
# Sentinel imputation: missing or negative entries become -99, everything
# else is passed through unchanged.
flag_invalid <- function(v) ifelse(is.na(v) | v < 0, -99, v)

# Keep only the model variables and apply the imputation to each of them
x_train <- mutate_at(train_data[, model_var], model_var, flag_invalid)
x_test <- mutate_at(test_data[, model_var], model_var, flag_invalid)
x_oot <- mutate_at(oot_data[, model_var], model_var, flag_invalid)

# Target vectors for the three samples
y_train <- train_data[[target]]
y_test <- test_data[[target]]
y_oot <- oot_data[[target]]


## 7. Read model object

In [None]:
# Directory holding the serialized model objects
model_saving_dir <- "/prd/momarmi/YBL/PL_ETC/seg3/Final_model/"

# Version suffix of the model file to load
model_version <- "v4"

# Read "<dir>PL_ETC_seg3_xgb_<version>.rds"
xgb <- readRDS(sprintf("%sPL_ETC_seg3_xgb_%s.rds", model_saving_dir, model_version))

## 8. Model Performance on Train/Test/OOT Data

In [None]:
# Performance of Base Model

# Score the training data with the loaded model
prob <- predict(xgb, as.matrix(x_train))
# Pair the actuals with the predictions computed just above. This used to read
# train_data$prob, which is not the vector returned by predict().
train_out <- data.frame(actual = y_train, prob = prob)

# Decile cut points taken from the training scores; the outer edges are pinned
# to 0 and 1 (assumes the predictions are probabilities in [0, 1]). The same
# cut points are reused for the test and OOT samples.
prob_cut_point <- quantile(x = train_out$prob, probs = seq(0, 1, length.out = 11), type = 8)
q <- c(0, prob_cut_point[2:10], 1)

# Evaluate the model on training data
a <- eval_func(train_out$actual, train_out$prob, q)
auc_train <- auc_func(train_out)
gini_train <- unlist(a[1])
ks_train <- unlist(a[2])

# Predict probabilities for the test data and evaluate
prob <- predict(xgb, as.matrix(x_test))
test_out <- data.frame(actual = y_test, prob = prob)
a <- eval_func(test_out$actual, test_out$prob, q)
auc_test <- auc_func(test_out)
gini_test <- unlist(a[1])
ks_test <- unlist(a[2])

# Predict probabilities for the out-of-time (OOT) data and evaluate
prob <- predict(xgb, as.matrix(x_oot))
oot_out <- data.frame(actual = y_oot, prob = prob)
a <- eval_func(oot_out$actual, oot_out$prob, q)
auc_oot <- auc_func(oot_out)
gini_oot <- unlist(a[1])
ks_oot <- unlist(a[2])


# Base Model Performance: print all metrics followed by the relative KS drop
# of test and OOT versus train
cat(auc_train, gini_train, ks_train, auc_test, gini_test, ks_test, auc_oot, gini_oot, ks_oot, (ks_train - ks_test)/ks_train, (ks_train - ks_oot)/ks_train)
print("Model has trained!")


# matrix() fills column by column, so each group of three values below becomes
# one metric column with rows in Train / Test / OOT order.
model_results <- matrix(c(auc_train, auc_test, auc_oot,
                          gini_train, gini_test, gini_oot,
                          ks_train, ks_test, ks_oot,
                          (ks_train - ks_train)/ks_train, (ks_train - ks_test)/ks_train, (ks_train - ks_oot)/ks_train),
                        nrow = 3, ncol = 4)

# Convert the matrix to a data frame and set column and row names
model_results <- data.frame(model_results)
colnames(model_results) <- c("AUC", "Gini", "KS", "KS Drop")
rownames(model_results) <- c("Train", "Test", "OOT")

# Generate KS tables for train, test, and OOT data
train_ks_table <- ksTable(train_out$actual, train_out$prob, q)
test_ks_table <- ksTable(test_out$actual, test_out$prob, q)
oot_ks_table <- ksTable(oot_out$actual, oot_out$prob, q)


## 9. Feature Importance of Base Model

In [None]:
# Importance table for the features stored on the loaded model
var_imp_all_vars <- xgb.importance(feature_names = xgb$feature_names, model = xgb)

# Plain data frame copy whose first column is renamed to the dictionary's
# join key
Feature_imp <- as.data.frame(var_imp_all_vars)
names(Feature_imp)[1] <- "Attribute"

# Bring in the dictionary descriptions for each feature
Feature_imp <- left_join(Feature_imp, dictionary, by = "Attribute")