### Cleaning the console

In [None]:
# Drop every object returned by ls() so the notebook starts from an empty workspace.
rm(list = ls())
# Write a form-feed character to the console (the section title indicates this
# is meant to clear the console).
cat('\014')
# Run garbage collection after the objects were removed.
gc()
# Fix the random seed so later stochastic steps are reproducible.
set.seed(99)

### Loading the required library

In [None]:
.libPaths("/opt/isga/library/4.0/2022-02-16/tu")
library(coreUtils)
library(TUISG)
library(titanng)
library(comet)
library(pleiadis)
library(dia)
library(tidyverse)
library(reshape2)
library(mlr)
library(xgboost)
library(fst)
library(gt)
library(plotly)
library(data.table)
library(dummies)
library(openxlsx)
library(dplyr)
library(data.table)
library(car)
library("ParBayesianOptimization")

### Define the Evaluation Functions

In [None]:
# Compute bucketed Gini and KS statistics for a binary outcome.
#
# Args:
#   actuals:   vector of observed outcomes coded 0 (non-event) / 1 (event).
#   pred_prob: vector of predicted probabilities, same length as `actuals`.
#   q:         numeric break points handed to cut() to form the score buckets.
#
# Returns:
#   An unnamed list of two numbers rounded to 4 decimals:
#   [[1]] the Gini (trapezoid area under the cumulative non-event vs
#   cumulative event curve, rescaled as (area - 0.5) * 2) and
#   [[2]] the maximum absolute gap between the two cumulative shares (KS).
eval_func <- function(actuals, pred_prob, q) {
  # data.frame() silently recycles a shorter vector when the lengths divide
  # evenly; fail loudly instead of scoring misaligned rows.
  stopifnot(length(actuals) == length(pred_prob))

  scored <- data.frame(prob = pred_prob, actuals)

  bucket_stats <- scored %>%
    mutate(pBucket = cut(prob, breaks = q, include.lowest = TRUE)) %>%
    group_by(pBucket) %>%
    summarise(
      nNonEvent = sum(actuals == 0),
      nEvent = sum(actuals == 1)
    ) %>%
    mutate(
      pct.cum.NonEvent = cumsum(nNonEvent) / sum(nNonEvent),
      pct.cum.Event = cumsum(nEvent) / sum(nEvent),
      # The first bucket has no predecessor: its lagged cumulative share is 0.
      pct.cum.NonEvent.lag = dplyr::lag(pct.cum.NonEvent, default = 0),
      pct.cum.Event.lag = dplyr::lag(pct.cum.Event, default = 0),
      ks = abs(pct.cum.NonEvent - pct.cum.Event),
      # Trapezoid slice: width = step in cumulative events,
      # height = mean of the two neighbouring cumulative non-event shares.
      gini_0 = (pct.cum.Event - pct.cum.Event.lag) *
        (pct.cum.NonEvent + pct.cum.NonEvent.lag) * 0.5
    )

  gini <- (sum(bucket_stats$gini_0) - 0.5) * 2
  ks <- max(bucket_stats$ks)

  list(round(gini, 4), round(ks, 4))
}
# Area under the ROC curve for a scored data frame.
#
# Args:
#   df: data frame with the observed outcome in `actual` and the predicted
#       probability in `prob`.
#
# Returns:
#   The AUC as a plain numeric, rounded to 4 decimals.
auc_func <- function(df) {
  library(pROC)
  fitted_roc <- roc(df$actual, df$prob, quiet = TRUE)
  round(as.numeric(auc(fitted_roc)), 4)
}
# Build the per-bucket KS / gains table, highest-score bucket first.
#
# Args:
#   actuals:   vector of observed outcomes coded 0 (non-event) / 1 (event).
#   pred_prob: vector of predicted probabilities, same length as `actuals`.
#   q:         numeric break points handed to cut() to form the score buckets.
#   view:      currently unused; kept so existing calls remain valid.
#
# Returns:
#   A table with one row per bucket and the columns Decile, pBucket,
#   nNonEvent, nEvent, nTotal, pct.Pop, Event.rate, cum.NonEvent, cum.Event,
#   pct.cum.NonEvent, pct.cum.Event, ks (in percent, 2 decimals) and Gini.
#   `Gini` holds the bucket's trapezoid slice
#   (cumEvent + previous cumEvent) * (cumNonEvent - previous cumNonEvent) / 2.
ksTable <- function(actuals, pred_prob, q, view = TRUE) {
  # data.frame() silently recycles a shorter vector when the lengths divide
  # evenly; fail loudly instead of tabulating misaligned rows.
  stopifnot(length(actuals) == length(pred_prob))

  req_cols <- c(
    "Decile", "pBucket", "nNonEvent", "nEvent", "nTotal", "pct.Pop",
    "Event.rate", "cum.NonEvent", "cum.Event", "pct.cum.NonEvent",
    "pct.cum.Event", "ks", "Gini"
  )

  scored <- data.frame(prob = pred_prob, actuals)

  bucket_table <- scored %>%
    mutate(pBucket = cut(prob, breaks = q, include.lowest = TRUE)) %>%
    group_by(pBucket) %>%
    summarise(
      nNonEvent = sum(actuals == 0),
      nEvent = sum(actuals == 1)
    ) %>%
    # Decile 1 is the lowest-score bucket; the table is then flipped so the
    # cumulative columns accumulate from the highest scores downwards.
    mutate(Decile = row_number()) %>%
    arrange(desc(Decile)) %>%
    mutate(
      nTotal = nNonEvent + nEvent,
      pct.Pop = nTotal / sum(nTotal),
      Event.rate = nEvent / nTotal,
      cum.NonEvent = cumsum(nNonEvent),
      cum.Event = cumsum(nEvent),
      pct.cum.NonEvent = cum.NonEvent / sum(nNonEvent),
      pct.cum.Event = cum.Event / sum(nEvent),
      ks = round(abs(pct.cum.NonEvent - pct.cum.Event) * 100, 2),
      # lag(default = 0) gives the first row a zero predecessor directly, so
      # no NA is produced and no manual first-row patch is needed.
      Gini = (pct.cum.Event + dplyr::lag(pct.cum.Event, default = 0)) *
        (pct.cum.NonEvent - dplyr::lag(pct.cum.NonEvent, default = 0)) * 0.5
    )

  bucket_table[, req_cols]
}

### load the respective Train, Test and OOT data

In [None]:
# Development (train), hold-out (test) and out-of-time (OOT) samples.
train_data <- read_csv("/prd/momarmi/YBL/PL_ETC/seg3_new/YBL_PL_ETC_seg3_train.csv")
test_data <- read_csv("/prd/momarmi/YBL/PL_ETC/seg3_new/YBL_PL_ETC_seg3_test.csv")
oot_data <- read_csv("/prd/momarmi/YBL/PL_ETC/seg3_new/YBL_PL_ETC_seg3_oot.csv")

### Input the Model variables

In [None]:
# Candidate attributes that go through binning and WOE transformation.
model_var <- c(
  "g407s", "g310s", "g300s", "g533s", "rvlr29", "g305s", "g512s", "s061s",
  "fi34s", "rev255", "at103s", "ul_trd", "s004s", "g250a", "cgd04s", "tw34s"
)

### Null imputation
Model variable values are imputed with -99 when they are missing (NA) or negative (< 0).

In [None]:
# Replace NA and negative values of every model variable by the sentinel -99.
impute_model_vars <- function(df) {
  mutate_at(df, model_var, ~ ifelse(is.na(.) | . < 0, -99, .))
}

x_train <- impute_model_vars(train_data)
x_test <- impute_model_vars(test_data)
x_oot <- impute_model_vars(oot_data)

### Binning the data
Set `weight = 1` for every row when building a non-weighted model.

In [None]:


# NOTE(review): `seq` is passed as a bare symbol; unless a variable of that
# name exists it resolves to base::seq. Presumably this selects sequential
# execution - confirm against the initParallel() documentation.
initParallel(seq)

# Initial binning of the model variables on the imputed training sample,
# using `dpd_18_60_flag` as target and the `weight` column as weights.
bins1 <- newBinning(
  data = x_train,
  target = "dpd_18_60_flag",
  weights = "weight",
  varsToInclude = model_var
)

# Persist the object that was just created. The previous code saved `bins`,
# which is not defined at this point (the result is stored in `bins1`).
saveRDS(bins1, "/prd/momarmi/YBL/PL_ETC/aman/Seg3_Bivariates/PL_ETC_seg3_bins.rds")

### Manual and Iterative process to refine the bins (Interbinning)
Check that the WOE trend is monotonic on the train, test and OOT samples.

In [None]:


# Load the saved binning object and refine it through interBin().
bins <- readRDS("/prd/momarmi/YBL/PL_ETC/seg3_new/Gaurav/Bins_Objects/PL_ETC_seg3_bins_v5.rds")
seg_bin <- interBin(bins)
# saveRDS(seg_bin, "/prd/momarmi/YBL/PL_ETC/seg3_new/Gaurav/Bins_Objects/PL_ETC_seg3_bins_v5.rds")

### Calculating IV of the selected variables

In [None]:
# Total IV per attribute: sum the bin-level IV column of each binning table.
# NOTE(review): the IV is read from `bins`, not from the refined `seg_bin` -
# confirm this is the intended object.
iv <- vapply(bins$bins, function(bin_tbl) sum(bin_tbl$IV), numeric(1))

# One row per attribute, strongest attribute first.
iv <- data.frame(Attribute = names(iv), IV = iv) %>%
  arrange(desc(IV))

In [None]:
# Model attributes plus the target and weight columns needed by applyBinning().
model_var <- c(
  "g407s", "g310s", "g300s", "g533s", "rvlr29", "g305s", "g512s", "s061s",
  "fi34s", "rev255", "at103s", "ul_trd", "s004s", "g250a", "cgd04s", "tw34s"
)
all_var <- c(model_var, "dpd_18_60_flag", "weight")

### Applying the train bins to the test and OOT data

In [None]:
# Apply the refined binning to each sample (model variables, target, weight).
train_bin <- applyBinning(
  binning = seg_bin, data = x_train[, all_var], target = "dpd_18_60_flag"
)
test_bin <- applyBinning(
  binning = seg_bin, data = x_test[, all_var], target = "dpd_18_60_flag"
)
oot_bin <- applyBinning(
  binning = seg_bin, data = x_oot[, all_var], target = "dpd_18_60_flag"
)

# Complement of the target: 1 when the target is 0, otherwise 0
# (a missing target also yields 0).
train_bin <- train_bin %>%
  mutate(non_event_flag = if_else(dpd_18_60_flag == 0, 1, 0, missing = 0))

### Neutralizing the desired bins 


In [None]:

library("sqldf")

# Every column of train_bin except the target is used as a grouping variable.
# NOTE(review): this also groups by helper columns such as `weight` and
# `non_event_flag`; those rows are never looked up later but do appear in
# `output`, so row positions in `output` depend on them.
cnames <- colnames(train_bin)
cnames <- cnames[!cnames %in% "dpd_18_60_flag"]

# Drop a stale result from a previous run; only look in the current
# environment so that an object of the same name in a package is ignored.
if (exists("output", inherits = FALSE)) rm(output)

# Collect one summary per variable in a preallocated list and bind once,
# instead of growing `output` with rbind() on every iteration.
woe_parts <- vector("list", length(cnames))
for (i in seq_along(cnames)) {
  var_name <- cnames[i]
  print(var_name)
  # Weighted totals, events and non-events per bin of the current variable.
  query <- sprintf("select %s, sum(weight) as cnt, sum(dpd_18_60_flag * weight) as event_sum, sum(non_event_flag * weight) as non_event_sum from train_bin group by %s", var_name, var_name)
  part <- sqldf(query)
  part$var_name <- var_name
  colnames(part) <- c("bin", "total", "event", "non_event", "var_name")
  woe_parts[[i]] <- part
}
output <- do.call(rbind, woe_parts)

# Fill these values according to your data: weighted event and non-event totals.
# NOTE(review): assumes x_train carries a `weighted_event_flag` column
# (presumably dpd_18_60_flag * weight) - confirm against the input file.
total <- sum(x_train$weight)
total_event <- sum(x_train$weighted_event_flag)
total_non_event <- total - total_event

# Share of all events / non-events falling in each bin, and the resulting WOE.
output$pevent <- output$event / total_event
output$pnon_event <- output$non_event / total_non_event
output$woe <- log(output$pevent / output$pnon_event)

#####################################################

# Working copies that will receive the WOE columns.
train_woe <- train_bin
test_woe <- test_bin
oot_woe <- oot_bin



Method 1 for assigning zero WOE (by row index in `output`)

In [None]:
#####################################################

# Set the WOE of the neutral bins to zero.
# The rows are addressed by position in `output`; specifying the bin itself
# would be preferable to passing an index.
index <- c(2L, 10L, 14L, 18L, 26L, 31L, 36L, 42L, 47L, 52L, 56L, 59L, 66L, 72L, 76L, 82L)
output[index, "woe"] <- 0

Method 2 for assigning zero WOE (by variable name and bin label)

In [None]:
## Mark the bins neutral in the interbinning app

#### neutralising the bins
# Adds `woe_upd` and `iv_upd`: zero for the listed (variable, bin) pairs and
# the regular value otherwise.
# `output` has no `iv` column, so the bin-level IV is derived here from the
# columns it does have: (pevent - pnon_event) * woe. Referring to a bare `iv`
# would pick up the attribute-level IV data frame from the global environment.
output <- output %>%
  mutate(
    neutral_bin =
      (var_name == "g407s" & bin == "(-Inf, 0)") |
      (var_name == "g310s" & bin == "(-Inf, 0)") |
      (var_name == "g533s" & bin %in% c("[Others]")) |
      (var_name == "rvlr29" & bin == "N_A") |
      (var_name == "at103s" & bin == "N_A") |
      (var_name == "g512s" & bin == "N_A") |
      (var_name == "ul_trd" & bin == "N_A"),
    # case_when() treats an NA condition as "no match", so rows with a
    # missing bin label keep their regular value.
    woe_upd = case_when(
      neutral_bin ~ 0,
      TRUE ~ woe
    ),
    iv_upd = case_when(
      neutral_bin ~ 0,
      TRUE ~ (pevent - pnon_event) * woe
    )
  ) %>%
  select(-neutral_bin)

### Concatenating the WOE values to the original data

In [None]:

# Append one "<variable>_woe" column per variable to `data`.
#
# Args:
#   data:      binned sample; each column named in `vars` holds bin labels.
#   woe_table: table with the columns `var_name`, `bin` and `woe`.
#   vars:      character vector of variable names to process, in order.
#
# Returns:
#   `data` with the additional WOE columns, in the order of `vars`.
attach_woe <- function(data, woe_table, vars) {
  for (v in vars) {
    # Bin -> WOE lookup for the current variable, renamed up front so the join
    # key matches the data column and the value column is already final.
    lookup <- woe_table %>%
      filter(var_name == v) %>%
      select(c("bin", "woe"))
    names(lookup) <- c(v, paste0(v, "_woe"))
    data <- data %>% left_join(lookup, by = v)
  }
  data
}

# Same lookup for all three samples; the WOE values always come from `output`.
train_woe <- attach_woe(train_woe, output, model_var)
test_woe <- attach_woe(test_woe, output, model_var)
oot_woe <- attach_woe(oot_woe, output, model_var)

In [None]:
################### mark the binned model variables with a "_bin" suffix ###################

# Rename the columns listed in `vars` to "<name>_bin". Columns are located by
# name rather than by position, so the result does not depend on the column
# order of `data`; a missing column stops the run instead of mislabelling.
suffix_bin_columns <- function(data, vars) {
  positions <- match(vars, colnames(data))
  stopifnot(!anyNA(positions))
  colnames(data)[positions] <- paste0(vars, "_bin")
  data
}

train_woe <- suffix_bin_columns(train_woe, model_var)
test_woe <- suffix_bin_columns(test_woe, model_var)
oot_woe <- suffix_bin_columns(oot_woe, model_var)

# Row-position key used to re-attach the WOE columns to the raw samples.
# seq_len() stays correct for an empty sample, unlike 1:nrow().
train_woe$id <- seq_len(nrow(train_woe))
test_woe$id <- seq_len(nrow(test_woe))
oot_woe$id <- seq_len(nrow(oot_woe))

# Identifier / descriptive columns plus the raw model variables.
req_cols <- c(
  "fid", "data_date", "origin_month", "dpd_18_60_ind", "Segment",
  "sanctioned_amount", "short_name", "YBL_flag", model_var
)

train <- x_train[, req_cols]
test <- x_test[, req_cols]
oot <- x_oot[, req_cols]

# The same row-position key on the raw samples.
train$id <- seq_len(nrow(train))
test$id <- seq_len(nrow(test))
oot$id <- seq_len(nrow(oot))

# Raw columns and binned / WOE columns side by side, matched on `id`.
train_all_data <- train %>% inner_join(train_woe, by = c("id"))
test_all_data <- test %>% inner_join(test_woe, by = c("id"))
oot_all_data <- oot %>% inner_join(oot_woe, by = c("id"))

### Saving the new data with WOE values

In [None]:
# Write the three enriched samples to the home directory, one file each.
walk2(
  list(train_all_data, test_all_data, oot_all_data),
  c(
    "~/PL_ETC_Seg3_woe_train_data.csv",
    "~/PL_ETC_Seg3_woe_test_data.csv",
    "~/PL_ETC_Seg3_woe_oot_data.csv"
  ),
  write_csv
)

In [None]:
# Full list of WOE columns (superseded by the shortlist assigned right below).
model_var_woe <- c(
  "g300s_woe", "rev255_woe", "rvlr29_woe", "g305s_woe", "g533s_woe",
  "g407s_woe", "at103s_woe", "g310s_woe", "s061s_woe", "g512s_woe",
  "g250a_woe", "fi34s_woe", "ul_trd_woe", "tw34s_woe", "s004s_woe",
  "cgd04s_woe"
)

# Shortlist used for modelling, plus the target column.
model_var_woe <- c(
  "g300s_woe", "rev255_woe", "g305s_woe", "g533s_woe", "g407s_woe",
  "at103s_woe", "g512s_woe", "fi34s_woe", "ul_trd_woe", "s004s_woe"
)
all_var_woe <- c(model_var_woe, "dpd_18_60_flag")

#####################################################

######################################################

# Plain data frames restricted to the shortlisted WOE columns and the target.
train1 <- as.data.frame(train_woe[, all_var_woe])
test1 <- as.data.frame(test_woe[, all_var_woe])
oot1 <- as.data.frame(oot_woe[, all_var_woe])

In [None]:
# Work from the home directory, where the WOE files were written.
setwd("~/")
train_data_all <- read_csv("PL_ETC_Seg3_woe_train_data.csv")
test_data_all <- read_csv("PL_ETC_Seg3_woe_test_data.csv")
oot_data_all <- read_csv("PL_ETC_Seg3_woe_oot_data.csv")

# Final WOE predictors and the modelling columns (predictors + target).
model_var_woe <- c(
  "rev255_woe", "g533s_woe", "g407s_woe", "at103s_woe",
  "g512s_woe", "fi34s_woe", "ul_trd_woe", "s004s_woe"
)
all_var_woe <- c(model_var_woe, "dpd_18_60_flag")

# Modelling frames for the three samples.
train1 <- train_data_all[, all_var_woe]
test1 <- test_data_all[, all_var_woe]
oot1 <- oot_data_all[, all_var_woe]

### Fit a logistic regression model

In [None]:
# Logistic regression of the target on every other column of train1.
GLMbaseMod <- glm(dpd_18_60_flag ~ ., family = binomial, data = train1)
# saveRDS(GLMbaseMod, "~/glm_model_object_on_9vars_v2.rds")

### Summary and VIF

In [None]:
# Optionally restore a previously saved model instead of the one fitted above:
# GLMbaseMod <- readRDS("~/glm_model_object_on_10vars_v1.rds")

# Coefficient table and fit statistics.
summary(GLMbaseMod)

# Variance inflation factor of each predictor.
VIF <- vif(GLMbaseMod)
VIF

### Performance of the GLM Model

In [None]:
# ---- Train ----
# Score the training sample.
prob <- predict(GLMbaseMod, newdata = train1[, model_var_woe], type = "response")
train_out <- data.frame(actual = train1$dpd_18_60_flag, prob = prob)

# Bucket boundaries: the training-score deciles, with the outer edges
# replaced by 0 and 1. The same boundaries are reused for test and OOT.
prob_cut_point <- quantile(
  x = train_out$prob,
  probs = seq(0, 1, length.out = 11),
  type = 8
)
q <- c(0, prob_cut_point[2:10], 1)

a <- eval_func(train_out$actual, train_out$prob, q)
auc_train <- auc_func(train_out)
gini_train <- a[[1]]
ks_train <- a[[2]]

# ---- Test ----
prob <- predict(GLMbaseMod, newdata = test1[, model_var_woe], type = "response")
test_out <- data.frame(actual = test1$dpd_18_60_flag, prob = prob)
a <- eval_func(test_out$actual, test_out$prob, q)
auc_test <- auc_func(test_out)
gini_test <- a[[1]]
ks_test <- a[[2]]

# ---- Out-of-time (OOT) ----
prob <- predict(GLMbaseMod, newdata = oot1[, model_var_woe], type = "response")
oot_out <- data.frame(actual = oot1$dpd_18_60_flag, prob = prob)
a <- eval_func(oot_out$actual, oot_out$prob, q)
auc_oot <- auc_func(oot_out)
gini_oot <- a[[1]]
ks_oot <- a[[2]]

# Print the metrics, followed by the relative KS drop of test and OOT vs train.
cat(
  auc_train, auc_test, auc_oot,
  gini_train, gini_test, gini_oot,
  ks_train, ks_test, ks_oot,
  (ks_train - ks_test) / ks_train,
  (ks_train - ks_oot) / ks_train
)
print("Model has trained!")

# 3 x 4 summary, filled column by column: AUC, Gini, KS, relative KS drop.
model_results <- matrix(
  c(
    auc_train, auc_test, auc_oot,
    gini_train, gini_test, gini_oot,
    ks_train, ks_test, ks_oot,
    (ks_train - ks_train) / ks_train,
    (ks_train - ks_test) / ks_train,
    (ks_train - ks_oot) / ks_train
  ),
  nrow = 3,
  ncol = 4
)

# Label the summary: one row per sample, one column per metric.
model_results <- data.frame(model_results)
colnames(model_results) <- c("AUC", "Gini", "KS", "KS Drop")
rownames(model_results) <- c("Train", "Test", "OOT")

# Per-bucket KS tables for the three samples.
train_ks_table <- ksTable(train_out$actual, train_out$prob, q)
test_ks_table <- ksTable(test_out$actual, test_out$prob, q)
oot_ks_table <- ksTable(oot_out$actual, oot_out$prob, q)
 


### Saving Workbook

In [None]:

### Define the Excel format styles
# New, empty workbook that receives the model report.
wb <- createWorkbook()

# Cell styles reused below: bold filled header, plain border, bold centred
# label, bordered centred number, and bordered centred percentage.
headerStyle <- createStyle(textDecoration = "bold", border = "TopBottomLeftRight", halign = "center", valign="center", fgFill = "#B4C6E7")
borderStyle <- createStyle(border = "TopBottomLeftRight")
BoldStyle <- createStyle(textDecoration = "bold", halign = "center", valign="center")
NumStyle <- createStyle(border = "TopBottomLeftRight", halign = "center", valign="center")
PercentageStyle <- createStyle(border = "TopBottomLeftRight", halign = "center", valign="center", numFmt="PERCENTAGE")

#########################################################################################

### Adding the KS-Table worksheet
addWorksheet(wb,"KS-Table")

# Summary block: model_results is written with its header in row 2 starting
# at column 2 (row names included), so it occupies columns 2:6.
# NOTE(review): model_results has three data rows (rows 3:5), while the styles
# below are applied to rows 3:6 - confirm whether row 6 is meant to be styled.
writeData(wb,"KS-Table",model_results,startRow = 2,startCol = 2,colNames = TRUE,headerStyle = headerStyle, rowNames = TRUE)
addStyle(wb,"KS-Table",headerStyle, rows=2,cols=2:6,gridExpand = TRUE)
addStyle(wb,"KS-Table",headerStyle,rows=3:6,cols=2,gridExpand = TRUE)
addStyle(wb,"KS-Table",NumStyle,rows=3:6,cols=3:6,gridExpand = TRUE)
# Column 6 is the "KS Drop" column and is shown as a percentage.
addStyle(wb,"KS-Table",PercentageStyle,rows=3:6,cols=6,gridExpand = TRUE)
writeData(wb,"KS-Table",x="",startRow = 7,startCol = 2)
# TRAIN section: label in row 9, table header in row 11, styled data rows 12:21.
writeData(wb,"KS-Table",x="TRAIN",startRow = 9,startCol = 2)
addStyle(wb,"KS-Table",BoldStyle,rows=9,cols=2,gridExpand = TRUE)
writeData(wb,"KS-Table",train_ks_table,startRow = 11,startCol = 2,colNames = TRUE,headerStyle = headerStyle, rowNames = FALSE)
addStyle(wb,"KS-Table",NumStyle,rows=12:21,cols=2:14,gridExpand = TRUE)
# With the table starting at column 2, columns 7:8 are pct.Pop / Event.rate,
# 11:12 are pct.cum.NonEvent / pct.cum.Event and 14 is Gini.
addStyle(wb,"KS-Table",PercentageStyle,rows=12:21,cols=7:8,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=12:21,cols=11:12,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=12:21,cols=14,gridExpand = TRUE)

# TEST section: label in row 25, table header in row 27, styled data rows 28:37.
writeData(wb,"KS-Table",x="",startRow = 22,startCol = 2)
writeData(wb,"KS-Table",x="TEST",startRow = 25,startCol = 2,headerStyle = headerStyle)
addStyle(wb,"KS-Table",BoldStyle,rows=25,cols=2,gridExpand = TRUE)
writeData(wb,"KS-Table",test_ks_table,startRow = 27,startCol = 2,colNames = TRUE,headerStyle = headerStyle, rowNames = FALSE)
addStyle(wb,"KS-Table",NumStyle,rows=28:37,cols=2:14,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=28:37,cols=7:8,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=28:37,cols=11:12,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=28:37,cols=14,gridExpand = TRUE)
# OOT section: label in row 41, table header in row 43, styled data rows 44:53.
writeData(wb,"KS-Table",x="",startRow = 38,startCol = 2)
writeData(wb,"KS-Table",x="OOT",startRow = 41,startCol = 2,headerStyle = headerStyle)
addStyle(wb,"KS-Table",BoldStyle,rows=41,cols=2,gridExpand = TRUE)
writeData(wb,"KS-Table",oot_ks_table,startRow = 43,startCol = 2,colNames = TRUE,headerStyle = headerStyle, rowNames = FALSE)
addStyle(wb,"KS-Table",NumStyle,rows=44:53,cols=2:14,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=44:53, cols=7:8,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=44:53, cols=11:12,gridExpand = TRUE)
addStyle(wb,"KS-Table",PercentageStyle,rows=44:53, cols=14,gridExpand = TRUE)

# Write the report to the home directory, replacing an existing file.
saveWorkbook(wb,paste0("~/final_LR_model_report_9vars_v2.xlsx"),overwrite = TRUE)


### weighted Model Performance


In [None]:

# TU colour palette (18 hex codes).
TUcolors <- c(
  "#4DDEFF", "#00A6CA", "#006880",
  "#CAE29C", "#A9D161", "#86B234",
  "#FFE866", "#FCD800", "#CCAD00",
  "#F49357", "#F17123", "#D85B0E",
  "#CCCCCC", "#A9A9A9", "#8C8C8C",
  "#E6634D", "#E04025", "#C9371D"
)

# Shortcuts for the two palette entries used most often.
TUMB <- TUcolors[2]
TUDB <- TUcolors[3]
# Render a data frame as a gt table with the house number formats.
#
# Args:
#   data: data frame to display.
#
# Returns:
#   A gt table: integer columns without decimals, double columns with one
#   decimal (except "contribution" and "vif"), columns matching vif / PSI /
#   Gain / Cover / Frequency with two decimals, and columns matching
#   contribution / MC with four decimals.
formatting <- function(data) {
  int_cols <- colnames(select_if(data, is.integer))
  dbl_cols <- setdiff(
    colnames(select_if(data, is.double)),
    c("contribution", "vif")
  )

  styled <- gt(data)
  styled <- tab_options(
    styled,
    container.width = 850,
    table.font.size = 12,
    container.overflow.x = TRUE,
    container.overflow.y = TRUE,
    column_labels.background.color = TUMB
  )
  styled <- fmt_number(styled, one_of(int_cols), decimals = 0, use_seps = TRUE)
  styled <- fmt_number(styled, one_of(dbl_cols), decimals = 1, use_seps = TRUE)
  styled <- fmt_number(
    styled,
    matches(c("vif", "PSI", "Gain", "Cover", "Frequency")),
    decimals = 2,
    use_seps = TRUE
  )
  fmt_number(
    styled,
    matches(c("contribution", "MC")),
    decimals = 4,
    use_seps = TRUE
  )
}

# Score each sample and keep the outcome, the score, the row weight and a
# label naming the sample.
prob <- predict(GLMbaseMod, newdata = train1[, model_var_woe], type = "response")
train_out <- data.frame(
  actual = train1$dpd_18_60_flag,
  prob = prob,
  wgt = train_data_all$weight,
  ind = "Train"
)

prob <- predict(GLMbaseMod, newdata = test1[, model_var_woe], type = "response")
test_out <- data.frame(
  actual = test1$dpd_18_60_flag,
  prob = prob,
  wgt = test_data_all$weight,
  ind = "Test"
)

prob <- predict(GLMbaseMod, newdata = oot1[, model_var_woe], type = "response")
oot_out <- data.frame(
  actual = oot1$dpd_18_60_flag,
  prob = prob,
  wgt = oot_data_all$weight,
  ind = "OOT"
)

# All three samples stacked, distinguished by `ind`.
scored_data <- rbind(train_out, test_out, oot_out)

# Weighted KS, Gini and ROC per sample (each scaled by 100), shown as a gt table.
scored_data %>%
  group_by(Index = ind) %>%
  do(
    Total = as.integer(sum(.$wgt)),
    newScore = 100 * KS(actual ~ prob, weights = wgt, data = .)$ks,
    newScoreg = 100 * Gini(actual ~ prob, weights = wgt, data = .),
    newScorer = 100 * ROC(actual ~ prob, weights = wgt, data = .)
  ) %>%
  unnest(everything()) %>%
  arrange(desc(Index)) %>%
  formatting() %>%
  tab_spanner(label = "KS", columns = vars(newScore)) %>%
  tab_spanner(label = "Gini", columns = vars(newScoreg)) %>%
  tab_spanner(label = "ROC", columns = vars(newScorer))

# Print the gains table stored in the first row of `data`.
#
# Args:
#   data: table with an `ind` column (sample label) and a list column `gains`.
#
# Writes a "### <label>" heading built from the first `ind` value, then prints
# the first element of `gains` after passing it through formatting().
printGains <- function(data) {
  cat(paste("###", data$ind[1], "\n"))
  first_gains <- pluck(pull(data, gains), 1)
  print(formatting(first_gains))
}

# Weighted gains table per sample with 10 intervals.
gains <- scored_data %>%
  group_by(ind) %>%
  do(
    gains = gainsTable(
      actual ~ prob,
      data = .,
      weights = wgt,
      numOfIntervals = 10
    )
  )

# cuts <- c(Inf)

# for (i in 2:10){append(cuts, as.numeric(gsub("]", "", str_split_1(levels(gains[[2]][[3]]$Score.Range)[i],",")[2])))}

# Same tables, with explicitly supplied break points.
gains2 <- scored_data %>%
  group_by(ind) %>%
  do(
    gains = gainsTable(
      actual ~ prob,
      data = .,
      weights = wgt,
      numOfIntervals = 10,
      breaks = c(1, 0.122, 0.0978, 0.0839, 0.0701, 0.0627, 0.0514, 0.0406, 0.0302, 0.0218, 0)
    )
  )
##################################################################################

# Rows with YBL_flag == 1 in each sample.
ybl_train <- filter(train_data_all, YBL_flag == 1)
ybl_test <- filter(test_data_all, YBL_flag == 1)
ybl_oot <- filter(oot_data_all, YBL_flag == 1)

# Development sample for the YBL subset: train and test rows stacked,
# restricted to the modelling columns.
ybl_dev_data <- rbind(ybl_train[, all_var_woe], ybl_test[, all_var_woe])

################################################################


### Performance of weighted GLM model

In [None]:


# ---- YBL development sample (train + test) ----
prob <- predict(GLMbaseMod, newdata = ybl_dev_data[, model_var_woe], type = "response")
dev_out <- data.frame(actual = ybl_dev_data$dpd_18_60_flag, prob = prob)

# Bucket boundaries: deciles of the development scores, with the outer edges
# replaced by 0 and 1. The same boundaries are reused for the OOT subset.
prob_cut_point <- quantile(
  x = dev_out$prob,
  probs = seq(0, 1, length.out = 11),
  type = 8
)
q <- c(0, prob_cut_point[2:10], 1)

a <- eval_func(dev_out$actual, dev_out$prob, q)
auc_dev <- auc_func(dev_out)
gini_dev <- a[[1]]
ks_dev <- a[[2]]
dev_ks_table <- ksTable(dev_out$actual, dev_out$prob, q)

# ---- YBL out-of-time sample ----
prob <- predict(GLMbaseMod, newdata = ybl_oot[, model_var_woe], type = "response")
oot_out <- data.frame(actual = ybl_oot$dpd_18_60_flag, prob = prob)
a <- eval_func(oot_out$actual, oot_out$prob, q)
auc_oot <- auc_func(oot_out)
gini_oot <- a[[1]]
ks_oot <- a[[2]]
oot_ks_table <- ksTable(oot_out$actual, oot_out$prob, q)

# AUC, Gini and KS for both samples, followed by the relative KS drop.
cat(
  auc_dev, gini_dev, ks_dev,
  auc_oot, gini_oot, ks_oot,
  (ks_dev - ks_oot) / ks_dev
)