# Pipeline Predicción de Churn Bancario

## Inicialización

### Limpieza del Ambiente

In [None]:
# Print a start banner stamped with the current wall-clock time.
separador <- "=============================================================================\n"
cat(separador)
cat(paste("SCRIPT START: ", format(Sys.time(), "%a %b %d %X %Y"), "\n"))
cat(separador)

# Wipe every object (hidden ones included) from the calling environment --
# this also discards `separador` -- and then force a full garbage collection.
rm(list = ls(all.names = TRUE))
gc(full = TRUE, verbose = FALSE)

### Parámetros Globales

In [None]:
# Global run parameters: RNG seed, experiment number and the name of the
# column that identifies an entity (customer) in the dataset.
PARAM <- list(
  semilla = 765179L,
  experimento = 2000,
  dataset_metadata = list(entity_id = "numero_de_cliente")
)

# Seed the RNG once, up front, so that later random steps are reproducible.
set.seed(PARAM$semilla)
cat("Global Seed set to:", PARAM$semilla, "\n")

### Librerías

In [None]:
# Packages the pipeline attaches, grouped by role:
# data handling / utilities, tree models, Bayesian optimisation, plotting.
required_packages <- c(
  "data.table", "R.utils", "parallel", "collapse", "primes", "yaml",
  "randomForest", "ranger", "lightgbm",
  "mlrMBO", "mlr", "DiceKriging", "ParamHelpers", "lhs", "smoof",
  "ggplot2"
)

cat("Loading libraries...\n")
for (pkg in required_packages) {
  # require() attaches the package and returns FALSE when it cannot;
  # only in that case is the package installed from CRAN and attached.
  if (require(pkg, character.only = TRUE, quietly = TRUE)) {
    next
  }
  cat(sprintf("Installing '%s'...\n", pkg))
  install.packages(pkg, dependencies = TRUE, repos = "https://cloud.r-project.org")
  library(pkg, character.only = TRUE)
}
cat("Libraries loaded successfully.\n")

### Configuración de CPU

In [None]:
# Thread budget: data.table/collapse get all cores but one, ML libraries get
# all cores but two (never fewer than one thread each).
PARAM$workers_available <- parallel::detectCores()
# detectCores() is documented to return NA when the core count cannot be
# determined; without this fallback NA would propagate through max() into
# setDTthreads() and sprintf("%d") below.
if (is.na(PARAM$workers_available)) {
  PARAM$workers_available <- 1L
}
PARAM$workers <- max(1L, PARAM$workers_available - 1L)
ML_THREADS <- max(1L, PARAM$workers_available - 2L)

data.table::setDTthreads(PARAM$workers)
collapse::set_collapse(nthreads = PARAM$workers)

cat(sprintf("CPU cores: %d available, using %d for data.table, %d for ML\n", 
            PARAM$workers_available, PARAM$workers, ML_THREADS))

### Gestión de Columnas

In [None]:
# Registry of non-feature columns:
#   id      - row identifiers (customer, month)
#   target  - the raw target column
#   control - helper columns listed here so they are never used as features
COLS <- list(
  id = c("numero_de_cliente", "foto_mes"),
  target = "clase_ternaria",
  control = c(
    "target_label", "target_int", "clase_peso", "azar",
    "fold_train", "rf_split", "clase_binaria1", "clase_binaria2"
  )
)

# Return the de-duplicated names of every id, target and control column
# registered in COLS, in that order.
get_exclude_cols <- function() {
  unique(unlist(COLS[c("id", "target", "control")], use.names = FALSE))
}

# Return the column names of `dt` that are not id/target/control columns.
get_feature_cols <- function(dt) {
  nombres <- names(dt)
  unique(nombres[!(nombres %in% get_exclude_cols())])
}

# Guard against id/target/control columns leaking into a feature list.
#
# feature_list: character vector of candidate feature names.
# stage_name:   label used in the warning message.
# Returns `feature_list` untouched when it is clean; otherwise emits a
# warning naming the offending columns and returns the list without them.
validate_feature_cols <- function(feature_list, stage_name = "Unknown") {
  blocked <- get_exclude_cols()
  offenders <- intersect(feature_list, blocked)
  if (length(offenders) == 0) {
    return(feature_list)
  }
  warning(sprintf("[%s] Control columns leaked into features: %s", 
                  stage_name, paste(offenders, collapse = ", ")))
  setdiff(feature_list, blocked)
}

cat(sprintf("Exclude columns defined: %d columns\n", length(get_exclude_cols())))

### Control de Filas

In [None]:
# Per-stage log of row counts, filled in by track_rows().
ROW_TRACKING <- list()

# Record the row count observed at a pipeline stage and optionally enforce it.
#
# stage_name:    key under which the observation is stored in ROW_TRACKING.
# current_rows:  row count observed now.
# expected_rows: optional expected count; a mismatch aborts with an error
#                (the observation is recorded before the check runs).
# Returns TRUE invisibly.
track_rows <- function(stage_name, current_rows, expected_rows = NULL) {
  ROW_TRACKING[[stage_name]] <<- list(rows = current_rows, timestamp = Sys.time())

  has_mismatch <- !is.null(expected_rows) && current_rows != expected_rows
  if (has_mismatch) {
    stop(sprintf("CRITICAL [%s]: Row count mismatch. Expected %d, got %d",
                 stage_name, expected_rows, current_rows))
  }

  cat(sprintf("📊 [%s] Row count: %d\n", stage_name, current_rows))
  invisible(TRUE)
}

### Checkpoints

In [None]:
# Checkpoints are written as .rds files under CHECKPOINT_DIR, one per
# (checkpoint name, experiment number) pair.
CHECKPOINT_DIR <- "/dev/shm"
dir.create(CHECKPOINT_DIR, showWarnings = FALSE, recursive = TRUE)

# Build the file path of a named checkpoint for the current experiment.
# Shared by save/load/cleanup so the naming scheme lives in one place.
checkpoint_path <- function(checkpoint_name) {
  file.path(CHECKPOINT_DIR, sprintf("checkpoint_%s_exp%d.rds", 
                                    checkpoint_name, PARAM$experimento))
}

# Serialise `data` to the checkpoint file and report its size on disk.
#
# data:            any R object accepted by saveRDS().
# checkpoint_name: label embedded in the file name.
# compress:        forwarded to saveRDS().
# Returns the path of the written file.
save_checkpoint <- function(data, checkpoint_name, compress = TRUE) {
  filepath <- checkpoint_path(checkpoint_name)
  saveRDS(data, file = filepath, compress = compress)
  size_mb <- file.info(filepath)$size / 1024^2
  cat(sprintf("💾 Checkpoint saved: %s (%.1f MB)\n", checkpoint_name, size_mb))
  return(filepath)
}

# Read a previously saved checkpoint back into memory.
# Stops with an error when the checkpoint file does not exist.
load_checkpoint <- function(checkpoint_name) {
  filepath <- checkpoint_path(checkpoint_name)
  if (!file.exists(filepath)) stop(sprintf("Checkpoint not found: %s", filepath))
  cat(sprintf("📂 Loading checkpoint: %s\n", checkpoint_name))
  return(readRDS(filepath))
}

# Delete a checkpoint file if it exists; a missing file is not an error.
cleanup_checkpoint <- function(checkpoint_name) {
  filepath <- checkpoint_path(checkpoint_name)
  if (file.exists(filepath)) {
    # Only report success when the file was really removed; file.remove()
    # itself emits a warning when the deletion fails.
    if (isTRUE(file.remove(filepath))) {
      cat(sprintf("🗑️ Removed checkpoint: %s\n", checkpoint_name))
    }
  }
}

### Validaciones

In [None]:
# Run-wide validation log: every check is appended to `checkpoints`,
# failed ones additionally to `errors`.
VALIDATION <- list(checkpoints = list(), errors = list(), warnings = list())

# Record and report the outcome of a validation check.
#
# name:         short identifier of the check.
# condition:    result of the check; anything other than a single TRUE
#               (including NA) counts as a failure.
# message:      human-readable description printed with the outcome.
# stop_on_fail: when TRUE a failed check aborts the script with an error.
# actual_value: optional observed value, stored with the log entry.
# Returns TRUE when the check passed, FALSE otherwise.
validate_checkpoint <- function(name, condition, message, stop_on_fail = TRUE, actual_value = NULL) {
  # isTRUE() turns an NA condition (e.g. a comparison on a NaN statistic)
  # into a reported failure instead of an obscure
  # "missing value where TRUE/FALSE needed" error in the `if` below.
  passed <- isTRUE(condition)
  result <- list(name = name, passed = passed, message = message, timestamp = Sys.time())
  if (!is.null(actual_value)) {
    result$actual_value <- actual_value
  }
  VALIDATION$checkpoints[[length(VALIDATION$checkpoints) + 1]] <<- result
  
  if (passed) {
    cat(sprintf("✅ CHECKPOINT [%s]: %s\n", name, message))
  } else {
    cat(sprintf("❌ CHECKPOINT FAILED [%s]: %s\n", name, message))
    VALIDATION$errors[[length(VALIDATION$errors) + 1]] <<- result
    if (stop_on_fail) stop(paste("Validation failed:", name))
  }
  return(passed)
}

### Carpeta del Experimento

In [None]:
# All experiment output lives under ~/WF<experimento>/exp/<experimento>.
WORK_DIR <- path.expand("~")
setwd(WORK_DIR)

# Create the experiment folder inside the home directory and move into it.
experimento_folder <- paste0("WF", PARAM$experimento)
dir.create(experimento_folder, showWarnings = FALSE)
setwd(file.path(WORK_DIR, experimento_folder))

# recursive = TRUE creates ./exp and ./exp/<experimento> in a single call.
dir.create(paste0("./exp/", PARAM$experimento), showWarnings = FALSE, recursive = TRUE)

cat(sprintf("Working directory: %s\n", getwd()))
cat("\n✅ INICIALIZACION COMPLETA\n")

## Preprocesamiento del Dataset

### DT Incorporar Dataset

In [None]:
# Phase banner.
cat(paste("\n", strrep("=", 60), "\n"))
cat("INCORPORAR DATASET\n")
cat(paste(strrep("=", 60), "\n"))

# Source object in Google Cloud Storage and the local path it is copied to.
GCS_BUCKET <- "markjoyceaustral_bukito5"
GCS_DATASET_PATH <- sprintf("gs://%s/datasets/analistasr_competencia_2025.csv.gz", GCS_BUCKET)
LOCAL_DATASET <- "/dev/shm/input_dataset.csv.gz"

cat(sprintf("Downloading from: %s\n", GCS_DATASET_PATH))
# system() returns the exit status of the gsutil command; 0 means success.
download_result <- system(paste("gsutil cp", GCS_DATASET_PATH, LOCAL_DATASET))

validate_checkpoint("DOWNLOAD", download_result == 0, "Dataset downloaded from GCS")

### Lectura y Validación

In [None]:
# Read the downloaded file into memory, then delete the local copy.
dataset <- fread(LOCAL_DATASET)
file.remove(LOCAL_DATASET)

# Baseline dimensions of the freshly loaded table.
INITIAL_ROWS <- nrow(dataset)
INITIAL_COLS <- ncol(dataset)
INITIAL_CUSTOMERS <- uniqueN(dataset$numero_de_cliente)

track_rows("DATA_LOAD", INITIAL_ROWS)

cat(sprintf("Loaded: %d rows x %d columns (%d unique customers)\n", 
            INITIAL_ROWS, INITIAL_COLS, INITIAL_CUSTOMERS))

validate_checkpoint("ROWS", INITIAL_ROWS > 0, sprintf("Dataset has %d rows", INITIAL_ROWS))
validate_checkpoint("REQUIRED_COLS",
                    all(c("numero_de_cliente", "foto_mes", "clase_ternaria") %in% names(dataset)),
                    "Required columns present")

# Count repeated (cliente, mes) keys with duplicated(), which yields one
# logical per row, instead of materialising a de-duplicated copy of the
# whole table with unique() just to compare row counts.
dup_count <- sum(duplicated(dataset, by = c("numero_de_cliente", "foto_mes")))
validate_checkpoint("NO_DUPLICATES", dup_count == 0,
                    sprintf("No duplicate (cliente, mes) pairs (found %d)", dup_count))

### Distribución de Meses

In [None]:
# Rows per month, in chronological order.
month_counts <- dataset[, .N, by = foto_mes][order(foto_mes)]
cat("\nMonth distribution:\n")
print(month_counts)

# First and last month present; both must fall inside 201901..202109.
rango_meses <- range(month_counts$foto_mes)
validate_checkpoint("MONTH_RANGE",
                    rango_meses[1] >= 201901 & rango_meses[2] <= 202109,
                    sprintf("Months range from %d to %d", rango_meses[1], rango_meses[2]))

# Size of the month that TEST_MONTH designates.
TEST_MONTH <- 202109
TEST_MONTH_ROWS <- dataset[foto_mes == TEST_MONTH, .N]
TEST_MONTH_CUSTOMERS <- dataset[foto_mes == TEST_MONTH, uniqueN(numero_de_cliente)]

validate_checkpoint("TEST_MONTH_EXISTS", TEST_MONTH_ROWS > 0,
                    sprintf("Test month %d has %d rows (%d customers)", 
                            TEST_MONTH, TEST_MONTH_ROWS, TEST_MONTH_CUSTOMERS))

save_checkpoint(dataset, "phase2_loaded")
cat("\n✅ INCORPORAR DATASET COMPLETO\n")

### CA Catastrophe Analysis

In [None]:
# Phase banner.
cat(paste("\n", strrep("=", 60), "\n"))
cat("CATASTROPHE ANALYSIS\n")
cat(paste(strrep("=", 60), "\n"))

# Sort by customer and month so that each customer's rows are contiguous
# and chronological; the grouped lag/lead helpers below rely on row order.
setorder(dataset, numero_de_cliente, foto_mes)

# Empty log with one row per correction applied (feature, months, method).
CA_log <- data.table(
  feature = character(),
  meses = character(),
  metodo = character()
)

# Grouping object by customer, reused by every grouped lag/lead.
g_cliente <- GRP(dataset$numero_de_cliente)

#### Funciones de Corrección

In [None]:
# Overwrite `pcampo` in the rows whose foto_mes is in `pmeses` with the mean
# of the same customer's previous and next row (NA neighbours are ignored).
# Integer columns receive the rounded mean. Operates on the global `dataset`
# by reference, using the global `g_cliente` grouping.
Corregir_interpolar <- function(pcampo, pmeses) {
  serie <- dataset[[pcampo]]

  # Column 1: previous row of the customer; column 2: next row.
  vecinos <- cbind(
    flag(serie, n = 1, g = g_cliente),
    flag(serie, n = -1, g = g_cliente)
  )

  filas <- which(dataset$foto_mes %in% pmeses)
  reemplazo <- rowMeans(vecinos, na.rm = TRUE)[filas]
  if (is.integer(serie)) {
    reemplazo <- round(reemplazo)
  }

  # Warnings raised by the assignment are deliberately silenced.
  suppressWarnings(set(dataset, i = filas, j = pcampo, value = reemplazo))
}

# Set `pcampo` to NA in every row whose foto_mes is in `pmeses`.
# Does nothing when the column is absent. Modifies the global `dataset`
# by reference.
AsignarNA_campomeses <- function(pcampo, pmeses) {
  if (!(pcampo %in% colnames(dataset))) {
    return(invisible(NULL))
  }
  filas <- which(dataset$foto_mes %in% pmeses)
  set(dataset, i = filas, j = pcampo, value = NA)
}

# Apply one correction to `pcampo` for the months in `pmeses` and log it.
#
# pmetodo selects the strategy:
#   "MachineLearning"    -> blank the values (AsignarNA_campomeses)
#   "EstadisticaClasica" -> interpolate from neighbours (Corregir_interpolar)
# Any other value is logged but leaves the data untouched.
# Returns invisibly 1L when the column does not exist, 0L otherwise.
Corregir_atributo <- function(pcampo, pmeses, pmetodo) {
  campo_existe <- pcampo %in% colnames(dataset)
  if (!campo_existe) {
    return(invisible(1L))
  }

  entrada <- data.table(feature = pcampo, meses = paste(pmeses, collapse = ","), metodo = pmetodo)
  CA_log <<- rbind(CA_log, entrada)

  # Dispatch table: method name -> correction function.
  corrector <- list(
    "MachineLearning"    = AsignarNA_campomeses,
    "EstadisticaClasica" = Corregir_interpolar
  )[[pmetodo]]
  if (!is.null(corrector)) {
    corrector(pcampo, pmeses)
  }

  invisible(0L)
}

#### Aplicar Correcciones

In [None]:
cat("Applying catastrophe corrections...\n")

# The corrections are described as data and applied in list order, which is
# also the order in which they are appended to CA_log. local() keeps the
# helper variables out of the global environment; Corregir_atributo still
# updates the global `dataset` and `CA_log`.
invisible(local({
  ML <- "MachineLearning"
  EC <- "EstadisticaClasica"

  # Month sets shared by several features.
  m_202006 <- c(202006)
  m_201904 <- c(201904)
  m_tres <- c(201905, 201910, 202006)
  m_cuatro <- c(201910, 202002, 202006, 202102)
  m_dos <- c(202009, 202010)

  correcciones <- list(
    list("active_quarter", m_202006, ML),
    list("internet", m_202006, ML),
    list("mrentabilidad", m_tres, EC),
    list("mrentabilidad_annual", m_tres, EC),
    list("mcomisiones", m_tres, EC),
    list("mactivos_margen", m_tres, EC),
    list("mpasivos_margen", m_tres, EC),
    list("mcuentas_saldo", m_202006, EC),
    list("ctarjeta_debito_transacciones", m_202006, EC),
    list("mautoservicio", m_202006, EC),
    list("ctarjeta_visa_transacciones", m_202006, EC),
    list("mtarjeta_visa_consumo", m_202006, EC),
    list("ctarjeta_master_transacciones", m_202006, EC),
    list("mtarjeta_master_consumo", m_202006, EC),
    list("ctarjeta_visa_debitos_automaticos", m_201904, EC),
    list("mttarjeta_visa_debitos_automaticos", m_201904, EC),
    list("ccajeros_propios_descuentos", m_cuatro, EC),
    list("ccajeros_propios_descuentos", m_dos, ML),
    list("mcajeros_propios_descuentos", m_cuatro, EC),
    list("mcajeros_propios_descuentos", m_dos, ML),
    list("ctarjeta_visa_descuentos", m_cuatro, EC),
    list("ctarjeta_visa_descuentos", m_dos, ML),
    list("mtarjeta_visa_descuentos", m_cuatro, EC),
    list("mtarjeta_visa_descuentos", m_dos, ML),
    list("ctarjeta_master_descuentos", m_cuatro, EC),
    list("ctarjeta_master_descuentos", m_dos, ML),
    list("mtarjeta_master_descuentos", m_cuatro, EC),
    list("mtarjeta_master_descuentos", m_dos, ML),
    list("ccomisiones_otras", m_tres, EC),
    list("mcomisiones_otras", m_tres, EC),
    list("cextraccion_autoservicio", m_202006, EC),
    list("mextraccion_autoservicio", m_202006, EC),
    list("ccheques_depositados", m_202006, EC),
    list("mcheques_depositados", m_202006, EC),
    list("ccheques_emitidos", m_202006, EC),
    list("mcheques_emitidos", m_202006, EC),
    list("ccheques_depositados_rechazados", m_202006, EC),
    list("mcheques_depositados_rechazados", m_202006, EC),
    list("ccheques_emitidos_rechazados", m_202006, EC),
    list("mcheques_emitidos_rechazados", m_202006, EC),
    list("tcallcenter", m_202006, ML),
    list("ccallcenter_transacciones", m_202006, EC),
    list("thomebanking", m_202006, ML),
    list("chomebanking_transacciones", c(201910, 202006), EC)
  )

  for (correccion in correcciones) {
    Corregir_atributo(correccion[[1]], correccion[[2]], correccion[[3]])
  }
}))

# Corrections only overwrite values, so the row count must be unchanged.
track_rows("CATASTROPHE", nrow(dataset), INITIAL_ROWS)
validate_checkpoint("CATASTROPHE_CORRECTIONS", nrow(CA_log) > 0,
                    sprintf("Applied %d catastrophe corrections", nrow(CA_log)))
cleanup_checkpoint("phase2_loaded")
cat("\n✅ CATASTROPHE ANALYSIS COMPLETO\n")

### DR Data Drifting

#### Tabla de Índices IPC

In [None]:
# Phase banner.
cat(paste("\n", strrep("=", 60), "\n"))
cat("DATA DRIFTING (IPC)\n")
cat(paste(strrep("=", 60), "\n"))

# Months covered by the index table: 201901-201912, 202001-202012 and
# 202101-202109 (33 months in total).
vfoto_mes <- c(201900L + 1:12, 202000L + 1:12, 202100L + 1:9)

# One multiplier per month, in the same order as vfoto_mes.
# The entry equal to 1.0 is the 24th value, i.e. month 202012.
vIPC <- c(
  # 2019
  1.9903030878, 1.9174403544, 1.8296186587, 1.7728862972,
  1.7212488323, 1.6776304408, 1.6431248196, 1.5814483345,
  1.4947526791, 1.4484037589, 1.3913580777, 1.3404220402,
  # 2020
  1.3154288912, 1.2921698342, 1.2381595983200178, 1.211656735577568,
  1.1770808941405335, 1.1570338657445522, 1.1388769475653255, 1.1156993751209352,
  1.093638313080772, 1.0657171590878205, 1.0362173587708712, 1.0,
  # 2021 (January to September)
  0.9669867858358365, 0.9323750098728378, 0.8958202912590305, 0.8631993702994263,
  0.8253893405524657, 0.7928918905364516, 0.7666323845128089, 0.7428976357662823,
  0.721615762047849
)

# Lookup table joined against the dataset by foto_mes.
tb_indices <- data.table(foto_mes = vfoto_mes, IPC = vIPC)

#### Aplicar Deflación

In [None]:
# Monetary columns are selected purely by name prefix.
campos_monetarios <- colnames(dataset)
campos_monetarios <- campos_monetarios[campos_monetarios %like% "^(m|Visa_m|Master_m|vm_m)"]

cat(sprintf("Deflating %d monetary columns...\n", length(campos_monetarios)))

# Snapshot of one column for one month, taken before the adjustment so the
# applied factor can be verified afterwards.
sample_before <- dataset[foto_mes == 201901, .(numero_de_cliente, mcuentas_saldo_before = mcuentas_saldo)]

# Update-join: every monetary column is multiplied by the IPC factor of the
# row's month.
dataset[tb_indices, on = "foto_mes", (campos_monetarios) := .SD * i.IPC, .SDcols = campos_monetarios]

sample_after <- dataset[foto_mes == 201901, .(numero_de_cliente, mcuentas_saldo_after = mcuentas_saldo)]
merged <- merge(sample_before, sample_after, by = "numero_de_cliente")
# 0/0 rows produce NaN and are dropped by na.rm = TRUE.
deflation_ratio <- mean(merged$mcuentas_saldo_after / merged$mcuentas_saldo_before, na.rm = TRUE)

# isTRUE() makes an undefined ratio (NaN/NA, e.g. no usable rows in 201901)
# register as a failed checkpoint instead of handing an NA condition to
# validate_checkpoint().
validate_checkpoint("IPC_DEFLATION",
                    isTRUE(abs(deflation_ratio - vIPC[1]) < 0.01),
                    sprintf("IPC deflation applied correctly (ratio=%.4f, expected=%.4f)", deflation_ratio, vIPC[1]))

track_rows("IPC", nrow(dataset), INITIAL_ROWS)
cat("\n✅ DATA DRIFTING COMPLETO\n")

### FE_intra Feature Engineering Intra-Mes

#### Configuración Inicial

In [None]:
# Phase banner.
cat(paste("\n", strrep("=", 60), "\n"))
cat("FEATURE ENGINEERING INTRA-MES\n")
cat(paste(strrep("=", 60), "\n"))

# Column count before this phase, used later to count the features created.
cols_before_fe_intra <- ncol(dataset)

# TRUE when every name in `patributos` is a column of the global `dataset`.
atributos_presentes <- function(patributos) {
  all(unique(patributos) %in% colnames(dataset))
}

#### Variables Básicas

In [None]:
# Month of the year taken from the YYYYMM period.
if (atributos_presentes("foto_mes")) {
  dataset[, kmes := foto_mes %% 100]
}

# ctrx_quarter as a numeric column, scaled up for customers whose
# cliente_antiguedad is 1, 2 or 3.
if (atributos_presentes("ctrx_quarter")) {
  dataset[, ctrx_quarter_normalizado := as.numeric(ctrx_quarter)]

  if (atributos_presentes("cliente_antiguedad")) {
    dataset[cliente_antiguedad == 1, ctrx_quarter_normalizado := ctrx_quarter * 5]
    dataset[cliente_antiguedad == 2, ctrx_quarter_normalizado := ctrx_quarter * 2]
    dataset[cliente_antiguedad == 3, ctrx_quarter_normalizado := ctrx_quarter * 1.2]
  }
}

# Payroll amount relative to the customer's age.
if (atributos_presentes(c("mpayroll", "cliente_edad"))) {
  dataset[, mpayroll_sobre_edad := mpayroll / cliente_edad]
}

#### Consolidación Visa/Master

In [None]:
# Seven combinations of the Master and Visa status codes. The *_10 helpers
# are the statuses with NA replaced by 10L.
if (atributos_presentes(c("Master_status", "Visa_status"))) {
  dataset[, c("vm_status01", "vm_status02", "vm_status03", "vm_status04",
              "vm_status05", "vm_status06", "mv_status07") := {
    master_10 <- fifelse(is.na(Master_status), 10L, Master_status)
    visa_10 <- fifelse(is.na(Visa_status), 10L, Visa_status)
    list(
      pmax(Master_status, Visa_status, na.rm = TRUE),       # max, ignoring NA
      Master_status + Visa_status,                          # plain sum (NA if either is NA)
      pmax(master_10, visa_10),                             # max with NA counted as 10
      master_10 + visa_10,                                  # sum with NA counted as 10
      master_10 + 100L * visa_10,                           # both codes packed into one number
      fifelse(is.na(Visa_status), master_10, Visa_status),  # Visa first, Master as fallback
      fifelse(is.na(Master_status), visa_10, Master_status) # Master first, Visa as fallback
    )
  }]
}

# (Master column, Visa column, combined output column) triples.
vm_pairs <- list(
  c("Master_mfinanciacion_limite", "Visa_mfinanciacion_limite", "vm_mfinanciacion_limite"),
  c("Master_msaldototal", "Visa_msaldototal", "vm_msaldototal"),
  c("Master_msaldopesos", "Visa_msaldopesos", "vm_msaldopesos"),
  c("Master_msaldodolares", "Visa_msaldodolares", "vm_msaldodolares"),
  c("Master_mconsumospesos", "Visa_mconsumospesos", "vm_mconsumospesos"),
  c("Master_mconsumosdolares", "Visa_mconsumosdolares", "vm_mconsumosdolares"),
  c("Master_mlimitecompra", "Visa_mlimitecompra", "vm_mlimitecompra"),
  c("Master_madelantopesos", "Visa_madelantopesos", "vm_madelantopesos"),
  c("Master_madelantodolares", "Visa_madelantodolares", "vm_madelantodolares"),
  c("Master_mpagado", "Visa_mpagado", "vm_mpagado"),
  c("Master_mpagospesos", "Visa_mpagospesos", "vm_mpagospesos"),
  c("Master_mpagosdolares", "Visa_mpagosdolares", "vm_mpagosdolares"),
  c("Master_mconsumototal", "Visa_mconsumototal", "vm_mconsumototal"),
  c("Master_cconsumos", "Visa_cconsumos", "vm_cconsumos"),
  c("Master_cadelantosefectivo", "Visa_cadelantosefectivo", "vm_cadelantosefectivo"),
  c("Master_mpagominimo", "Visa_mpagominimo", "vm_mpagominimo")
)

# Combined value = Master + Visa, tolerating a missing side: when only one
# of the two is NA the other value is used, and the result is NA only when
# both are NA. A plain `+` would return NA for every row with a single
# missing side, and the Master_/Visa_ source columns are removed later in
# this script, so that information could not be recovered. This matches the
# na.rm = TRUE handling of the pmin/pmax consolidations in this section.
for (trio in vm_pairs) {
  if (atributos_presentes(trio[1:2])) {
    dataset[, (trio[3]) := {
      master_val <- get(trio[1])
      visa_val <- get(trio[2])
      ambos_na <- is.na(master_val) & is.na(visa_val)
      # 0L keeps integer columns integer and is promoted for double columns.
      total <- replace(master_val, is.na(master_val), 0L) +
        replace(visa_val, is.na(visa_val), 0L)
      total[ambos_na] <- NA
      total
    }]
  }
}

# Date-like pairs consolidated with the row-wise minimum (NA ignored).
vm_dates_min <- list(
  c("Master_Fvencimiento", "Visa_Fvencimiento", "vm_Fvencimiento"),
  c("Master_Finiciomora", "Visa_Finiciomora", "vm_Finiciomora")
)

for (trio in vm_dates_min) {
  if (!atributos_presentes(trio[1:2])) {
    next
  }
  dataset[, (trio[3]) := pmin(get(trio[1]), get(trio[2]), na.rm = TRUE)]
}

# Date-like pairs consolidated with the row-wise maximum (NA ignored).
vm_dates_max <- list(
  c("Master_fultimo_cierre", "Visa_fultimo_cierre", "vm_fultimo_cierre"),
  c("Master_fechaalta", "Visa_fechaalta", "vm_fechaalta")
)

for (trio in vm_dates_max) {
  if (!atributos_presentes(trio[1:2])) {
    next
  }
  dataset[, (trio[3]) := pmax(get(trio[1]), get(trio[2]), na.rm = TRUE)]
}

#### Ratios de Tarjetas

In [None]:
# Card ratios. Each denominator is offset by 1, which avoids dividing by
# a zero value.

# Total balance over purchase limit.
if (atributos_presentes(c("vm_msaldototal", "vm_mlimitecompra"))) {
  dataset[, vm_utilization := vm_msaldototal / (vm_mlimitecompra + 1)]
}

# Amount paid over minimum payment.
if (atributos_presentes(c("vm_mpagado", "vm_mpagominimo"))) {
  dataset[, vm_payment_ratio := vm_mpagado / (vm_mpagominimo + 1)]
}

# Total consumption over purchase limit.
if (atributos_presentes(c("vm_mconsumototal", "vm_mlimitecompra"))) {
  dataset[, vm_spend_intensity := vm_mconsumototal / (vm_mlimitecompra + 1)]
}

# Cash-advance count over purchase count.
if (atributos_presentes(c("vm_cconsumos", "vm_cadelantosefectivo"))) {
  dataset[, vm_cash_vs_purchase := vm_cadelantosefectivo / (vm_cconsumos + 1)]
}

#### Engagement de Canales

In [None]:
# Sum of the quarterly transaction count and the debit/Visa/Master card
# transaction counts.
if (atributos_presentes(c("ctrx_quarter", "ctarjeta_debito_transacciones",
                          "ctarjeta_visa_transacciones", "ctarjeta_master_transacciones"))) {
  dataset[, fe_digital_engagement := (ctrx_quarter + ctarjeta_debito_transacciones + ctarjeta_visa_transacciones + ctarjeta_master_transacciones)]
}

# Teller (cajas) transactions plus teller enquiries.
if (atributos_presentes(c("ccajas_transacciones", "ccajas_consultas"))) {
  dataset[, fe_branch_dependency := (ccajas_transacciones + ccajas_consultas)]
}

# Home-banking plus call-center transactions.
if (atributos_presentes(c("chomebanking_transacciones", "ccallcenter_transacciones"))) {
  dataset[, fe_remote_service := chomebanking_transacciones + ccallcenter_transacciones]
}

# Ratio of the first feature above to the second (denominator offset by 1).
if (atributos_presentes(c("fe_digital_engagement", "fe_branch_dependency"))) {
  dataset[, fe_channel_shift := fe_digital_engagement / (fe_branch_dependency + 1)]
}

#### Dinámica de Saldos

In [None]:
# Checking-account plus savings-account balance.
if (atributos_presentes(c("mcuenta_corriente", "mcaja_ahorro"))) {
  dataset[, fe_liquid_assets := mcuenta_corriente + mcaja_ahorro]
}

# Fixed-term deposits in dollars plus pesos.
if (atributos_presentes(c("mplazo_fijo_dolares", "mplazo_fijo_pesos"))) {
  dataset[, fe_fixed_deposits := mplazo_fijo_dolares + mplazo_fijo_pesos]
}

# Liquid assets over fixed deposits (denominator offset by 1).
if (atributos_presentes(c("fe_liquid_assets", "fe_fixed_deposits"))) {
  dataset[, fe_liquidity_ratio := fe_liquid_assets / (fe_fixed_deposits + 1)]
}

# Sum of the personal, pledge and mortgage loan amounts.
if (atributos_presentes(c("mprestamos_personales", "mprestamos_prendarios", "mprestamos_hipotecarios"))) {
  dataset[, fe_total_debt := mprestamos_personales + mprestamos_prendarios + mprestamos_hipotecarios]
}

# Total debt over mrentabilidad (denominator offset by 1).
if (atributos_presentes(c("fe_total_debt", "mrentabilidad"))) {
  dataset[, fe_debt_service_ratio := fe_total_debt / (mrentabilidad + 1)]
}

#### Patrones Transaccionales

In [None]:
# Amount per card transaction. The guard lists the two transaction-count
# columns used in the denominator as well; if they were missing from the
# guard, their absence would raise an "object not found" error instead of
# skipping the feature.
if (atributos_presentes(c("mautoservicio", "mtarjeta_visa_consumo", "mtarjeta_master_consumo",
                          "ctarjeta_visa_transacciones", "ctarjeta_master_transacciones"))) {
  dataset[, fe_avg_transaction := (mautoservicio + mtarjeta_visa_consumo + mtarjeta_master_consumo) /
            (ctarjeta_visa_transacciones + ctarjeta_master_transacciones + 1)]
}

# Self-service withdrawals over liquid assets (denominator offset by 1).
if (atributos_presentes(c("mextraccion_autoservicio", "fe_liquid_assets"))) {
  dataset[, fe_cash_drain := mextraccion_autoservicio / (fe_liquid_assets + 1)]
}

# Deposited plus issued cheque amounts.
if (atributos_presentes(c("mcheques_depositados", "mcheques_emitidos"))) {
  dataset[, fe_check_activity := mcheques_depositados + mcheques_emitidos]
}

# Transfers received minus transfers sent.
if (atributos_presentes(c("mtransferencias_recibidas", "mtransferencias_emitidas"))) {
  dataset[, fe_transfer_balance := mtransferencias_recibidas - mtransferencias_emitidas]
}

#### Portafolio de Productos

In [None]:
# Sum of squared shares of accounts, cards and loans over (cproductos + 1).
if (atributos_presentes(c("tcuentas", "ctarjeta_visa", "ctarjeta_master", "cprestamos_personales",
                          "cprestamos_prendarios", "cprestamos_hipotecarios", "cproductos"))) {
  dataset[, fe_product_concentration := {
    denominador <- cproductos + 1
    ((tcuentas / denominador)^2 + ((ctarjeta_visa + ctarjeta_master) / denominador)^2 +
       ((cprestamos_personales + cprestamos_prendarios + cprestamos_hipotecarios) / denominador)^2)
  }]
}

# Insurance product count over (cproductos + 1).
if (atributos_presentes(c("cseguro_vida", "cseguro_auto", "cseguro_vivienda",
                          "cseguro_accidentes_personales", "cproductos"))) {
  dataset[, fe_insurance_penetration := (cseguro_vida + cseguro_auto + cseguro_vivienda + cseguro_accidentes_personales) / (cproductos + 1)]
}

# Investment and fixed-term product count over (cproductos + 1).
if (atributos_presentes(c("cinversion1", "cinversion2", "cplazo_fijo", "cproductos"))) {
  dataset[, fe_investment_sophistication := (cinversion1 + cinversion2 + cplazo_fijo) / (cproductos + 1)]
}

# Number of distinct loan types held (0 to 3).
if (atributos_presentes(c("cprestamos_personales", "cprestamos_prendarios", "cprestamos_hipotecarios"))) {
  dataset[, fe_loan_diversity := as.integer(cprestamos_personales > 0) + as.integer(cprestamos_prendarios > 0) + as.integer(cprestamos_hipotecarios > 0)]
}

# Two features derived from cproductos alone: the distance to 15 products
# (floored at 0) and a flag for customers with at most one product.
if (atributos_presentes("cproductos")) {
  dataset[, fe_crosssell_gap := pmax(0, 15 - cproductos)]
  dataset[, fe_flag_monoproduct := as.integer(cproductos <= 1)]
}

#### Rentabilidad

In [None]:
# mrentabilidad over (cproductos + 1).
if (atributos_presentes(c("mrentabilidad", "cproductos"))) {
  dataset[, fe_profit_per_product := mrentabilidad / (cproductos + 1)]
}

# Twelve times mrentabilidad over (mrentabilidad_annual + 1).
if (atributos_presentes(c("mrentabilidad", "mrentabilidad_annual"))) {
  dataset[, fe_profitability_trend := (mrentabilidad * 12) / (mrentabilidad_annual + 1)]
}

# Sum of the three commission columns over (mrentabilidad + 1).
if (atributos_presentes(c("mcomisiones", "mcomisiones_mantenimiento", "mcomisiones_otras", "mrentabilidad"))) {
  dataset[, fe_fee_burden := (mcomisiones + mcomisiones_mantenimiento + mcomisiones_otras) / (mrentabilidad + 1)]
}

# mrentabilidad_annual times cliente_antiguedad, divided by 12.
if (atributos_presentes(c("mrentabilidad_annual", "cliente_antiguedad"))) {
  dataset[, fe_npv_proxy := (mrentabilidad_annual * cliente_antiguedad) / 12]
}

#### Antigüedad

In [None]:
# Two features derived from cliente_antiguedad alone: the value divided by
# 12, and a flag for values below 6.
if (atributos_presentes("cliente_antiguedad")) {
  dataset[, fe_tenure_years := cliente_antiguedad / 12]
  dataset[, fe_flag_early_lifecycle := as.integer(cliente_antiguedad < 6)]
}

# cliente_vip weighted by log(cliente_antiguedad + 1).
if (atributos_presentes(c("cliente_vip", "cliente_antiguedad"))) {
  dataset[, fe_vip_tenure := cliente_vip * log(cliente_antiguedad + 1)]
}

# Row-wise maximum of the two card fechaalta columns (NA ignored).
if (atributos_presentes(c("Visa_fechaalta", "Master_fechaalta"))) {
  dataset[, fe_card_maturity := pmax(Visa_fechaalta, Master_fechaalta, na.rm = TRUE)]
}

#### Banderas Rojas

In [None]:
# 1 when either card status is one of 6, 7 or 9.
if (atributos_presentes(c("Visa_status", "Master_status"))) {
  dataset[, fe_flag_card_closing := as.integer(Visa_status %in% c(6, 7, 9) | Master_status %in% c(6, 7, 9))]
}

# 1 when the larger of the two delinquency values is positive.
if (atributos_presentes(c("Master_delinquency", "Visa_delinquency"))) {
  dataset[, fe_flag_delinquent := as.integer(pmax(Master_delinquency, Visa_delinquency, na.rm = TRUE) > 0)]
}

# 1 when the smaller of the two Finiciomora values lies strictly between
# 0 and 30.
if (atributos_presentes(c("Visa_Finiciomora", "Master_Finiciomora"))) {
  dataset[, fe_flag_early_delinquency := {
    primera_mora <- pmin(Visa_Finiciomora, Master_Finiciomora, na.rm = TRUE)
    as.integer(primera_mora < 30 & primera_mora > 0)
  }]
}

# Rejected cheque count over total cheque count (denominator offset by 1).
if (atributos_presentes(c("ccheques_depositados_rechazados", "ccheques_emitidos_rechazados",
                          "ccheques_depositados", "ccheques_emitidos"))) {
  dataset[, fe_bounced_check_ratio := (ccheques_depositados_rechazados + ccheques_emitidos_rechazados) / (ccheques_depositados + ccheques_emitidos + 1)]
}

#### Limpieza de Infinitos y NaN

In [None]:
cat("Handling infinite/NaN values...\n")

# Every numeric column is scanned; +/-Inf and NaN entries become NA.
numeric_cols <- names(dataset)[vapply(dataset, is.numeric, logical(1))]
inf_count_total <- 0
nan_count_total <- 0

for (col in numeric_cols) {
  # Both index sets are computed before anything is overwritten.
  inf_idx <- which(is.infinite(dataset[[col]]))
  nan_idx <- which(is.nan(dataset[[col]]))
  
  n_inf <- length(inf_idx)
  if (n_inf > 0) {
    inf_count_total <- inf_count_total + n_inf
    set(dataset, i = inf_idx, j = col, value = NA)
  }
  
  n_nan <- length(nan_idx)
  if (n_nan > 0) {
    nan_count_total <- nan_count_total + n_nan
    set(dataset, i = nan_idx, j = col, value = NA)
  }
}

cat(sprintf("   Replaced %d Inf and %d NaN values with NA\n", inf_count_total, nan_count_total))

# Number of columns added since the start of the intra-month phase.
cols_after_fe_intra <- ncol(dataset)
fe_intra_created <- cols_after_fe_intra - cols_before_fe_intra

validate_checkpoint("FE_INTRA_FEATURES", fe_intra_created >= 30,
                    sprintf("Created %d intra-month features (expected >=30)", fe_intra_created))

track_rows("FE_INTRA", nrow(dataset), INITIAL_ROWS)

cat(sprintf("✅ FE_INTRA complete: %d -> %d columns (+%d)\n",
            cols_before_fe_intra, cols_after_fe_intra, fe_intra_created))

#### Remover Columnas Redundantes

In [None]:
cat("Removing redundant Visa/Master columns...\n")

# Base names of the attributes that were consolidated into vm_* columns.
# Each one expands to a Master_<base> and a Visa_<base> column.
bases_consolidadas <- c(
  "mfinanciacion_limite", "msaldototal", "msaldopesos", "msaldodolares",
  "mconsumospesos", "mconsumosdolares", "mlimitecompra",
  "madelantopesos", "madelantodolares",
  "mpagado", "mpagospesos", "mpagosdolares",
  "mconsumototal", "cconsumos", "cadelantosefectivo", "mpagominimo",
  "Fvencimiento", "Finiciomora", "fultimo_cierre", "fechaalta"
)

# rbind() + as.vector() interleaves the names as Master_x, Visa_x, Master_y, ...
summed_cols <- as.vector(rbind(
  paste0("Master_", bases_consolidadas),
  paste0("Visa_", bases_consolidadas)
))

# Drop only the columns that actually exist in the dataset.
summed_cols <- intersect(summed_cols, names(dataset))
if (length(summed_cols) > 0) {
  dataset[, (summed_cols) := NULL]
}

remaining_vm <- grep("^(Visa_|Master_)", colnames(dataset), value = TRUE)
cat(sprintf("✅ Removed %d consolidated Visa/Master columns\n", length(summed_cols)))
cat(sprintf("   Kept %d Visa/Master columns with independent predictive value\n", length(remaining_vm)))

save_checkpoint(dataset, "phase5_fe_intra")
cat("\n✅ FE INTRA-MES COMPLETO\n")

### FEhist Feature Engineering Histórico

#### Configuración

In [None]:
cat("\n", strrep("=", 60), "\n")
cat("FEATURE ENGINEERING HISTORICO\n")
cat(strrep("=", 60), "\n")

cols_before_fehist <- ncol(dataset)

# Reserve column slots up front so the many set() calls below can add
# columns by reference.
# NOTE(review): data.table documents `n` as the number of *spare* slots, so
# this probably reserves more than the "current + 2000" total the message
# suggests; harmless, but confirm the intent.
current_cols <- ncol(dataset)
target_cols <- current_cols + 2000
alloc.col(dataset, n = target_cols)
cat(sprintf("✅ Pre-allocated column slots: %d -> %d\n", current_cols, target_cols))

# Rows must be contiguous per customer and chronological: the lag, delta and
# rolling-window helpers work on row positions.
setorder(dataset, numero_de_cliente, foto_mes)

cat("Pre-computing customer grouping structure...\n")
g_cliente <- GRP(dataset$numero_de_cliente)
cat(sprintf("✅ %d unique customers\n", g_cliente$N.groups))

months_per_customer <- dataset[, .N, by = numero_de_cliente]$N
# The median of integer counts can be fractional (x.5) and "%d" rejects
# non-integer values with an error, so the median is formatted with "%g".
cat(sprintf("   Months per customer: min=%d, median=%g, max=%d\n",
            min(months_per_customer), median(months_per_customer), max(months_per_customer)))

#### Funciones Temporales

In [None]:
# For every column in `cols` that exists in the global `dataset`, add
# <col>_lag<k> and <col>_delta<k> columns for each k in `lags`. Lags and
# differences are computed within customer via the global `g_cliente`
# grouping and written by reference. Returns NULL invisibly.
create_lags_deltas <- function(cols, lags) {
  presentes <- intersect(cols, colnames(dataset))
  if (length(presentes) == 0L) {
    return(invisible(NULL))
  }
  
  for (nombre in presentes) {
    serie <- dataset[[nombre]]
    for (k in lags) {
      rezago <- flag(serie, n = k, g = g_cliente)
      diferencia <- fdiff(serie, n = k, g = g_cliente)
      
      set(dataset, j = paste0(nombre, "_lag", k), value = rezago)
      set(dataset, j = paste0(nombre, "_delta", k), value = diferencia)
    }
  }
}

# Add window-based history features for each column in `cols` that exists in
# the global `dataset` (modified by reference).
#
# cols:      candidate column names; missing ones are skipped.
# ventana:   window length in rows.
# tendencia: add <col>_tendencia<ventana> = value minus the value `ventana`
#            rows earlier for the same customer.
# minimo, maximo, promedio: add the rolling min / max / mean of the window
#            that ends on the customer's previous row.
# ratioavg, ratiomax: add the current value divided by that rolling mean /
#            max (plus 1e-6).
#
# Assumes `dataset` is sorted so each customer's rows are contiguous and
# chronological, and that the global `g_cliente` was built on that order.
# Returns NULL invisibly.
TendenciaYmuchomas <- function(cols, ventana, tendencia = TRUE,
                               minimo = FALSE, maximo = FALSE, promedio = FALSE,
                               ratioavg = FALSE, ratiomax = FALSE) {
  cols <- intersect(cols, colnames(dataset))
  if (length(cols) == 0L) return(invisible(NULL))
  
  need_promedio <- promedio || ratioavg
  need_maximo <- maximo || ratiomax
  
  # min()/max() return +/-Inf when the window holds only NAs; map that to NA.
  safe_min <- function(x) {
    result <- suppressWarnings(min(x, na.rm = TRUE))
    fifelse(is.infinite(result), NA_real_, result)
  }
  
  safe_max <- function(x) {
    result <- suppressWarnings(max(x, na.rm = TRUE))
    fifelse(is.infinite(result), NA_real_, result)
  }
  
  # frollmean()/frollapply() slide over the whole column and know nothing
  # about customers, so a window ending on one of the first (ventana - 1) rows
  # of a customer also covers rows of the previous customer. Those rows are
  # flagged here and their rolling statistic is blanked before it is used.
  ventana_incompleta <- data.table::rowid(g_cliente$group.id) < ventana
  
  # Blank cross-customer windows, then shift by one row within customer so
  # the statistic describes the window ending on the previous row.
  previo_en_cliente <- function(valores) {
    valores[ventana_incompleta] <- NA
    flag(valores, n = 1, g = g_cliente)
  }
  
  for (col in cols) {
    x <- dataset[[col]]
    
    if (tendencia) {
      x_lag <- flag(x, n = ventana, g = g_cliente)
      set(dataset, j = paste0(col, "_tendencia", ventana), value = x - x_lag)
    }
    
    if (need_promedio) {
      prom <- previo_en_cliente(frollmean(x, n = ventana, align = "right", na.rm = TRUE))
      
      if (promedio) set(dataset, j = paste0(col, "_promedio", ventana), value = prom)
      if (ratioavg) {
        ratio <- x / (prom + 1e-6)
        set(dataset, j = paste0(col, "_ratioavg", ventana), value = ratio)
      }
    }
    
    if (minimo) {
      min_val <- previo_en_cliente(frollapply(x, n = ventana, FUN = safe_min, align = "right"))
      set(dataset, j = paste0(col, "_minimo", ventana), value = min_val)
    }
    
    if (need_maximo) {
      max_val <- previo_en_cliente(frollapply(x, n = ventana, FUN = safe_max, align = "right"))
      
      if (maximo) set(dataset, j = paste0(col, "_maximo", ventana), value = max_val)
      if (ratiomax) {
        ratio <- x / (max_val + 1e-6)
        set(dataset, j = paste0(col, "_ratiomax", ventana), value = ratio)
      }
    }
  }
}

#### Prioridad 1: Variables Criticas

In [None]:
cat("\nPRIORIDAD 1: Variables temporales criticas...\n")

# Column families handled in this priority tier.
cols_critical <- c("ctrx_quarter", "mcuentas_saldo", "mcaja_ahorro",
                   "mpayroll", "mrentabilidad", "fe_digital_engagement",
                   "fe_liquid_assets", "vm_utilization", "vm_msaldototal")
cols_channel <- c("fe_branch_dependency", "fe_remote_service", "fe_channel_shift",
                  "chomebanking_transacciones", "ccallcenter_transacciones")
cols_debt <- c("fe_total_debt", "fe_debt_service_ratio", "mprestamos_personales",
               "mprestamos_prendarios", "mprestamos_hipotecarios")
cols_card <- c("vm_payment_ratio", "vm_spend_intensity", "vm_cash_vs_purchase",
               "mtarjeta_visa_consumo", "mtarjeta_master_consumo",
               "ctarjeta_visa_transacciones", "ctarjeta_master_transacciones")

# One entry per family: the lags to create and the rolling-window options.
p1_plan <- list(
  list(cols = cols_critical, lags = c(1, 2, 3, 6),
       stats = list(ventana = 6, tendencia = TRUE, promedio = TRUE,
                    maximo = TRUE, ratiomax = TRUE)),
  list(cols = cols_channel, lags = c(1, 3, 6),
       stats = list(ventana = 6, tendencia = TRUE, promedio = TRUE)),
  list(cols = cols_debt, lags = c(1, 3, 6),
       stats = list(ventana = 6, tendencia = TRUE, minimo = TRUE, maximo = TRUE)),
  list(cols = cols_card, lags = c(1, 3, 6),
       stats = list(ventana = 6, tendencia = TRUE, promedio = TRUE, maximo = TRUE))
)

# Same call order as before: lags/deltas first, then window statistics.
for (step in p1_plan) {
  create_lags_deltas(step$cols, step$lags)
  do.call(TendenciaYmuchomas, c(list(step$cols), step$stats))
}

cat("✅ Prioridad 1 completa\n")

#### Prioridad 2: Variables de Valor Medio

In [None]:
cat("\nPRIORIDAD 2: Variables temporales de valor medio...\n")

# Column families handled in this priority tier.
cols_checks <- c("ccheques_depositados", "ccheques_emitidos",
                 "mcheques_depositados", "mcheques_emitidos", "fe_check_activity")
cols_transfers <- c("ctransferencias_recibidas", "ctransferencias_emitidas",
                    "mtransferencias_recibidas", "mtransferencias_emitidas", "fe_transfer_balance")
cols_savings <- c("mplazo_fijo_dolares", "mplazo_fijo_pesos", "cplazo_fijo",
                  "minversion1_pesos", "minversion1_dolares", "cinversion1",
                  "minversion2", "cinversion2", "mplazo_fijo_dolares_transf",
                  "mplazo_fijo_pesos_transf", "fe_investment_sophistication")
cols_profitability <- c("fe_profit_per_product", "fe_profitability_trend", "fe_npv_proxy")

# One entry per family: the lags to create and the rolling-window options.
p2_plan <- list(
  list(cols = cols_checks, lags = c(1, 3),
       stats = list(ventana = 6, tendencia = TRUE)),
  list(cols = cols_transfers, lags = c(1, 3, 6),
       stats = list(ventana = 9, tendencia = TRUE, promedio = TRUE)),
  list(cols = cols_savings, lags = c(1, 3, 6),
       stats = list(ventana = 9, tendencia = TRUE, maximo = TRUE, promedio = TRUE)),
  list(cols = cols_profitability, lags = c(1, 3, 6),
       stats = list(ventana = 9, tendencia = TRUE, minimo = TRUE,
                    maximo = TRUE, promedio = TRUE))
)

# Same call order as before: lags/deltas first, then window statistics.
for (step in p2_plan) {
  create_lags_deltas(step$cols, step$lags)
  do.call(TendenciaYmuchomas, c(list(step$cols), step$stats))
}

cat("✅ Prioridad 2 completa\n")

#### Prioridad 3: Variables de Baja Prioridad

In [None]:
cat("\nPRIORIDAD 3: Variables temporales de baja prioridad...\n")

# Column families handled in this priority tier.
cols_products <- c("cproductos", "tcuentas", "ccuenta_corriente", "mcuenta_corriente",
                   "ctarjeta_visa", "ctarjeta_master", "ctarjeta_visa_gold",
                   "ctarjeta_master_gold", "cprestamos_personales", "cprestamos_prendarios",
                   "cprestamos_hipotecarios", "cseguro_vida", "cseguro_auto",
                   "cseguro_vivienda", "cseguro_accidentes_personales",
                   "fe_product_concentration", "fe_crosssell_gap")
cols_demographics <- c("cliente_edad", "cliente_antiguedad", "fe_tenure_years",
                       "fe_vip_tenure", "mpayroll_sobre_edad")

# One entry per family: the lags to create and the rolling-window options.
p3_plan <- list(
  list(cols = cols_products, lags = c(1, 6),
       stats = list(ventana = 9, tendencia = TRUE, promedio = TRUE)),
  list(cols = cols_demographics, lags = c(6, 12),
       stats = list(ventana = 12, tendencia = TRUE))
)

# Same call order as before: lags/deltas first, then window statistics.
for (step in p3_plan) {
  create_lags_deltas(step$cols, step$lags)
  do.call(TendenciaYmuchomas, c(list(step$cols), step$stats))
}

cat("✅ Prioridad 3 completa\n")

#### Limpieza Final

In [None]:
cat("Limpieza final de valores infinitos...\n")

# Replace +/-Inf with NA in every numeric column, touching only affected rows.
is_numeric_col <- vapply(dataset, is.numeric, logical(1))
numeric_cols <- names(dataset)[is_numeric_col]
for (col in numeric_cols) {
  bad_rows <- which(is.infinite(dataset[[col]]))
  if (length(bad_rows) > 0) {
    set(dataset, i = bad_rows, j = col, value = NA)
  }
}

gc(verbose = FALSE)

# Number of columns added by the historical feature engineering phase.
cols_after_fehist <- ncol(dataset)
fehist_created <- cols_after_fehist - cols_before_fehist

validate_checkpoint(
  "FE_HIST_FEATURES",
  fehist_created >= 100,
  sprintf("Created %d temporal features (expected >=100)", fehist_created)
)

track_rows("FE_HIST", nrow(dataset), INITIAL_ROWS)

cat(sprintf(
  "✅ FEhist complete: %d -> %d columns (+%d temporal features)\n",
  cols_before_fehist, cols_after_fehist, fehist_created
))

# Drop the previous phase checkpoint before persisting this one.
cleanup_checkpoint("phase5_fe_intra")
save_checkpoint(dataset, "phase6_fe_hist")
cat("\n✅ FE HISTORICO COMPLETO\n")

### Validacion de Features

In [None]:
banner_line <- strrep("=", 60)
cat("\n", banner_line, "\n")
cat("VALIDACION DE FEATURES\n")
cat(banner_line, "\n")

# Regular expressions identifying each family of engineered columns.
# The temporal families share the shape "_<suffix><digits>" at the end of the
# name; the intra-month family is identified by its "fe_" prefix.
temporal_suffixes <- c("lag", "delta", "tendencia", "promedio",
                       "minimo", "maximo", "ratioavg", "ratiomax")
feature_patterns <- c(
  as.list(setNames(sprintf("_%s[0-9]+$", temporal_suffixes), temporal_suffixes)),
  list(fe_intra = "^fe_")
)

# Counts, for each named regular expression in `patterns`, how many column
# names of `dataset` match it.
#
# Arguments:
#   dataset   any object with names() (e.g. a data.frame or data.table).
#   patterns  named list of regular expressions.
#
# Returns a named list of integer counts, one entry per pattern name.
count_features_by_pattern <- function(dataset, patterns) {
  col_names <- names(dataset)
  pattern_ids <- names(patterns)
  lapply(
    setNames(pattern_ids, pattern_ids),
    function(id) length(grep(patterns[[id]], col_names, value = TRUE))
  )
}

feature_counts <- count_features_by_pattern(dataset, feature_patterns)

# Print one line per feature family.
cat("\n📊 Desglose por Tipo de Feature:\n")
cat(
  sprintf("   %s: %d\n", names(feature_counts), unlist(feature_counts)),
  sep = ""
)

n_lag <- feature_counts$lag
n_delta <- feature_counts$delta

validate_checkpoint(
  "LAG_FEATURES",
  n_lag >= 50,
  sprintf("Lag features: %d (expected >=50)", n_lag)
)

validate_checkpoint(
  "DELTA_FEATURES",
  n_delta >= 50,
  sprintf("Delta features: %d (expected >=50)", n_delta)
)

track_rows("VALIDATION", nrow(dataset), INITIAL_ROWS)

cat(sprintf("\n📊 Dataset Final: %d rows x %d columns\n", nrow(dataset), ncol(dataset)))
cat("\n✅ VALIDACION COMPLETA\n")

### FE_rf Feature Engineering Random Forest

#### Configuracion

In [None]:
rffe_banner <- strrep("=", 60)
cat("\n", rffe_banner, "\n")
cat("RFFE (DESCUBRIMIENTO DE ESTRUCTURA)\n")
cat(rffe_banner, "\n")

set.seed(PARAM$semilla)

# Store the target as a factor with a fixed level order.
set(
  dataset,
  j = "clase_ternaria",
  value = factor(dataset[["clase_ternaria"]],
                 levels = c("CONTINUA", "BAJA+1", "BAJA+2"))
)

initial_feature_cols <- get_feature_cols(dataset)
cat(sprintf("Feature columns for RFFE: %d\n", length(initial_feature_cols)))

# Rewrite every numeric column with +/-Inf replaced by NA.
cat("Checking for Infinite values...\n")
numeric_cols <- names(dataset)[vapply(dataset, is.numeric, logical(1))]
for (col in numeric_cols) {
  col_values <- dataset[[col]]
  set(dataset, j = col, value = replace(col_values, is.infinite(col_values), NA))
}

#### Split RFFE

In [None]:
PARAM$FE_rf <- list()
PARAM$FE_rf$rf_train_months <- 201901:202012

# Label each row: months inside the RFFE training range train the forest,
# every other month only receives the generated features.
dataset[, rf_split := fifelse(
  foto_mes %in% PARAM$FE_rf$rf_train_months,
  "rf_train",
  "feature_apply"
)]

n_rf_train <- dataset[rf_split == "rf_train", .N]
n_rf_apply <- dataset[rf_split == "feature_apply", .N]
cat(sprintf("   - RFFE train split: %d rows\n", n_rf_train))
cat(sprintf("   - RFFE apply split: %d rows\n", n_rf_apply))

# The two partitions must account for every row.
validate_checkpoint(
  "RFFE_SPLIT_SUM",
  n_rf_train + n_rf_apply == INITIAL_ROWS,
  sprintf("RFFE split sums to total: %d + %d = %d", n_rf_train, n_rf_apply, INITIAL_ROWS)
)

dataset_rf <- dataset[rf_split == "rf_train"]

#### Pesos de Clase y Calibracion

In [None]:
# Inverse-frequency class weights, rescaled so that their mean is 1.
counts <- dataset_rf[, .N, by = clase_ternaria]
tot <- sum(counts$N)
w_cont  <- 1 / (counts[clase_ternaria == "CONTINUA", N] / tot)
w_baja1 <- 1 / (counts[clase_ternaria == "BAJA+1", N] / tot)
w_baja2 <- 1 / (counts[clase_ternaria == "BAJA+2", N] / tot)
classwt <- c("CONTINUA" = w_cont, "BAJA+1" = w_baja1, "BAJA+2" = w_baja2)
classwt <- classwt / mean(classwt)

# Node size derived from the BAJA+2 prevalence, kept within [10, 100].
p_baja2 <- dataset_rf[clase_ternaria == "BAJA+2", .N] / nrow(dataset_rf)
req_lift <- 1 / sqrt(p_baja2)
opt_nodesize <- floor(1 / (p_baja2 * req_lift))
if (opt_nodesize < 10L) {
  opt_nodesize <- 10L
}
if (opt_nodesize > 100L) {
  opt_nodesize <- 100L
}

# Tree count derived from the feature count, rounded to a multiple of 20 and
# kept within [200, 500].
n_feats <- length(initial_feature_cols)
mtry_val <- floor(sqrt(n_feats))
raw_optimal_trees <- 10 * (n_feats / mtry_val)
opt_trees <- 20 * round(raw_optimal_trees / 20)
if (opt_trees < 200L) {
  opt_trees <- 200L
}
if (opt_trees > 500L) {
  opt_trees <- 500L
}

cat("\n📊 Calibracion RFFE:\n")
cat(sprintf("   - Prevalencia BAJA+2: %.4f%%\n", p_baja2 * 100))
cat(sprintf("   - Nodesize Optimo: %d (Lift %.1fx)\n", opt_nodesize, req_lift))
cat(sprintf("   - Arboles Optimos: %d (Coverage %.1f)\n", opt_trees, raw_optimal_trees))

#### Entrenar Ranger

In [None]:
PARAM$FE_rf$ntree_selected <- as.integer(opt_trees)
PARAM$FE_rf$nodesize <- as.integer(opt_nodesize)

# Feature frame and target for the forest; missing values are filled with
# randomForest::na.roughfix before training.
X_rf <- as.data.frame(dataset_rf[, initial_feature_cols, with = FALSE])
y_rf <- dataset_rf[[COLS$target]]
X_imp <- randomForest::na.roughfix(X_rf)

cat("Training Ranger Model...\n")
model_rf <- ranger(
  x = X_imp,
  y = y_rf,
  num.trees = PARAM$FE_rf$ntree_selected,
  mtry = mtry_val,
  min.node.size = PARAM$FE_rf$nodesize,
  replace = FALSE,
  sample.fraction = 0.7,
  importance = "impurity",
  class.weights = classwt,
  num.threads = ML_THREADS,
  write.forest = TRUE
)

#### Generar Features

In [None]:
# Builds the RFFE leaf-index features for a set of rows.
#
# Arguments:
#   data_subset  data.table holding the rows to transform.
#   modelo_rf    fitted ranger model.
#   cols         character vector with the feature columns the model uses.
#
# Returns a data.table with one integer column per tree, named
# RFFE_tr001, RFFE_tr002, ..., holding the terminal-node id of each row.
generate_rf_features <- function(data_subset, modelo_rf, cols) {
  features <- as.data.frame(data_subset[, cols, with = FALSE])
  features <- randomForest::na.roughfix(features)

  prediction <- predict(modelo_rf, data = features, type = "terminalNodes")
  leaf_dt <- data.table(prediction$predictions)
  setnames(leaf_dt, sprintf("RFFE_tr%03d", seq_len(ncol(leaf_dt))))

  # Store the node ids as integers.
  tree_cols <- names(leaf_dt)
  leaf_dt[, (tree_cols) := lapply(.SD, as.integer), .SDcols = tree_cols]
  return(leaf_dt)
}

# Apply the trained forest to both partitions and stitch them back together.
# The "feature_apply" rows are copied out first because `dataset` is then
# overwritten with only the "rf_train" rows.
cat("Generating Features (Future)...\n")
dt_apply <- dataset[rf_split == "feature_apply"]
feat_apply <- generate_rf_features(dt_apply, model_rf, initial_feature_cols)
dt_apply <- cbind(dt_apply, feat_apply)

# From here on `dataset` holds only the RFFE training rows until the rbind.
cat("Generating Features (History)...\n")
dataset <- dataset[rf_split == "rf_train"]
feat_train <- generate_rf_features(dataset, model_rf, initial_feature_cols)
dataset <- cbind(dataset, feat_train)

# Re-join both partitions and restore the client/month ordering.
cat("Recombining Dataset...\n")
dataset <- rbind(dataset, dt_apply, fill=TRUE)
setorder(dataset, numero_de_cliente, foto_mes)

#### Validacion

In [None]:
# The split/recombine step must not have added or lost rows.
rows_after_rffe <- nrow(dataset)
validate_checkpoint(
  "RFFE_ROW_INTEGRITY",
  rows_after_rffe == INITIAL_ROWS,
  sprintf("Row count after RFFE recombine: %d (expected %d)", rows_after_rffe, INITIAL_ROWS)
)

track_rows("RFFE", nrow(dataset), INITIAL_ROWS)

# The test month must still have its original number of rows.
test_month_rows_after_rffe <- dataset[foto_mes == TEST_MONTH, .N]
validate_checkpoint(
  "RFFE_TEST_MONTH_INTACT",
  test_month_rows_after_rffe == TEST_MONTH_ROWS,
  sprintf("Test month %d rows after RFFE: %d (expected %d)",
          TEST_MONTH, test_month_rows_after_rffe, TEST_MONTH_ROWS)
)

cat("🧹 Clearing RFFE Objects from RAM...\n")
rm(list = c("X_rf", "X_imp", "dataset_rf", "model_rf",
            "dt_apply", "feat_apply", "feat_train"))
gc(full = TRUE)

# Replace the previous phase checkpoint with this one.
cleanup_checkpoint("phase6_fe_hist")
save_checkpoint(dataset, "phase8_rffe")
cat("\n✅ RFFE COMPLETO\n")

### CN Canaritos Asesinos (Seleccion de Variables)

#### Crear Variables Canario

In [None]:
cat("\n", strrep("=", 60), "\n")
cat("CANARITOS ASESINOS\n")
cat(strrep("=", 60), "\n")

all_cols <- get_feature_cols(dataset)
cat(sprintf("Feature columns for Canaritos: %d\n", length(all_cols)))

PARAM$canaritos <- list()
PARAM$canaritos$ratio <- 0.03

# Number of canaries is a fraction of the feature count, with at least one.
# With few features the fraction truncates to 0, and `1:0` would then create
# the two bogus columns "canary_1" and "canary_0".
n_canaries <- max(1L, as.integer(length(all_cols) * PARAM$canaritos$ratio))
canary_names <- paste0("canary_", seq_len(n_canaries))

# Each canary is an independent uniform random column added by reference.
for (canary in canary_names) {
  set(dataset, j = canary, value = runif(nrow(dataset)))
}
all_cols <- c(all_cols, canary_names)

#### Muestreo EPV

In [None]:
# Sample size from an events-per-variable rule: 10 rows per feature divided by
# the BAJA+2 prevalence, capped at 3,000,000 rows.
prev_all <- dataset[clase_ternaria == "BAJA+2", .N] / nrow(dataset)
opt_N <- ceiling((10 * length(all_cols)) / prev_all)
final_N <- min(opt_N, 3000000L)

# Integer target 0/1/2. The levels are stated explicitly so the encoding does
# not depend on `clase_ternaria` already being a factor with this level order
# (for example after resuming from a checkpoint); values outside the three
# levels become NA and are excluded below.
dataset[, target_int := as.integer(
  factor(clase_ternaria, levels = c("CONTINUA", "BAJA+1", "BAJA+2"))
) - 1L]
rows_valid <- which(!is.na(dataset$target_int))
rows_train <- if (length(rows_valid) > final_N) {
  sample(rows_valid, final_N)
} else {
  rows_valid
}

# LightGBM leaf size is tied to the RFFE node size, with a floor of 20.
lgb_min_data <- as.integer(opt_nodesize * 2)
if (lgb_min_data < 20L) lgb_min_data <- 20L

cat(sprintf("   - Syncing Resolution: RFFE Nodesize %d -> LGB Min Data %d\n", opt_nodesize, lgb_min_data))

#### Loop de Importancia LightGBM

In [None]:
# Training set for the canary experiment: sampled rows, real features plus
# canaries, integer target.
dtrain_canaritos <- lgb.Dataset(
  data = data.matrix(dataset[rows_train, all_cols, with = FALSE]),
  label = dataset$target_int[rows_train]
)

lgb_params <- list(
  objective = "multiclass",
  num_class = 3L,
  metric = "multi_logloss",
  learning_rate = 0.05,
  feature_fraction = 0.50,
  num_leaves = 128L,
  min_data_in_leaf = lgb_min_data,
  verbosity = -1L
)

# Train 30 models that differ only in the feature-sampling seed and keep the
# per-feature gain table of each one.
imp_list <- vector("list", 30L)
cat("Running 30 Rounds...\n")
for (i in seq_len(30L)) {
  round_overrides <- list(
    nthread = ML_THREADS,
    feature_fraction_seed = PARAM$semilla + i
  )
  p <- c(lgb_params, round_overrides)
  m <- lgb.train(params = p, data = dtrain_canaritos, nrounds = 100L, verbose = -1)
  imp_list[[i]] <- lgb.importance(m, percentage = TRUE)[, .(Feature, Gain)]
  if (i %% 10 == 0) {
    cat(sprintf("   Round %d/30 complete\n", i))
  }
  if (i %% 5 == 0) {
    gc()
  }
}

# Release the training set and the temporary target column.
rm(dtrain_canaritos)
dataset[, target_int := NULL]
gc(full = TRUE)

track_rows("CANARITOS", nrow(dataset), INITIAL_ROWS)
cat("\n✅ CANARITOS COMPLETO\n")

### Seleccion Pareto

#### Agregar Importancia

In [None]:
cat("\n", strrep("=", 60), "\n")
cat("SELECCION DE FEATURES (PARETO)\n")
cat(strrep("=", 60), "\n")

# Average gain per feature across ALL rounds. A feature that a model never
# split on is absent from that model's importance table, so dividing the
# summed gain by the number of rounds counts those rounds as zero gain.
# `mean(Gain)` would average only over the rounds where the feature appeared
# and overstate features (and canaries) that were used rarely.
n_rounds <- length(imp_list)
avg_imp <- rbindlist(imp_list)[, .(Gain = sum(Gain) / n_rounds), by = Feature]

# Canaries that were never used in any round have zero gain; add them so the
# canary gain distribution used for the threshold is not biased upwards.
unused_canaries <- setdiff(canary_names, avg_imp$Feature)
if (length(unused_canaries) > 0L) {
  avg_imp <- rbind(avg_imp, data.table(Feature = unused_canaries, Gain = 0))
}
setorder(avg_imp, -Gain)

canaries <- avg_imp[Feature %in% canary_names]
real     <- avg_imp[!(Feature %in% canary_names)]

# Share of the total real-feature gain and its running total, in descending
# gain order.
real[, gain_share := Gain / sum(Gain)]
real[, cum_gain := cumsum(gain_share)]

#### Umbral Dinamico

In [None]:
# Target range for the number of selected features: at least the count that
# covers 80% of cumulative gain (minimum 20), at most the count that covers
# 95% (maximum 200). `na.rm = TRUE` keeps the bounds usable when no row
# reaches a coverage level and `which(...)[1]` is NA.
idx_80 <- which(real$cum_gain >= 0.80)[1]
idx_95 <- which(real$cum_gain >= 0.95)[1]
TARGET_MIN <- max(20L, idx_80, na.rm = TRUE)
TARGET_MAX <- min(200L, idx_95, na.rm = TRUE)

# Lower the canary-gain percentile used as threshold from 0.95 to 0.50 in
# steps of 0.025 until enough features beat it. The grid is generated with
# seq() rather than by repeated subtraction, which can drift just below 0.50
# and skip the last step.
curr_pctl <- 0.95
final_feats <- character(0)

for (pctl in seq(0.95, 0.50, by = -0.025)) {
  curr_pctl <- pctl
  thresh <- quantile(canaries$Gain, curr_pctl, na.rm = TRUE)
  final_feats <- real[Gain > thresh]$Feature
  if (length(final_feats) >= TARGET_MIN) break
}

# `real` is sorted by descending gain, so truncation keeps the strongest
# features. The cap is applied whether or not the loop reached TARGET_MIN.
final_feats <- head(final_feats, TARGET_MAX)

validate_checkpoint("CANARITOS_SELECTION", length(final_feats) > 0,
                    sprintf("Canaritos selected %d features", length(final_feats)))

final_feats <- validate_feature_cols(final_feats, "Canaritos")

cat(sprintf("\n✅ SELECCION FINAL: %d Features.\n", length(final_feats)))

#### Limpieza

In [None]:
# Remove the canary columns and persist the selected feature names.
set(dataset, j = canary_names, value = NULL)
feature_list_path <- paste0("feature_list_exp", PARAM$experimento, ".txt")
writeLines(final_feats, feature_list_path)

track_rows("SELECTION", nrow(dataset), INITIAL_ROWS)

# Replace the previous phase checkpoint with this one.
cleanup_checkpoint("phase8_rffe")
save_checkpoint(dataset, "phase10_selected")
cat("\n✅ SELECCION PARETO COMPLETA\n")

## Modelado

### Training Strategy

In [None]:
cat("\n", strrep("=", 60), "\n")
cat("OPTIMIZACION BAYESIANA\n")
cat(strrep("=", 60), "\n")

# Null-coalescing helper: returns `y` when `x` is NULL.
`%||%` <- function(x, y) if (is.null(x)) y else x

if (!exists("ML_THREADS")) ML_THREADS <- max(1L, parallel::detectCores() - 1L)
if (!exists("PARAM")) PARAM <- list()

# Fail early: `final_feats` is needed by the checkpoint save right below, so
# the guard must run before its first use.
if (!exists("final_feats")) stop("final_feats not found. Phase 10 must run before Phase 11.")

cat("\n[CHECKPOINT 1] Saving Phase 10 dataset state...\n")
save_checkpoint(list(dataset = dataset, final_feats = final_feats), "phase10_selected")
cat("✅ Phase 10 checkpoint saved\n")

# Month ranges used for BO training, validation and final scoring.
PARAM$train <- list(
  training   = 201901:202104,
  validation = 202105:202107,
  testing    = 202109
)

validate_checkpoint("MONTH_VALIDATION_EXISTS",
                    dataset[foto_mes %in% PARAM$train$validation, .N] > 0,
                    sprintf("Validation months %s found", paste(PARAM$train$validation, collapse = ",")))

validate_checkpoint("MONTH_TESTING_EXISTS",
                    dataset[foto_mes %in% PARAM$train$testing, .N] > 0,
                    sprintf("Testing month %d found", PARAM$train$testing))

campos_buenos <- final_feats
cat(sprintf("\nFeatures selected: %d\n", length(campos_buenos)))

# Integer target: CONTINUA = 0, BAJA+1 = 1, BAJA+2 = 2 (NA for anything else).
dataset[, target_label := as.integer(factor(clase_ternaria, levels = c("CONTINUA", "BAJA+1", "BAJA+2"))) - 1L]
num_classes <- 3L

# Per-row training weight looked up by class label.
dataset[, clase_peso := 1.0]
wvec <- c("0" = 1.0, "1" = 1.5, "2" = 4.0)
dataset[, clase_peso := clase_peso * wvec[as.character(target_label)]]

validate_checkpoint("TARGET_CREATED", "target_label" %in% names(dataset),
                    "Target variable created for multiclass classification")

PARAM$TRAINING_PCT_BO <- 0.10

PARAM$strategy <- list(
  training     = PARAM$train$training,
  training_pct = PARAM$TRAINING_PCT_BO,
  positives    = c("BAJA+1", "BAJA+2")
)

# Random number per row used to undersample the non-positive class.
set.seed(PARAM$semilla)
set(dataset, j = "azar_bo", value = runif(nrow(dataset)))

# A training-month row is kept when it is a positive class or when its random
# draw falls under the sampling fraction.
dataset[, fold_train_bo := {
  in_training_months <- foto_mes %in% PARAM$strategy$training
  is_positive <- clase_ternaria %in% PARAM$strategy$positives
  is_sampled <- azar_bo < PARAM$strategy$training_pct
  in_training_months & (is_positive | is_sampled)
}]

train_idx_bo <- dataset$fold_train_bo
valid_idx <- dataset$foto_mes %in% PARAM$train$validation

cat(sprintf("BO training rows: %d (10%% CONTINUA)\n", sum(train_idx_bo)))
cat(sprintf("Validation rows: %d\n", sum(valid_idx)))

n_valid_baja2 <- sum(dataset[valid_idx, clase_ternaria == "BAJA+2"])
validate_checkpoint(
  "VALIDATION_HAS_BAJA2",
  n_valid_baja2 > 0,
  sprintf("Validation BAJA+2 count: %d", n_valid_baja2)
)

dtrain_bo <- lightgbm::lgb.Dataset(
  data   = data.matrix(dataset[train_idx_bo, campos_buenos, with = FALSE]),
  label  = dataset[train_idx_bo, target_label],
  weight = dataset[train_idx_bo, clase_peso],
  params = list(feature_pre_filter = FALSE)
)

# The validation matrix is built once and used both for the LightGBM
# validation set and for the profit evaluation.
X_val <- data.matrix(dataset[valid_idx, campos_buenos, with = FALSE])
y_val <- dataset[valid_idx, clase_ternaria]

dvalid <- lightgbm::lgb.Dataset(
  data   = X_val,
  label  = dataset[valid_idx, target_label],
  weight = dataset[valid_idx, clase_peso],
  params = list(feature_pre_filter = FALSE)
)

### Hyperparameter Tuning

In [None]:
# Profit parameters: gain per correctly targeted BAJA+2, cost per wrongly
# targeted client, and the probability cut-off used when scoring.
G         <- 117000L
C         <- 3000L
THRESHOLD <- 0.025

cat(sprintf("\nProfit parameters: G=$%s, C=$%s, Threshold=%.3f\n",
            format(G, big.mark=","), format(C, big.mark=","), THRESHOLD))

# Parameters shared by the BO models and the final ensemble.
# `bagging_freq` must be non-zero for LightGBM to apply `bagging_fraction`;
# without it the tuned `bagging_fraction` below has no effect.
PARAM$lgb_basicos <- list(
  objective          = "multiclass",
  metric             = "multi_logloss",
  num_class          = num_classes,
  num_threads        = ML_THREADS,
  verbose            = -1,
  feature_pre_filter = FALSE,
  bagging_freq       = 1L
)

# Parameters fixed during the Bayesian optimisation only.
PARAM$lgb_fijos <- list(
  num_iterations        = 200L,
  early_stopping_rounds = 25L
)

# Search space explored by the Bayesian optimisation.
param_tune <- ParamHelpers::makeParamSet(
  ParamHelpers::makeNumericParam("learning_rate",    lower = 0.01, upper = 0.10),
  ParamHelpers::makeIntegerParam("num_leaves",       lower = 32,   upper = 512),
  ParamHelpers::makeIntegerParam("min_data_in_leaf", lower = 10,   upper = 200),
  ParamHelpers::makeNumericParam("feature_fraction", lower = 0.6,  upper = 1.0),
  ParamHelpers::makeNumericParam("bagging_fraction", lower = 0.6,  upper = 1.0),
  ParamHelpers::makeNumericParam("lambda_l1",        lower = 0.0,  upper = 10.0),
  ParamHelpers::makeNumericParam("lambda_l2",        lower = 0.0,  upper = 20.0)
)

# Objective function for the Bayesian optimisation.
#
# Trains a LightGBM model on `dtrain_bo` with the hyperparameters in `x`
# (merged with PARAM$lgb_basicos and PARAM$lgb_fijos), scores the validation
# matrix `X_val`, ranks the rows by predicted BAJA+2 probability and computes
# the cumulative profit G * true positives - C * false positives at every
# cut-off.
#
# Arguments:
#   x  hyperparameter values proposed by the optimiser (coerced with as.list).
#
# Returns the negative of the best cumulative profit, because the optimiser
# is configured to minimise.
profit_eval <- function(x) {
  param_all <- c(PARAM$lgb_basicos, PARAM$lgb_fijos, as.list(x))

  # The optimiser may hand integer parameters over as doubles.
  if (!is.null(param_all$num_leaves)) param_all$num_leaves <- as.integer(round(param_all$num_leaves))
  if (!is.null(param_all$min_data_in_leaf)) param_all$min_data_in_leaf <- as.integer(round(param_all$min_data_in_leaf))

  modelo <- lightgbm::lgb.train(
    data    = dtrain_bo,
    valids  = list(valid = dvalid),
    params  = param_all,
    nrounds = PARAM$lgb_fijos$num_iterations,
    verbose = -1
  )

  # Depending on the lightgbm version, multiclass predict() returns either a
  # rows-by-classes matrix or a flat vector with the classes of each row
  # stored consecutively. Only the flat form may be reshaped: reshaping an
  # existing matrix with byrow = TRUE would mix up rows and classes.
  pred_val <- predict(modelo, X_val)
  pred_val_mat <- if (is.matrix(pred_val)) {
    pred_val
  } else {
    matrix(pred_val, ncol = num_classes, byrow = TRUE)
  }
  stopifnot(nrow(pred_val_mat) == length(y_val), ncol(pred_val_mat) >= 3L)
  prob_baja2 <- pred_val_mat[, 3]

  dval <- data.table::data.table(prob = prob_baja2, clase = y_val)
  data.table::setorder(dval, -prob)
  dval[, is_tp  := as.integer(clase == "BAJA+2")]
  dval[, cum_tp := cumsum(is_tp)]
  dval[, cum_fp := cumsum(1L - is_tp)]
  dval[, gan    := G * cum_tp - C * cum_fp]

  best_gain <- max(dval$gan)
  return(-best_gain)
}

# Wrap the profit function for mlrMBO (minimising the negative profit).
funcion_optimizar <- smoof::makeSingleObjectiveFunction(
  fn       = profit_eval,
  minimize = TRUE,
  par.set  = param_tune
)

mbo_ctrl <- mlrMBO::makeMBOControl()
mbo_ctrl <- mlrMBO::setMBOControlTermination(mbo_ctrl, iters = 30L)
mbo_ctrl$show.learner.output <- FALSE

# Kriging surrogate with standard-error predictions.
learner_km <- mlr::makeLearner("regr.km", predict.type = "se", nugget.estim = TRUE)

cat("\n", strrep("-", 60), "\n")
cat("Iniciando Optimizacion Bayesiana...\n")
cat("Configuration: 30 iterations, 200 LGB rounds, 10% CONTINUA\n")
cat(strrep("-", 60), "\n")

# Silence console output while the optimiser runs. The null device name is
# platform dependent, and the sink is removed in `finally` so that an error
# inside mbo() does not leave all later output redirected.
null_device <- if (.Platform$OS.type == "windows") "NUL" else "/dev/null"
start_time_bo <- Sys.time()
sink(null_device)
resultado_BO <- tryCatch(
  mlrMBO::mbo(
    fun     = funcion_optimizar,
    learner = learner_km,
    control = mbo_ctrl
  ),
  finally = sink()
)
end_time_bo <- Sys.time()

# The objective is the negative profit, so flip the sign back.
best_hyperparams <- resultado_BO$x
best_profit      <- -as.numeric(resultado_BO$y)

if (!is.null(best_hyperparams$num_leaves)) best_hyperparams$num_leaves <- as.integer(round(best_hyperparams$num_leaves))
if (!is.null(best_hyperparams$min_data_in_leaf)) best_hyperparams$min_data_in_leaf <- as.integer(round(best_hyperparams$min_data_in_leaf))

bo_duration <- as.numeric(difftime(end_time_bo, start_time_bo, units = "mins"))

cat("\n", strrep("=", 60), "\n")
cat("BO COMPLETA\n")
cat(strrep("=", 60), "\n")
cat(sprintf("Duracion:           %.1f minutos\n", bo_duration))
cat(sprintf("Mejor ganancia:     $%s\n", format(round(best_profit), big.mark = ",", scientific = FALSE)))
cat(sprintf("Hiperparametros:\n"))
cat(sprintf("  learning_rate:    %.4f\n", best_hyperparams$learning_rate))
cat(sprintf("  num_leaves:       %d\n", best_hyperparams$num_leaves))
cat(sprintf("  min_data_in_leaf: %d\n", best_hyperparams$min_data_in_leaf))
cat(sprintf("  feature_fraction: %.3f\n", best_hyperparams$feature_fraction))
cat(sprintf("  bagging_fraction: %.3f\n", best_hyperparams$bagging_fraction))
cat(sprintf("  lambda_l1:        %.2f\n", best_hyperparams$lambda_l1))
cat(sprintf("  lambda_l2:        %.2f\n", best_hyperparams$lambda_l2))
cat(strrep("=", 60), "\n")

yaml::write_yaml(best_hyperparams, sprintf("best_params_exp%d.yaml", PARAM$experimento %||% 0))

validate_checkpoint("BO_PROFIT_POSITIVE", best_profit > 0,
                    sprintf("BO found positive profit: $%s", format(round(best_profit), big.mark = ",", scientific = FALSE)))

cat("\n[CHECKPOINT 2] Saving BO results...\n")
save_checkpoint(list(dataset = dataset, best_hyperparams = best_hyperparams, best_profit = best_profit, bo_duration = bo_duration), "phase11_bo_complete")
cat("✅ BO checkpoint saved\n")

if (exists("track_rows")) track_rows("BO", nrow(dataset), INITIAL_ROWS)

cat("\n✅ OPTIMIZACION BAYESIANA COMPLETA\n")

## Produccion

### Final Training

#### Final Training Dataset

In [None]:
final_banner <- strrep("=", 60)
cat("\n", final_banner, "\n")
cat("ENTRENAMIENTO FINAL Y SCORING\n")
cat(final_banner, "\n")

# The final models train on the BO training months plus the validation months.
PARAM$TRAINING_PCT_FINAL <- 0.20
PARAM$final_months <- c(PARAM$train$training, PARAM$train$validation)

# Fresh random draw per row for the final undersampling.
set.seed(PARAM$semilla + 1)
set(dataset, j = "azar_final", value = runif(nrow(dataset)))

# Keep positives, plus the rows whose draw falls under the sampling fraction.
dataset[, fold_train_final := {
  in_final_months <- foto_mes %in% PARAM$final_months
  is_positive <- clase_ternaria %in% PARAM$strategy$positives
  is_sampled <- azar_final < PARAM$TRAINING_PCT_FINAL
  in_final_months & (is_positive | is_sampled)
}]

final_idx <- dataset$fold_train_final

cat(sprintf("Final training rows: %d (20%% CONTINUA)\n", sum(final_idx)))

dfinal <- lightgbm::lgb.Dataset(
  data   = data.matrix(dataset[final_idx, campos_buenos, with = FALSE]),
  label  = dataset[final_idx, target_label],
  weight = dataset[final_idx, clase_peso],
  params = list(feature_pre_filter = FALSE)
)

#### Entrenamiento Ensemble

In [None]:
final_nrounds <- 350L

# Draw four distinct six-digit prime seeds, reproducibly.
set.seed(PARAM$semilla + 999)
seed_pool      <- primes::generate_primes(min = 100000, max = 999999)
ensemble_seeds <- sample(seed_pool, 4L)

models <- vector("list", length(ensemble_seeds))

cat("\n", strrep("-", 60), "\n")
cat("Training 4-seed ensemble (350 rounds each)...\n")
cat(strrep("-", 60), "\n")

start_time_ensemble <- Sys.time()

# Parameters common to every ensemble member; only the seed changes per model.
param_base <- c(PARAM$lgb_basicos, as.list(best_hyperparams))
if (!is.null(param_base$num_leaves)) {
  param_base$num_leaves <- as.integer(round(param_base$num_leaves))
}
if (!is.null(param_base$min_data_in_leaf)) {
  param_base$min_data_in_leaf <- as.integer(round(param_base$min_data_in_leaf))
}

for (i in seq_along(ensemble_seeds)) {
  sem <- ensemble_seeds[i]
  param_final <- param_base
  param_final$seed <- sem

  modelo_final <- lightgbm::lgb.train(
    data    = dfinal,
    params  = param_final,
    nrounds = final_nrounds,
    verbose = -1
  )

  models[[i]] <- modelo_final
  cat(sprintf("  Model %d/%d complete (seed %d)\n", i, length(ensemble_seeds), sem))
}

end_time_ensemble <- Sys.time()
ensemble_duration <- as.numeric(difftime(end_time_ensemble, start_time_ensemble, units = "mins"))

validate_checkpoint("ENSEMBLE_MODELS", length(models) == 4L, "All 4 ensemble models trained")

cat(sprintf("\n✅ Ensemble training complete (%.1f minutes)\n", ensemble_duration))

cat("\n[CHECKPOINT 3] Saving ensemble models...\n")
ensemble_state <- list(
  dataset = dataset,
  models = models,
  ensemble_seeds = ensemble_seeds,
  best_hyperparams = best_hyperparams,
  ensemble_duration = ensemble_duration
)
save_checkpoint(ensemble_state, "phase12_ensemble_complete")
cat("✅ Ensemble checkpoint saved\n")

### Scoring

In [None]:
cat("\n", strrep("-", 60), "\n")
cat("Scoring test month (202109)...\n")
cat(strrep("-", 60), "\n")

test_idx <- dataset$foto_mes %in% PARAM$train$testing

validate_checkpoint("TEST_ROWS_EXIST", sum(test_idx) > 0,
                    sprintf("Test month %d rows: %d", PARAM$train$testing, sum(test_idx)))

X_test <- data.matrix(dataset[test_idx, ..campos_buenos])

# Average the BAJA+2 probability (third class) over all ensemble members.
final_probs <- rep(0, sum(test_idx))

for (modelo in models) {
  # Depending on the lightgbm version, multiclass predict() returns either a
  # rows-by-classes matrix or a flat vector with the classes of each row
  # stored consecutively. Only the flat form may be reshaped: reshaping an
  # existing matrix with byrow = TRUE would mix up rows and classes.
  pred <- predict(modelo, X_test)
  pred_mat <- if (is.matrix(pred)) {
    pred
  } else {
    matrix(pred, ncol = num_classes, byrow = TRUE)
  }
  stopifnot(nrow(pred_mat) == nrow(X_test), ncol(pred_mat) >= 3L)
  final_probs <- final_probs + pred_mat[, 3]
}
final_probs <- final_probs / length(models)

cat("✅ Scoring complete\n")

cat("\nGenerating prediction files...\n")

dir.create("./kaggle", showWarnings = FALSE)

# One row per test-month client, ordered by descending probability; a client
# is flagged when its probability reaches THRESHOLD.
dfuture <- dataset[test_idx, .(numero_de_cliente)]
dfuture[, prob := final_probs]
data.table::setorder(dfuture, -prob)
dfuture[, Predicted := as.integer(prob >= THRESHOLD)]

envios <- sum(dfuture$Predicted)

validate_checkpoint("SCORING_ALL_CUSTOMERS", nrow(dfuture) == sum(test_idx),
                    sprintf("All %d customers received predictions", nrow(dfuture)))

### Kaggle Competition Submit

In [None]:
exp_num <- PARAM$experimento %||% 0

# Full prediction table: client id, probability and binary decision.
file1 <- sprintf("./kaggle/KA%d_predictions_full.csv", exp_num)
predictions_full <- dfuture[, .(numero_de_cliente, prob, Predicted)]
data.table::fwrite(predictions_full, file1)

# Submission table: client id and binary decision only.
file2 <- sprintf("./kaggle/KA%d_submission.csv", exp_num)
submission <- dfuture[, .(numero_de_cliente, Predicted)]
data.table::fwrite(submission, file2)

# Feature names used by the final models.
features_file <- sprintf("feature_list_exp%d.txt", exp_num)
writeLines(campos_buenos, features_file)

for (written_path in c(file1, file2, features_file)) {
  cat(sprintf("  ✓ %s\n", basename(written_path)))
}

### Resumen Final

In [None]:
summary_banner <- strrep("=", 60)
n_clientes <- nrow(dfuture)

cat("\n", summary_banner, "\n")
cat("PRODUCCION COMPLETA\n")
cat(summary_banner, "\n")
cat(sprintf("Experimento:         %d\n", exp_num))
cat(sprintf("Mes de test:         %d\n", PARAM$train$testing))
cat(sprintf("Total clientes:      %d\n", n_clientes))
cat(sprintf("Features usados:     %d\n", length(campos_buenos)))
cat("\n")
cat(sprintf("Envios (Pred=1):     %d (%.2f%%)\n", envios, 100 * envios / n_clientes))
cat(sprintf("Threshold:           %.3f (%.1f%%)\n", THRESHOLD, THRESHOLD * 100))
cat("\n")

# Probability range overall and on each side of the decision.
cat("Rango de probabilidad:\n")
cat(sprintf("  Min:               %.6f\n", min(dfuture$prob)))
cat(sprintf("  Max:               %.6f\n", max(dfuture$prob)))
if (envios > 0) {
  cat(sprintf("  Min (Pred=1):      %.6f\n", dfuture[Predicted == 1, min(prob)]))
}
if (envios < n_clientes) {
  cat(sprintf("  Max (Pred=0):      %.6f\n", dfuture[Predicted == 0, max(prob)]))
}
cat("\n")

cat("Tiempos:\n")
cat(sprintf("  Fase BO:           %.1f minutos\n", bo_duration))
cat(sprintf("  Fase Ensemble:     %.1f minutos\n", ensemble_duration))
cat(sprintf("  Total:             %.1f minutos\n", bo_duration + ensemble_duration))
cat(summary_banner, "\n")

if (exists("track_rows")) track_rows("SCORING", nrow(dataset), INITIAL_ROWS)

cat("\n✅✅ TODAS LAS FASES COMPLETADAS\n\n")
format(Sys.time(), "%a %b %d %X %Y")