# Competencia 1 Kaggle

## 1. Environment Setup

In [None]:
# Clear environment so the notebook starts from a known state
rm(list = ls(all.names = TRUE))
gc(full = TRUE, verbose = FALSE)

# Load required libraries.
# library() is used instead of require(): require() only returns FALSE (with a
# warning) when a package is missing, so the failure would surface much later
# as an unrelated "could not find function" error.
required_packages <- c(
  "data.table", "parallel", "R.utils", "primes", "utils", "rlist",
  "yaml", "lightgbm", "DiceKriging", "mlrMBO", "ggplot2"
)
invisible(lapply(required_packages, library, character.only = TRUE))

cat("Libraries loaded successfully\n")
cat("Timestamp:", format(Sys.time(), "%a %b %d %X %Y"), "\n")

## 2. Configuration

In [None]:
# Environment detection and paths.
# Defines IS_COLAB, BASE_PATH (root for outputs) and DATA_PATH (raw csv).
IS_COLAB <- dir.exists("/content/buckets")

if (IS_COLAB) {
  # Colab setup - mount drive first
  # from google.colab import drive
  # drive.mount('/content/.drive')

  # Create directory structure
  system("mkdir -p '/content/.drive/My Drive/dmeyf'")
  system("mkdir -p /content/buckets")
  # Create the symlink only once: repeating `ln -s` against an existing link
  # to a directory creates a nested link inside the target instead.
  if (!file.exists("/content/buckets/b1")) {
    system("ln -s '/content/.drive/My Drive/dmeyf' /content/buckets/b1")
  }
  system("mkdir -p /content/buckets/b1/exp")
  system("mkdir -p /content/buckets/b1/datasets")
  system("mkdir -p /content/datasets")

  BASE_PATH <- "/content/buckets/b1"
  DATA_PATH <- "/content/datasets/competencia_01_crudo.csv"
  bucket_data_path <- "/content/buckets/b1/datasets/competencia_01_crudo.csv"

  # Download dataset into the bucket if needed
  if (!file.exists(bucket_data_path)) {
    # download.file() honours options(timeout), whose default of 60 seconds
    # is easily exceeded by a large csv.
    options(timeout = max(600, getOption("timeout")))
    download.file(
      "https://storage.googleapis.com/open-courses/dmeyf2025-e4a2/competencia_01_crudo.csv",
      bucket_data_path
    )
  }

  # DATA_PATH points at the local /content/datasets copy, so the bucket file
  # has to be copied there; without this step fread(DATA_PATH) finds nothing.
  if (!file.exists(DATA_PATH)) {
    copied <- file.copy(bucket_data_path, DATA_PATH)
    if (!copied) {
      stop("Could not copy dataset from ", bucket_data_path, " to ", DATA_PATH,
           call. = FALSE)
    }
  }
} else {
  # Local setup
  setwd('/Users/manumoreira/Repos/dmeyf2025/Competencia1/')
  BASE_PATH <- getwd()
  DATA_PATH <- "./data/competencia_01_crudo.csv"
}

cat("Environment:", ifelse(IS_COLAB, "Colab", "Local"), "\n")
cat("Base path:", BASE_PATH, "\n")
cat("Data path:", DATA_PATH, "\n")

In [None]:
# JupyterLab override: read the dataset from an absolute path and use the
# current working directory as the base path (its value is echoed as the
# cell output).
DATA_PATH <- "/home/manuelmoreira_data/datasets/competencia_01_crudo.csv"
BASE_PATH <- getwd()
BASE_PATH

In [None]:
# Experiment parameters, collected in a single list literal
PARAM <- list(
  experimento = 15,
  semilla_primigenia = 450343,

  # Periods (foto_mes values) used by each stage
  train = c(202101, 202102, 202103),
  train_validate = c(202101, 202102, 202103),
  validate = c(202104),
  train_final = c(202101, 202102, 202103, 202104),
  future = c(202106),

  # Kaggle simulation: partition seed and submission sizes to evaluate
  semilla_kaggle = 314159,
  cortes = seq(6000, 19000, by = 500),

  # Fraction of the majority class kept when undersampling
  trainingstrategy = list(undersampling = 0.5),

  # Cross-validation folds and Bayesian optimization iterations
  hyperparametertuning = list(xval_folds = 5, iteraciones = 60)
)

cat("Parameters configured:\n")
cat("  Experiment:", PARAM$experimento, "\n")
cat("  Seed:", PARAM$semilla_primigenia, "\n")
cat("  Training periods:", paste(PARAM$train, collapse = ", "), "\n")
cat("  Validation period:", PARAM$validate, "\n")
cat("  Future period:", PARAM$future, "\n")

In [None]:
# LightGBM fixed parameters.
# This list is the base that tuned hyperparameters are merged into with
# modifyList(); any name present in both is overridden by the tuned value.
PARAM$lgbm$param_fijos <- list(
  # Task definition
  boosting = "gbdt",
  objective = "binary",
  metric = "auc",
  first_metric_only = FALSE,
  boost_from_average = TRUE,
  feature_pre_filter = FALSE,
  force_row_wise = TRUE,
  verbosity = -100,
  # Same seed as the rest of the experiment
  seed = PARAM$semilla_primigenia,
  # Tree-shape and regularization values
  max_depth = -1L,
  min_gain_to_split = 0,
  min_sum_hessian_in_leaf = 0.001,
  lambda_l1 = 0.0,
  lambda_l2 = 0.0,
  max_bin = 31L,
  # Row sampling and class weighting
  bagging_fraction = 1.0,
  pos_bagging_fraction = 1.0,
  neg_bagging_fraction = 1.0,
  is_unbalance = FALSE,
  scale_pos_weight = 1.0,
  # NOTE(review): these three look like DART-specific settings; boosting is
  # "gbdt" here, so presumably they have no effect - confirm against the
  # LightGBM parameter docs.
  drop_rate = 0.1,
  max_drop = 50,
  skip_drop = 0.5,
  extra_trees = FALSE,
  # Starting values for parameters that also appear in the search space
  num_iterations = 1200,
  learning_rate = 0.02,
  feature_fraction = 0.5,
  num_leaves = 750,
  min_data_in_leaf = 3
)

cat("LightGBM fixed parameters configured\n")

In [None]:
# Hyperparameter search space explored by the Bayesian optimization.
# NOTE: the list name `hypeparametertuning` (sic) is kept as-is because other
# cells read the search space from this exact path.
PARAM$hypeparametertuning$hs <- makeParamSet(
  makeIntegerParam("num_iterations", lower = 8L, upper = 2048L),
  makeNumericParam("learning_rate", lower = 0.01, upper = 0.3),
  makeNumericParam("feature_fraction", lower = 0.1, upper = 1.0),
  makeIntegerParam("num_leaves", lower = 8L, upper = 2048L),
  # LightGBM's L1 regularization parameter is `lambda_l1` (the name already
  # used in param_fijos). The previous name `lambda_1` is not a LightGBM
  # parameter, so tuning it had no regularizing effect.
  makeIntegerParam("lambda_l1", lower = 0, upper = 15),
  makeNumericParam("min_gain_to_split", lower = 0.1, upper = 1.0),
  makeIntegerParam("max_depth", lower = 1, upper = 15),
  makeIntegerParam("bagging_freq", lower = 1, upper = 10),
  makeIntegerParam("min_data_in_leaf", lower = 1, upper = 8000),
  makeNumericParam("bagging_fraction", lower = 0.1, upper = 1.0)
)

cat("Hyperparameter search space defined\n")
cat("  Number of hyperparameters to tune:", length(PARAM$hypeparametertuning$hs$pars), "\n")

## 3. Data Loading and Feature Engineering

In [None]:
# Read the raw competition file into a data.table (strings become factors)
dataset <- fread(DATA_PATH, stringsAsFactors = TRUE)

# Report size of what was loaded
dataset_dims <- dim(dataset)
cat("Dataset loaded:\n")
cat("  Rows:", dataset_dims[1], "\n")
cat("  Columns:", dataset_dims[2], "\n")
cat("  Memory:", format(object.size(dataset), units = "MB"), "\n")

In [None]:
# Relacion de dependencia: flag clients whose June payroll is 1.4x-1.8x their
# May payroll. Also builds `wide_dt` (one row per client, one mpayroll column
# per foto_mes), which the following cell reuses.
if ("relacion_dependencia" %in% names(dataset)) {
  dataset[, relacion_dependencia := NULL]
}

# Initialize with 0 for all
dataset[, relacion_dependencia := 0]

# Recreate wide format for May-June comparison
wide_dt <- dcast(dataset, numero_de_cliente ~ foto_mes, value.var = "mpayroll")

# Identify payroll employees based on May-June pattern.
# May payroll must be strictly positive: with 0 in both months the two
# comparisons reduce to 0 >= 0 & 0 <= 0, which would flag every client that
# has no payroll at all.
may_june_employees <- wide_dt[`202105` > 0 &
                              `202106` >= 1.4 * `202105` &
                              `202106` <= 1.8 * `202105`,
                              .(numero_de_cliente)]

# Flag them ONLY in May and June
dataset[numero_de_cliente %in% may_june_employees$numero_de_cliente &
        foto_mes %in% c(202105, 202106),
        relacion_dependencia := 1]

In [None]:
# Save original mpayroll before any adjustments
dataset[, mpayroll_original := mpayroll]

# Per-client June scaling factor for clients whose June payroll is 1.4x-1.8x
# their May payroll.
# May payroll must be strictly positive: clients with 0 in both months also
# satisfy the two range comparisons, and their factor would be 0 / 0 - 0.1 =
# NaN, turning their June mpayroll (0) into NaN.
june_adjust <- wide_dt[`202105` > 0 &
                       `202106` >= 1.4 * `202105` &
                       `202106` <= 1.8 * `202105`,
                       .(numero_de_cliente, adjust_factor = (`202106` / `202105`) - 0.1)]

# Apply only to June
dataset[foto_mes == 202106 & numero_de_cliente %in% june_adjust$numero_de_cliente,
        mpayroll := mpayroll / june_adjust$adjust_factor[match(numero_de_cliente, june_adjust$numero_de_cliente)]]

# Now check which months were actually modified.
# new_na_rows counts values that became NA/NaN through the adjustment; the
# other two metrics drop NA comparisons and would not reveal them.
adjustment_check <- dataset[, .(
  total_rows = .N,
  modified_rows = sum(mpayroll != mpayroll_original, na.rm = TRUE),
  modification_rate = mean(mpayroll != mpayroll_original, na.rm = TRUE),
  new_na_rows = sum(is.na(mpayroll) & !is.na(mpayroll_original))
), by = foto_mes]

print(adjustment_check)

In [None]:
# Drop the helper columns created while adjusting mpayroll
helper_cols <- c("mpayroll_original", "relacion_dependencia")
dataset[, (helper_cols) := NULL]

In [None]:
# Build lag-1 / lag-2 features and the matching deltas (current minus lag)
# for every column except the keys and the target.
exclude_fields <- c("numero_de_cliente", "foto_mes", "clase_ternaria")
fields_to_transform <- setdiff(names(dataset), exclude_fields)

cat("Creating lag and delta features...\n")
cat("  Fields to transform:", length(fields_to_transform), "\n")

# shift() works on row position, so rows must be sorted by client and month
setorder(dataset, numero_de_cliente, foto_mes)

lag1_cols <- paste0(fields_to_transform, "_lag1")
lag2_cols <- paste0(fields_to_transform, "_lag2")

# Difference between a current column and one of its lag columns
diff_with_lag <- function(current_name, lag_name) {
  dataset[[current_name]] - dataset[[lag_name]]
}

# Lag 1: previous row of the same client
dataset[, (lag1_cols) := shift(.SD, n = 1),
        by = numero_de_cliente,
        .SDcols = fields_to_transform]

cat("  Lag1 features created\n")

# Delta 1
dataset[, (paste0(fields_to_transform, "_delta1")) :=
  unname(Map(diff_with_lag, fields_to_transform, lag1_cols))]

# Lag 2: two rows back for the same client
dataset[, (lag2_cols) := shift(.SD, n = 2),
        by = numero_de_cliente,
        .SDcols = fields_to_transform]

cat("  Lag2 features created\n")

# Delta 2
dataset[, (paste0(fields_to_transform, "_delta2")) :=
  unname(Map(diff_with_lag, fields_to_transform, lag2_cols))]

cat("  Delta features created\n")
cat("  Total columns now:", ncol(dataset), "\n")

In [None]:
# Expand base field names into the derived column names built by the feature
# engineering step.
#
# Args:
#   base_fields: character vector of base column names.
#   include_original: include the base names themselves.
#   include_lag: include the "_lag1" and "_lag2" names.
#   include_delta: include the "_delta1" and "_delta2" names.
#
# Returns: character vector ordered as originals, lag1, lag2, delta1, delta2
#   (each group in base_fields order); character(0) for empty input.
generate_feature_variations <- function(base_fields, include_original = TRUE,
                                       include_lag = TRUE, include_delta = TRUE) {

  # paste0(character(0), "_lag1") yields "_lag1", so empty input needs an
  # explicit early exit to avoid returning bare suffixes.
  if (length(base_fields) == 0) {
    return(character(0))
  }

  variations <- character(0)

  if (include_original) {
    variations <- c(variations, base_fields)
  }

  if (include_lag) {
    variations <- c(variations, paste0(base_fields, "_lag1"))
    variations <- c(variations, paste0(base_fields, "_lag2"))
  }

  if (include_delta) {
    variations <- c(variations, paste0(base_fields, "_delta1"))
    # Was "_deltaw": a typo that left the "_delta2" columns out of every
    # list generated here.
    variations <- c(variations, paste0(base_fields, "_delta2"))
  }

  variations
}

In [None]:
# Add a "<column>_ranked" column for each requested column, ranking values
# within each month independently (no information crosses months).
#
# Ranking is signed: negative values get negative ranks (the most negative
# value gets the most negative rank), positive values get positive ranks,
# zeros map to 0 and NA stays NA. Ties receive the average rank.
#
# Args:
#   dt: data.table, modified by reference.
#   columns: character vector of column names to rank.
#   month_col: name of the column that defines the ranking groups.
#
# Returns: dt, with the new "_ranked" columns added.
rank_separate_by_month <- function(dt, columns, month_col = "foto_mes") {

  # Signed rank of a single vector (one column within one month)
  signed_rank <- function(values) {
    ranked <- rep(NA_real_, length(values))

    is_neg <- !is.na(values) & values < 0
    is_pos <- !is.na(values) & values > 0
    is_zero <- !is.na(values) & values == 0

    if (any(is_neg)) {
      ranked[is_neg] <- -frank(-values[is_neg], ties.method = "average")
    }
    if (any(is_pos)) {
      ranked[is_pos] <- frank(values[is_pos], ties.method = "average")
    }
    ranked[is_zero] <- 0

    ranked
  }

  for (source_col in columns) {
    target_col <- paste0(source_col, "_ranked")

    # Create the target as a double column before filling it group by group
    dt[, (target_col) := NA_real_]
    dt[, (target_col) := signed_rank(get(source_col)), by = month_col]
  }

  return(dt)
}

# Usage: rank the fields listed below, together with their lag/delta
# variants, month by month.
# NOTE(review): the vector is named monetary_fields but ends with
# cliente_edad and cliente_antiguedad, which do not look like amounts -
# confirm they are meant to be ranked too.
monetary_fields <- c("mrentabilidad", "mrentabilidad_annual", "mcomisiones", "mactivos_margen", "mpasivos_margen", 
                "mcuenta_corriente_adicional", "mcuenta_corriente", "mcaja_ahorro", "mcaja_ahorro_adicional", 
                "mcaja_ahorro_dolares", "mcuentas_saldo", "mautoservicio", "mtarjeta_visa_consumo", "mtarjeta_master_consumo", 
                "mprestamos_personales", "mprestamos_prendarios", "mprestamos_hipotecarios", "mplazo_fijo_dolares", 
                "mplazo_fijo_pesos", "minversion1_pesos", "minversion1_dolares", "minversion2", "mpayroll", "mpayroll2", 
                "mcuenta_debitos_automaticos", "mttarjeta_visa_debitos_automaticos", "mttarjeta_master_debitos_automaticos", 
                "mpagodeservicios", "mpagomiscuentas", "mcajeros_propios_descuentos", "mtarjeta_visa_descuentos", 
                "mtarjeta_master_descuentos", "mcomisiones_mantenimiento", "mcomisiones_otras", "mforex_buy", "mforex_sell", 
                "mtransferencias_recibidas", "mtransferencias_emitidas", "mextraccion_autoservicio", "mcheques_depositados", 
                "mcheques_emitidos", "mcheques_depositados_rechazados", "mcheques_emitidos_rechazados", "matm", "matm_other", 
                "Master_mfinanciacion_limite", "Master_msaldototal", "Master_msaldopesos", "Master_msaldodolares", 
                "Master_mconsumospesos", "Master_mconsumosdolares", "Master_mlimitecompra", "Master_madelantopesos", 
                "Master_madelantodolares", "Master_mpagado", "Master_mpagospesos", "Master_mpagosdolares", 
                "Master_mconsumototal", "Master_mpagominimo", "Visa_mfinanciacion_limite", "Visa_msaldototal", 
                "Visa_msaldopesos", "Visa_msaldodolares", "Visa_mconsumospesos", "Visa_mconsumosdolares", 
                "Visa_mlimitecompra", "Visa_madelantopesos", "Visa_madelantodolares", "Visa_mpagado", "Visa_mpagospesos", 
                "Visa_mpagosdolares", "Visa_mconsumototal", "Visa_mpagominimo", "cliente_edad", "cliente_antiguedad")

# Expand to original + lag + delta column names
cols_to_rank <- generate_feature_variations(monetary_fields, 
                                            include_original = TRUE,
                                            include_lag = TRUE,
                                            include_delta = TRUE)

# Names that are not columns of dataset are silently dropped here
cols_to_rank <- cols_to_rank[cols_to_rank %in% names(dataset)]

# Adds one "<name>_ranked" column per entry of cols_to_rank
dataset <- rank_separate_by_month(
   dataset, cols_to_rank, month_col = "foto_mes"
 )

In [None]:
# Build a per-month summary comparing each original column with its ranked
# counterpart.
#
# Args:
#   dt: data.table holding the original columns and their ranked versions.
#   original_cols: character vector of original column names.
#   month_col: name of the month column used for grouping.
#   rank_suffix: suffix appended to an original name to locate its ranked
#     column. Defaults to "_ranked", the suffix rank_separate_by_month()
#     writes (the previous hard-coded "_ranked_safe" matched no column).
#
# Returns: a data.table with one row per (Column, month) holding median, IQR,
#   min, max, sign counts and NA counts for both versions. Columns missing
#   either version are skipped with a warning.
create_compact_monthly_summary <- function(dt, original_cols, month_col = "foto_mes",
                                           rank_suffix = "_ranked") {
  ranked_cols <- paste0(original_cols, rank_suffix)

  # Keep only the pairs that actually exist in dt
  usable <- original_cols %in% names(dt) & ranked_cols %in% names(dt)
  if (!all(usable)) {
    warning(
      "Skipping columns without both original and ranked versions: ",
      paste(original_cols[!usable], collapse = ", "),
      call. = FALSE
    )
  }
  original_cols <- original_cols[usable]
  ranked_cols <- ranked_cols[usable]

  month_known <- !is.na(dt[[month_col]])

  summary_list <- lapply(seq_along(original_cols), function(i) {
    orig_name <- original_cols[i]
    rank_name <- ranked_cols[i]

    month_summary <- dt[month_known, {
      # Coerce to double so every group returns the same column types
      # (median() on an integer vector can return integer or double, and
      # min()/max() return Inf for all-NA groups).
      orig_vals <- as.numeric(.SD[[1L]])
      rank_vals <- as.numeric(.SD[[2L]])

      list(
        # Original stats
        Orig_Median = median(orig_vals, na.rm = TRUE),
        Orig_IQR = IQR(orig_vals, na.rm = TRUE),
        Orig_Min = min(orig_vals, na.rm = TRUE),
        Orig_Max = max(orig_vals, na.rm = TRUE),

        # Ranked stats
        Rank_Median = median(rank_vals, na.rm = TRUE),
        Rank_IQR = IQR(rank_vals, na.rm = TRUE),
        Rank_Min = min(rank_vals, na.rm = TRUE),
        Rank_Max = max(rank_vals, na.rm = TRUE),

        # Counts
        Count = .N,
        Neg_Count_Orig = sum(orig_vals < 0, na.rm = TRUE),
        Zero_Count_Orig = sum(orig_vals == 0, na.rm = TRUE),
        Pos_Count_Orig = sum(orig_vals > 0, na.rm = TRUE),
        Neg_Count_Rank = sum(rank_vals < 0, na.rm = TRUE),
        Zero_Count_Rank = sum(rank_vals == 0, na.rm = TRUE),
        Pos_Count_Rank = sum(rank_vals > 0, na.rm = TRUE),
        NA_Count_Orig = sum(is.na(orig_vals)),
        NA_Count_Rank = sum(is.na(rank_vals))
      )
    }, by = month_col, .SDcols = c(orig_name, rank_name)]

    # Add column name and move it (with the month) to the front
    month_summary[, Column := orig_name]
    setcolorder(month_summary, c("Column", month_col))

    month_summary
  })

  rbindlist(summary_list)
}

# Usage: summarise the same fields that were ranked in the previous cell.
# monetary_fields is reused instead of repeating the literal list here, so
# the two cannot drift apart; fields that are not columns of dataset are
# skipped.
stopifnot(exists("monetary_fields"))
original_cols <- intersect(monetary_fields, names(dataset))

rank_summary <- create_compact_monthly_summary(dataset, original_cols)
fwrite(rank_summary, file = "rank_summary.csv", sep = ",")
print(rank_summary)

In [None]:
# Remove the unranked versions (original, lag and delta columns) of the given
# base fields from dt, leaving the "_ranked" columns in place.
#
# Args:
#   dt: data.table, modified by reference.
#   base_fields: character vector of base column names.
#
# Returns: dt. Progress and counts are reported with cat().
delete_unranked_versions <- function(dt, base_fields) {

  # Candidate names come from generate_feature_variations(); only those that
  # are columns of dt are kept.
  candidates <- generate_feature_variations(base_fields,
                                            include_original = TRUE,
                                            include_lag = TRUE,
                                            include_delta = TRUE)
  cols_to_delete <- candidates[is.element(candidates, names(dt))]

  # Number of base fields whose column with the given suffix exists in dt
  count_present <- function(suffix) {
    sum(paste0(base_fields, suffix) %in% names(dt))
  }

  # Counts are taken before anything is deleted
  n_original <- count_present("")
  n_lag1 <- count_present("_lag1")
  n_lag2 <- count_present("_lag2")
  n_delta1 <- count_present("_delta1")
  n_delta2 <- count_present("_delta2")

  cat("Deleting unranked versions of monetary variables...\n")
  cat("  Original fields to delete:", n_original, "\n")
  cat("  Lag1 fields to delete:", n_lag1, "\n")
  cat("  Lag2 fields to delete:", n_lag2, "\n")
  cat("  Delta1 fields to delete:", n_delta1, "\n")
  cat("  Delta2 fields to delete:", n_delta2, "\n")
  cat("  Total columns to delete:", length(cols_to_delete), "\n")

  if (length(cols_to_delete) == 0) {
    cat("  ⚠ No columns to delete (already cleaned?)\n")
  } else {
    dt[, (cols_to_delete) := NULL]
    cat("  ✓ Deleted successfully!\n")
  }

  # Report how many base fields still have a ranked column
  n_ranked_existing <- count_present("_ranked")
  cat("  ✓ Ranked versions kept:", n_ranked_existing, "of", length(base_fields), "\n")

  cat("  Total columns remaining:", ncol(dt), "\n")

  return(dt)
}

# Usage: delete the unranked versions of the same fields that were ranked.
# monetary_fields from the ranking cell is reused rather than redefined with
# an identical literal, so the ranked and deleted sets always match.
stopifnot(exists("monetary_fields"))

# Delete all unranked versions
dataset <- delete_unranked_versions(dataset, monetary_fields)


## 4. Target Variable Creation (clase_ternaria)

In [None]:
# Build clase_ternaria from the presence or absence of each client in the
# following months, then save the enriched dataset.

# One row per dataset row: original position, client, and a consecutive month
# index (year * 12 + month) so adjacent months differ by exactly 1.
dsimple <- dataset[, list(
  "pos" = .I,
  numero_de_cliente,
  periodo0 = as.integer(foto_mes / 100) * 12 + foto_mes %% 100
)]

setorder(dsimple, numero_de_cliente, periodo0)

periodo_ultimo <- dsimple[, max(periodo0)]
periodo_anteultimo <- periodo_ultimo - 1

# Month index of the next two records of the same client (NA when absent)
dsimple[, c("periodo1", "periodo2") :=
  shift(periodo0, n = 1:2, fill = NA, type = "lead"), numero_de_cliente]

# Default: every row at least two months before the last period continues
dsimple[periodo0 < periodo_anteultimo, clase_ternaria := "CONTINUA"]

# BAJA+1: the client has no record in the very next month
dsimple[periodo0 < periodo_ultimo &
  (is.na(periodo1) | periodo0 + 1 < periodo1),
  clase_ternaria := "BAJA+1"]

# BAJA+2: present next month, but no record the month after that
dsimple[periodo0 < periodo_anteultimo & (periodo0 + 1 == periodo1) &
  (is.na(periodo2) | periodo0 + 2 < periodo2),
  clase_ternaria := "BAJA+2"]

# Restore the original row order before copying the class back
setorder(dsimple, pos)
dataset[, clase_ternaria := dsimple$clase_ternaria]

# Write under BASE_PATH instead of a hard-coded user directory, so the cell
# also works on Colab / JupyterLab. In the local setup BASE_PATH is that same
# directory, so the output file location is unchanged there.
output_dir <- file.path(BASE_PATH, "data")
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

fwrite(dataset,
  file = file.path(output_dir, "competencia_01.8.csv.gz"),
  sep = ","
)

In [None]:
# Sort by month, class and client, then show how many rows each
# (month, class) pair has
setorderv(dataset, c("foto_mes", "clase_ternaria", "numero_de_cliente"))
dataset[, .N, by = list(foto_mes, clase_ternaria)]

## 5. Helper Functions

In [None]:
# Stratified partitioning: writes a fold label into column `campo` of `data`
# (by reference), assigning labels at random within each `agrupa` group in
# the proportions given by `division`.
#
# Args:
#   data: data.table to partition.
#   division: relative size of each fold, e.g. c(3, 7).
#   agrupa: column name(s) to stratify by.
#   campo: name of the column that receives the fold label.
#   start: label of the first fold; the rest follow consecutively.
#   seed: when not NA, seeds the RNG (L'Ecuyer-CMRG) before sampling.
particionar <- function(data, division, agrupa = "", campo = "fold", start = 1, seed = NA) {
  if (!is.na(seed)) {
    set.seed(seed, "L'Ecuyer-CMRG")
  }

  # One label per fold, each repeated as many times as its share
  etiquetas <- seq(from = start, length.out = length(division))
  bloque <- rep(etiquetas, times = division)

  data[, (campo) := {
    # Repeat the pattern enough times to cover the group, shuffle, and trim
    repeticiones <- ceiling(.N / length(bloque))
    sample(rep(bloque, repeticiones))[1:.N]
  }, by = agrupa]
}

# Initialize the "reality" table used for gain evaluation.
#
# Args:
#   pfuture: data.table with numero_de_cliente, foto_mes and clase_ternaria.
#   pparam: parameter list; pparam$semilla_kaggle seeds the partition.
#
# Returns: a data.table with those three columns plus `fold`, assigned in a
#   3:7 proportion within each clase_ternaria (fold 1 and fold 2).
realidad_inicializar <- function(pfuture, pparam) {
  # Use the parameter list that was passed in; the global PARAM used to be
  # read here, silently ignoring the argument.
  seed <- pparam$semilla_kaggle
  if (is.null(seed)) {
    stop("pparam$semilla_kaggle is not set", call. = FALSE)
  }

  drealidad <- pfuture[, list(numero_de_cliente, foto_mes, clase_ternaria)]
  particionar(drealidad, division = c(3, 7), agrupa = "clase_ternaria",
              seed = seed)
  return(drealidad)
}

# Evaluate the gain of a prediction against the reality table, split by fold
# to mimic a public/private leaderboard.
#
# Args:
#   prealidad: table from realidad_inicializar() (needs fold, clase_ternaria).
#   pprediccion: table with numero_de_cliente, foto_mes and Predicted (0/1).
#
# Returns: list with `public` (fold 1 gain / 0.3), `private` (fold 2 gain /
#   0.7) and `total`. Each sent "BAJA+2" adds 780000, any other sent row
#   subtracts 20000. prealidad is left without the temporary column.
realidad_evaluar <- function(prealidad, pprediccion) {
  # Bring the prediction in as a temporary column
  prealidad[pprediccion, on = c("numero_de_cliente", "foto_mes"),
            predicted := i.Predicted]

  conteos <- prealidad[, list(qty = .N), by = list(fold, predicted, clase_ternaria)]

  # Gain of a set of count rows
  ganancia <- function(filas) {
    filas[, sum(qty * ifelse(clase_ternaria == "BAJA+2", 780000, -20000))]
  }

  enviados <- conteos[predicted == 1L]

  res <- list()
  res$public <- ganancia(enviados[fold == 1]) / 0.3
  res$private <- ganancia(enviados[fold == 2]) / 0.7
  res$total <- ganancia(enviados)

  prealidad[, predicted := NULL]
  return(res)
}

# Objective function for the Bayesian optimization.
#
# Args:
#   x: named list of hyperparameter values proposed by the optimizer.
#
# Returns: the best_score reported by lgb.cv() when the fixed parameters are
#   overridden with x. Reads the globals PARAM and dtrain.
EstimarGanancia_AUC_lightgbm <- function(x) {
  cv_result <- lgb.cv(
    data = dtrain,
    nfold = PARAM$hyperparametertuning$xval_folds,
    stratified = TRUE,
    param = modifyList(PARAM$lgbm$param_fijos, x)
  )

  AUC <- cv_result$best_score

  # Release the cross-validation object before the next evaluation
  rm(cv_result)
  gc(full = TRUE, verbose = FALSE)

  message(format(Sys.time(), "%a %b %d %X %Y"), " AUC ", AUC)
  return(AUC)
}

cat("Helper functions defined\n")

## 6. Training Dataset Preparation

In [None]:
# Experiment folder: <BASE_PATH>/results/exp<NNN>_seed_<seed>.
# It becomes the working directory, so later relative paths resolve there.
exp_name <- sprintf("exp%03d_seed_%d", PARAM$experimento, PARAM$semilla_primigenia)
exp_dir <- file.path(BASE_PATH, "results", exp_name)

if (!dir.exists(exp_dir)) {
  dir.create(exp_dir, recursive = TRUE, showWarnings = FALSE)
}
setwd(exp_dir)

cat("Working directory:", getwd(), "\n")

In [None]:
# Keep only the rows that belong to the training periods
en_periodo_train <- dataset$foto_mes %in% PARAM$train
dataset_train <- dataset[en_periodo_train]

cat("Training dataset:\n")
cat("  Total rows:", nrow(dataset_train), "\n")
print(dataset_train[, .N, by = clase_ternaria])

In [None]:
# Undersampling: every BAJA+1 / BAJA+2 row is kept; any other row is kept
# only when its uniform draw falls at or below the configured fraction.
set.seed(PARAM$semilla_primigenia, kind = "L'Ecuyer-CMRG")
dataset_train[, azar := runif(.N)]

# training is 1L for rows that take part in model fitting, 0L otherwise
dataset_train[, training := as.integer(
  foto_mes %in% PARAM$train &
    (azar <= PARAM$trainingstrategy$undersampling |
     clase_ternaria %in% c("BAJA+1", "BAJA+2"))
)]

cat("\nAfter undersampling:\n")
print(dataset_train[training == 1L, .N, by = clase_ternaria])
cat("  Total training rows:", dataset_train[training == 1L, .N], "\n")

In [None]:
# Binary target derived from the ternary class: BAJA+1 and BAJA+2 are the
# positive class, everything else is 0.
dataset_train[, clase01 := as.integer(clase_ternaria %in% c("BAJA+2", "BAJA+1"))]

# Features: every column except the targets and the sampling helpers
campos_excluidos <- c("clase_ternaria", "clase01", "azar", "training")
campos_buenos <- setdiff(colnames(dataset_train), campos_excluidos)

cat("Features for training:", length(campos_buenos), "\n")

# LightGBM dataset built from the rows selected by the undersampling
es_training <- dataset_train$training == 1L
dtrain <- lgb.Dataset(
  data = data.matrix(dataset_train[es_training, campos_buenos, with = FALSE]),
  label = dataset_train$clase01[es_training],
  free_raw_data = FALSE
)

cat("LightGBM training dataset created:\n")
cat("  Rows:", nrow(dtrain), "\n")
cat("  Columns:", ncol(dtrain), "\n")

## 7. Bayesian Optimization

In [None]:
# Bayesian optimization setup (mlrMBO)

# File where mlrMBO periodically saves its state, which allows resuming
kbayesiana <- "bayesiana.RDATA"

configureMlr(show.learner.output = FALSE)

# Kriging surrogate model used to propose the next point
surr.km <- makeLearner(
  "regr.km",
  predict.type = "se",
  covtype = "matern3_2",
  control = list(trace = TRUE)
)

# Objective: maximize the value returned by EstimarGanancia_AUC_lightgbm
# over the configured search space
obj.fun <- makeSingleObjectiveFunction(
  fn = EstimarGanancia_AUC_lightgbm,
  minimize = FALSE,
  noisy = TRUE,
  par.set = PARAM$hypeparametertuning$hs,
  has.simple.signature = FALSE
)

# Control object: periodic save to disk, iteration budget, EI infill criterion
ctrl <- setMBOControlInfill(
  setMBOControlTermination(
    makeMBOControl(
      save.on.disk.at.time = 600,
      save.file.path = kbayesiana
    ),
    iters = PARAM$hyperparametertuning$iteraciones
  ),
  crit = makeMBOInfillCritEI()
)

cat("Bayesian Optimization configured\n")
cat("  Iterations:", PARAM$hyperparametertuning$iteraciones, "\n")

In [None]:
# Run the Bayesian optimization, resuming from the saved state when present
cat("\nStarting Bayesian Optimization...\n")
cat("This may take several hours depending on iterations\n\n")

if (file.exists(kbayesiana)) {
  cat("Continuing from existing bayesiana.RDATA\n")
  bayesiana_salida <- mboContinue(kbayesiana)
} else {
  bayesiana_salida <- mbo(obj.fun, learner = surr.km, control = ctrl)
}

cat("\nBayesian Optimization completed\n")

In [None]:
# Optimization log: one row per evaluated point, best objective value first
tb_bayesiana <- as.data.table(bayesiana_salida$opt.path)
tb_bayesiana[, iter := .I]
setorder(tb_bayesiana, -y)

fwrite(tb_bayesiana, file = "BO_log.txt", sep = "\t")

# Everything that is not bookkeeping from the optimizer is a hyperparameter
columnas_bo <- c(
  "y", "dob", "eol", "error.message", "exec.time", "ei", "error.model",
  "train.time", "prop.type", "propose.time", "se", "mean", "iter"
)
columnas_hiper <- setdiff(colnames(tb_bayesiana), columnas_bo)

# First row after sorting is the best evaluation
mejor_fila <- tb_bayesiana[1]
PARAM$out$lgbm$mejores_hiperparametros <- mejor_fila[, columnas_hiper, with = FALSE]
PARAM$out$lgbm$y <- mejor_fila[, y]

cat("\nBest hyperparameters found:\n")
print(PARAM$out$lgbm$mejores_hiperparametros)
cat("\nBest AUC:", PARAM$out$lgbm$y, "\n")

write_yaml(PARAM, file = "PARAM.yml")
cat("\nParameters saved to PARAM.yml\n")

## 8. Validation - Final Model Training

In [None]:
# Training set for the validation model: all rows (no undersampling) from the
# periods in PARAM$train_validate.
dataset_train_validate <- dataset[foto_mes %in% PARAM$train_validate]

# Binary target: BAJA+1 and BAJA+2 are the positive class
dataset_train_validate[, clase01 := as.integer(clase_ternaria %in% c("BAJA+2", "BAJA+1"))]

cat("Final training dataset:\n")
cat("  Total rows:", nrow(dataset_train_validate), "\n")
print(dataset_train_validate[, .N, by = clase_ternaria])

matriz_train_validate <- data.matrix(
  dataset_train_validate[, campos_buenos, with = FALSE]
)
dtrain_validate <- lgb.Dataset(
  data = matriz_train_validate,
  label = dataset_train_validate[, clase01]
)
rm(matriz_train_validate)

cat("\nLightGBM dataset created for validation training\n")

In [None]:
# Final parameters: fixed values overridden by the best ones found
param_final <- modifyList(PARAM$lgbm$param_fijos,
                          PARAM$out$lgbm$mejores_hiperparametros)

# min_data_in_leaf was tuned on undersampled data; scale it up by the inverse
# of the sampling fraction for training on the full dataset.
min_data_escalado <- round(
  param_final$min_data_in_leaf / PARAM$trainingstrategy$undersampling
)
param_normalizado <- modifyList(
  param_final,
  list(min_data_in_leaf = min_data_escalado)
)

cat("Final parameters prepared\n")
cat("  Original min_data_in_leaf:", param_final$min_data_in_leaf, "\n")
cat("  Normalized min_data_in_leaf:", param_normalizado$min_data_in_leaf, "\n")

In [None]:
# Seed for the final fit.
# param_normalizado (the list actually passed to lgb.train) was copied from
# param_final before this point, so the seed has to be set on both lists;
# setting it only on param_final never reached the trained model.
param_final$seed <- 876234
param_normalizado$seed <- param_final$seed
print(param_final)

In [None]:
# Fit the validation model with the normalized parameters
cat("\nTraining validate model...\n")
modelo_final <- lgb.train(
  param = param_normalizado,
  data = dtrain_validate
)
cat("Final model trained successfully\n")

In [None]:
# Feature importance table, written to disk in full and previewed on screen
importancia_raw <- lgb.importance(modelo_final)
tb_importancia <- as.data.table(importancia_raw)
fwrite(tb_importancia, file = "impo.txt", sep = "\t")

cat("\nTop 10 most important features:\n")
print(head(tb_importancia, n = 10))

# Persist the fitted model
lgb.save(modelo_final, "modelo.txt")
cat("\nModel and importance saved\n")

## 9. Validation - Predictions

In [None]:
# Score the validation period with the fitted model
dvalidate <- dataset[foto_mes %in% PARAM$validate]

cat("Validation dataset:\n")
cat("  Rows:", nrow(dvalidate), "\n")
print(dvalidate[, .N, by = clase_ternaria])

# Same feature columns, in the same order, as used for training
matriz_validate <- data.matrix(dvalidate[, campos_buenos, with = FALSE])
prediccion <- predict(modelo_final, matriz_validate)
rm(matriz_validate)

cat("\nPredictions generated\n")
cat("  Mean probability:", mean(prediccion), "\n")
cat("  Min probability:", min(prediccion), "\n")
cat("  Max probability:", max(prediccion), "\n")

In [None]:
# Prediction table: client, month and predicted probability
tb_prediccion <- dvalidate[, list(numero_de_cliente, foto_mes)]
set(tb_prediccion, j = "prob", value = prediccion)

fwrite(tb_prediccion, file = "prediccion.txt", sep = "\t")

# Reality table (with public/private folds) for the gain evaluation
drealidad <- realidad_inicializar(dvalidate, PARAM)

cat("Prediction table created and saved\n")

## 10. Kaggle Submissions (Validation)

In [None]:
# Generate one submission file per cutoff and record the gain of each.
# For a cutoff of N, the N rows with the highest probability are marked as
# Predicted = 1.
setorder(tb_prediccion, -prob)
dir.create("kaggle", showWarnings = FALSE)

cat("\n--- Validation Gains ---\n")
cat(sprintf("%-10s %-15s %-15s %-15s %-15s\n",
            "Envios", "Threshold", "Total", "Public", "Private"))
cat(strrep("-", 70), "\n")

n_predicciones <- nrow(tb_prediccion)

# Results are collected in a preallocated list and bound once at the end,
# instead of growing a data.table with rbind() on every iteration.
gain_rows <- vector("list", length(PARAM$cortes))

for (k in seq_along(PARAM$cortes)) {
  envios <- PARAM$cortes[k]

  # A cutoff larger than the table cannot select more rows than exist;
  # 1:envios would then index past the end.
  n_enviados <- min(envios, n_predicciones)

  tb_prediccion[, Predicted := 0L]
  if (n_enviados > 0) {
    tb_prediccion[seq_len(n_enviados), Predicted := 1L]
  }

  archivo_kaggle <- paste0("./kaggle/KA", PARAM$experimento, "_", envios, ".csv")
  fwrite(tb_prediccion[, list(numero_de_cliente, Predicted)],
    file = archivo_kaggle, sep = ",")

  res <- realidad_evaluar(drealidad, tb_prediccion)

  # Probability of the last row that was sent
  prob_threshold <- if (n_enviados > 0) tb_prediccion[n_enviados, prob] else NA_real_

  gain_rows[[k]] <- data.table(
    envios = envios,
    prob_threshold = prob_threshold,
    gain_total = res$total,
    gain_public = res$public,
    gain_private = res$private
  )

  cat(sprintf("%-10d %-15.6f %-15.0f %-15.0f %-15.0f\n",
    envios, prob_threshold, res$total, res$public, res$private))
}

gain_results <- rbindlist(gain_rows)

# Save results
fwrite(gain_results, file = "gain_results.csv")
cat("\nGain results saved to gain_results.csv\n")

In [None]:
# Pick the cutoff with the highest total gain (first one in case of ties)
options(scipen = 999)
mejor_idx <- which.max(gain_results$gain_total)
optimal_row <- gain_results[mejor_idx]

cat("\n=== OPTIMAL CUTOFF ===\n")
cat("Envios:", optimal_row$envios, "\n")
cat("Probability threshold:", optimal_row$prob_threshold, "\n")
cat("Total gain:", optimal_row$gain_total, "\n")
cat("Public gain:", optimal_row$gain_public, "\n")
cat("Private gain:", optimal_row$gain_private, "\n")

In [None]:
# Plot total gain against the number of submissions (p1) and against the
# probability threshold (p2); a dashed red line marks the optimal cutoff.
p1 <- ggplot(gain_results, aes(x = envios, y = gain_total)) +
  # linewidth is the line-width argument in ggplot2 >= 3.4.0, where `size`
  # for lines is deprecated
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "blue", size = 3) +
  geom_vline(xintercept = optimal_row$envios, linetype = "dashed", color = "red") +
  # annotate() draws the label once; geom_text() with constant aesthetics
  # drew one overlapping copy per row of gain_results
  annotate("text",
           x = optimal_row$envios,
           y = max(gain_results$gain_total),
           label = paste("Optimal:", optimal_row$envios),
           hjust = -0.1, color = "red") +
  labs(title = "Total Gain vs Number of Submissions",
       x = "Number of Submissions", y = "Total Gain") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

p2 <- ggplot(gain_results, aes(x = prob_threshold, y = gain_total)) +
  geom_line(color = "darkgreen", linewidth = 1) +
  geom_point(color = "darkgreen", size = 3) +
  geom_vline(xintercept = optimal_row$prob_threshold, linetype = "dashed", color = "red") +
  labs(title = "Total Gain vs Probability Threshold",
       x = "Probability Threshold", y = "Total Gain") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

ggsave("gain_vs_envios.png", p1, width = 10, height = 6)
ggsave("gain_vs_threshold.png", p2, width = 10, height = 6)

cat("\nPlots saved:\n")
cat("  - gain_vs_envios.png\n")
cat("  - gain_vs_threshold.png\n")

print(p1)
print(p2)

In [None]:
# Stability analysis setup: seeds to retrain with, an empty results table,
# and diagnostic output about the objects the loop below relies on.
options(scipen = 999)
stability_seeds <- c(450421, 450599, 862019)
stability_results <- data.table()

cat("=== DEBUG: Starting analysis ===\n")

# Check if objects exist
cat("1. Checking object existence...\n")
objetos_requeridos <- c(
  "param_normalizado", "dtrain_validate", "dvalidate", "campos_buenos", "PARAM"
)
for (nombre_objeto in objetos_requeridos) {
  cat(sprintf("   %s exists:", nombre_objeto), exists(nombre_objeto), "\n")
}

# Check object types and sizes (only for objects that are present)
cat("2. Checking object types...\n")
if (exists("param_normalizado")) {
  cat("   param_normalizado type:", class(param_normalizado), "\n")
  cat("   param_normalizado length:", length(param_normalizado), "\n")
}

if (exists("dtrain_validate")) {
  cat("   dtrain_validate type:", class(dtrain_validate), "\n")
}

if (exists("dvalidate")) {
  cat("   dvalidate dimensions:", dim(dvalidate), "\n")
}

if (exists("campos_buenos")) {
  cat("   campos_buenos length:", length(campos_buenos), "\n")
}

cat("3. About to start loop...\n")
for (seed_idx in seq_along(stability_seeds)) {
  cat("Seed iteration:", seed_idx, "of", length(stability_seeds), "\n")
  
  param_test <- param_normalizado
  param_test$seed <- stability_seeds[seed_idx]
  cat("  Parameters prepared\n")
  
  # Check if training data is valid
  cat("  Training data check...\n")
  cat("    dtrain_validate dimensions:", dim(dtrain_validate), "\n")
  
  # Train the model - this is likely where it hangs
  cat("  Starting model training...\n")
  modelo_test <- lgb.train(data = dtrain_validate, param = param_test)
  cat("  Model training completed\n")  # If you don't see this, training is hanging
  
  # If we get here, the rest should work
  cat("  Making predictions...\n")
  prediccion_test <- predict(modelo_test, 
    data.matrix(dvalidate[, campos_buenos, with = FALSE]))
  
  cat("  Creating results table...\n")
  tb_test <- dvalidate[, list(numero_de_cliente, foto_mes)]
  tb_test[, prob := prediccion_test]
  
  drealidad <- realidad_inicializar(dvalidate, PARAM)
  setorder(tb_test, -prob)
  
  cat("\n--- Seed:", stability_seeds[seed_idx], "---\n")
  cat(sprintf("%-10s %-15s %-15s\n", "Envios", "Threshold", "Total"))
  cat(strrep("-", 40), "\n")
  
  # Pre-allocate results for this seed
  seed_results <- list()
  
  for (i in seq_along(PARAM$cortes)) {
    envios <- PARAM$cortes[i]
    tb_test[, Predicted := 0L]
    tb_test[1:envios, Predicted := 1L]
    
    res <- realidad_evaluar(drealidad, tb_test)
    prob_threshold <- tb_test[envios, prob]
    
    seed_results[[i]] <- data.table(
      seed = stability_seeds[seed_idx],
      envios = envios,
      prob_threshold = prob_threshold,
      gain_total = res$total
    )
    
    cat(sprintf("%-10d %-15.6f %-15.0f\n",
      envios, prob_threshold, res$total))
  }
  
  # Combine results for this seed
  stability_results <- rbindlist(list(stability_results, rbindlist(seed_results)))
}

# Optional: Clean up memory
gc()

## 11. Production (Final Prediction)

In [None]:
# Train production model on ALL final training data (PARAM$train_final),
# using the hyperparameters in `param_normalizado`.
cat("\n=== TRAINING PRODUCTION MODEL ===\n")

dfinal <- dataset[foto_mes %in% PARAM$train_final]

# Binary target: BAJA+1 and BAJA+2 are both treated as positives.
# %in% never yields NA, so the integer cast is exactly 0L/1L.
dfinal[, clase01 := as.integer(clase_ternaria %in% c("BAJA+2", "BAJA+1"))]

cat("Production training dataset:\n")
print(dfinal[, .N, by = clase_ternaria])

dfinal_lgb <- lgb.Dataset(
  data = data.matrix(dfinal[, campos_buenos, with = FALSE]),
  label = dfinal[, clase01]
)

# `params` is spelled out in full instead of relying on partial matching
modelo_produccion <- lgb.train(data = dfinal_lgb, params = param_normalizado)
cat("\nProduction model trained successfully\n")

In [None]:
# Score the future period(s) in PARAM$future with the production model
dpredict <- dataset[foto_mes %in% PARAM$future]

cat("\nFuture dataset for prediction:\n  Rows:", nrow(dpredict), "\n")
cat("  Period:", PARAM$future, "\n")

# Same feature columns, in the same order, as used for training
probs_future <- predict(
  object = modelo_produccion,
  data.matrix(dpredict[, campos_buenos, with = FALSE])
)

# Quick sanity summary of the predicted probabilities
cat("\nPredictions generated\n  Mean probability:", mean(probs_future), "\n")
cat("  Min probability:", min(probs_future), "\n")
cat("  Max probability:", max(probs_future), "\n")

In [None]:
# Create final prediction table: one row per future customer, ranked by
# predicted probability, with the top ENVIOS_OPTIMO rows flagged for sending.
tb_pred_final <- data.table(
  numero_de_cliente = dpredict$numero_de_cliente,
  prob = probs_future
)

setorder(tb_pred_final, -prob)  # highest probability first

# Apply cutoff.
# NOTE(review): the cutoff is hard-coded; the validation-derived value below
# is commented out, yet the messages still call it "optimal" - confirm this
# manual override is intended.
#ENVIOS_OPTIMO <- gain_results[which.max(gain_total), envios]
ENVIOS_OPTIMO <- 11000

tb_pred_final[, Predicted := 0L]
# seq_len() flags no rows for a cutoff of 0 (1:0 would flag row 1), and the
# min() keeps the index in range when there are fewer rows than the cutoff
tb_pred_final[
  seq_len(min(ENVIOS_OPTIMO, nrow(tb_pred_final))),
  Predicted := 1L
]

cat("\n=== FINAL SUBMISSION ===\n")
cat("Optimal number of submissions:", ENVIOS_OPTIMO, "\n")
cat("Predicted BAJA+2:", sum(tb_pred_final$Predicted), "\n")
cat("Percentage:", round(100 * sum(tb_pred_final$Predicted) / nrow(tb_pred_final), 2), "%\n")

# Save final submission (customer id + 0/1 flag only)
fwrite(tb_pred_final[, list(numero_de_cliente, Predicted)],
  file = "prediccion_final.csv", sep = ",")

# Also save with probabilities for analysis
fwrite(tb_pred_final, file = "prediccion_final_with_probs.csv", sep = ",")

cat("\nFiles saved:\n")
cat("  - prediccion_final.csv (for Kaggle submission)\n")
cat("  - prediccion_final_with_probs.csv (for analysis)\n")

## 12. Summary

In [None]:
# Final summary: console report of the configuration, tuning outcome,
# validation optimum and final submission. Section headings are folded into
# the first cat() of each section; the emitted text is unchanged.
cat(
  "\n ", strrep("=", 70), " \nEXPERIMENT SUMMARY\n", strrep("=", 70), " \n\n",
  sep = ""
)

# Experiment configuration
cat("Configuration:\n  Experiment:", PARAM$experimento, "\n")
cat("  Seed:", PARAM$semilla_primigenia, "\n")
cat("  Training periods:", toString(PARAM$train), "\n")
cat("  Validation period:", PARAM$validate, "\n")
cat("  Future period:", PARAM$future, "\n")
cat("  Undersampling:", PARAM$trainingstrategy$undersampling, "\n")

# Hyperparameter search outcome
cat(
  "\nBayesian Optimization:\n  Iterations:",
  PARAM$hyperparametertuning$iteraciones, "\n"
)
cat("  Best AUC:", PARAM$out$lgbm$y, "\n")

# Best cutoff found on validation
cat("\nValidation Results:\n  Optimal envios:", optimal_row$envios, "\n")
cat("  Optimal threshold:", round(optimal_row$prob_threshold, 6), "\n")
cat("  Expected total gain:", optimal_row$gain_total, "\n")
cat("  Expected public gain:", optimal_row$gain_public, "\n")
cat("  Expected private gain:", optimal_row$gain_private, "\n")

# Final submission
cat("\nFinal Prediction:\n  Total customers:", nrow(tb_pred_final), "\n")
cat("  Predicted BAJA+2:", sum(tb_pred_final$Predicted), "\n")
cat("  Submission file: prediccion_final.csv\n\nOutput directory:", getwd(), "\n")

cat("\nCompleted:", format(Sys.time(), "%a %b %d %X %Y"), "\n")
cat(strrep("=", 70), "\n")