# 610 WorkFlow Gerencial - Nicolas Horn (v6)

Basado en v2 con hiperparametros optimizados de BO local

**Tecnicas v6:**
- Catastrophe Analysis (13 variables en 202006 -> NA)
- Data Drifting por IPC (deflactacion)
- FE Historico: lags (1,2) + deltas (1,2) + trends (3,6)
- **Hiperparametros optimizados:**
  - num_leaves = 212 (de BO)
  - min_data_in_leaf = 1000 (ajustado, BO dio 2)
  - num_iterations = 729 (de BO)
  - feature_fraction = 0.5
  - learning_rate = 0.03

**5 semillas:** 153929, 838969, 922081, 795581, 194609

## Seteo Google Colab (Python3)

In [None]:
# Mount Google Drive inside the Colab VM; the shell cell below expects the
# Drive contents to be reachable under /content/.drive.
from google.colab import drive

punto_de_montaje = '/content/.drive'
drive.mount(punto_de_montaje)

In [None]:
%%shell

# Prepare the working tree on the Colab VM:
#   * expose the Drive folder "labo1" as /content/buckets/b1,
#   * install the Kaggle credentials when they exist in the bucket,
#   * make sure the dataset is present in the bucket (persistent copy) and on
#     the VM's local disk (working copy).

mkdir -p "/content/.drive/My Drive/labo1"
mkdir -p "/content/buckets"
# -n: replace the link itself when it already exists. Without it, re-running
# this cell follows the existing link and creates a stray "labo1" symlink
# inside the Drive folder.
ln -sfn "/content/.drive/My Drive/labo1" /content/buckets/b1

# Kaggle credentials are optional: both commands are best-effort.
mkdir -p ~/.kaggle
cp /content/buckets/b1/kaggle/kaggle.json ~/.kaggle 2>/dev/null || true
chmod 600 ~/.kaggle/kaggle.json 2>/dev/null || true

mkdir -p /content/buckets/b1/exp
mkdir -p /content/buckets/b1/datasets
mkdir -p /content/datasets

webfiles="https://storage.googleapis.com/open-courses/austral2025-af91/"
destino_local="/content/datasets"
destino_bucket="/content/buckets/b1/datasets"
archivo="gerencial_competencia_2025.csv.gz"

if ! test -f "$destino_bucket/$archivo"; then
  # Download under a temporary name and rename only on success: "wget -O"
  # creates the output file even when the transfer fails, and a truncated
  # file would otherwise satisfy the "test -f" check on every later run.
  if wget "$webfiles/$archivo" -O "$destino_bucket/$archivo.part"; then
    mv "$destino_bucket/$archivo.part" "$destino_bucket/$archivo"
  else
    rm -f "$destino_bucket/$archivo.part"
    echo "ERROR: no se pudo descargar $archivo" >&2
    exit 1
  fi
fi

if ! test -f "$destino_local/$archivo"; then
  cp "$destino_bucket/$archivo" "$destino_local/$archivo"
fi

ls -lh "$destino_local/$archivo"

## Inicializacion R

**Cambiar Runtime a R**

In [None]:
# Show the wall-clock time at which the R part of the notebook starts.
format(x = Sys.time(), format = "%a %b %d %X %Y")

In [None]:
# Remove every object from the workspace, including hidden names that start
# with a dot (all.names=TRUE), so the notebook starts from a clean state.
rm(list=ls(all.names=TRUE))
# Run a full garbage collection to release the memory of the removed objects.
gc(full=TRUE, verbose=FALSE)

In [None]:
require("data.table")
if(!require("R.utils")) install.packages("R.utils")
require("R.utils")

## Parametros Globales

In [None]:
# Settings shared by every seed run: base experiment number, dataset file
# name and the seeds to iterate over.
PARAM_GLOBAL <- list(
  experimento_base = 6160,
  dataset = "gerencial_competencia_2025.csv.gz",
  semillas = c(153929, 838969, 922081, 795581, 194609)
)

# Filled by the main loop with one result list per seed.
resultados_totales <- list()

## Indices IPC

In [None]:
# Lookup table with one IPC factor per month (foto_mes); the value for
# 202012 is exactly 1. The main loop joins it on foto_mes and multiplies the
# monetary columns by IPC.
tb_indices <- data.table(
  foto_mes = c(
    202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
    202101, 202102, 202103, 202104, 202105, 202106, 202107
  ),
  IPC = c(
    1.2118694724, 1.1881073259,
    1.1693969743, 1.1375456949, 1.1065619600,
    1.0681100000, 1.0370000000, 1.0000000000,
    0.9680542110, 0.9344152616, 0.8882274350,
    0.8532444140, 0.8251880213, 0.8003763543,
    0.7763107219
  )
)

# Keep the plain vectors available under their original names.
vfoto_mes <- tb_indices$foto_mes
vIPC <- tb_indices$IPC

print(tb_indices)

## Funcion Tendencia

In [None]:
# Least-squares slope of y against its positions 1..length(y).
#
# NA entries are skipped but the remaining points keep their original
# positions, so gaps do not compress the x axis.
#
# y: numeric vector (may contain NA).
# Returns a single number: the slope, or NA_real_ when fewer than two
# non-NA points are available or the denominator is zero.
calc_slope_fast <- function(y) {
  pos <- which(!is.na(y))
  k <- length(pos)
  if (k < 2) {
    return(NA_real_)
  }

  obs <- y[pos]

  s_x <- sum(pos)
  s_y <- sum(obs)
  s_xy <- sum(pos * obs)
  s_xx <- sum(pos^2)

  # Closed-form simple linear regression: (k*Sxy - Sx*Sy) / (k*Sxx - Sx^2)
  denominador <- k * s_xx - s_x^2
  if (denominador == 0) {
    return(NA_real_)
  }

  (k * s_xy - s_x * s_y) / denominador
}

## Loop Principal - 5 Semillas

In [None]:
# Main experiment: train and score one LightGBM model per seed in
# PARAM_GLOBAL$semillas.
#
# Everything that does not depend on the seed (package loading, dataset load,
# catastrophe analysis, IPC deflation, feature engineering, target and fold
# columns) runs ONCE before the loop instead of once per seed. The features
# are the same for every seed, so only the runtime changes. Columns are
# created in exactly the same order as before, because that order is the
# feature order handed to LightGBM.
#
# Reads objects defined in earlier cells: PARAM_GLOBAL, tb_indices,
# calc_slope_fast, resultados_totales.
# Per seed, inside /content/buckets/b1/exp/<experiment folder>, writes:
# modelo.txt, impo.txt, prediccion.txt, ganancias.txt,
# curva_de_ganancia.pdf and PARAM.yml; the seed's result is also stored in
# resultados_totales[[seed_idx]].
# The per-seed duration that is printed covers training and scoring only; the
# shared preparation is timed separately ("FE completado en ...").

# Load the modelling packages once; library() stops with an error if the
# installation did not succeed, instead of failing later with a missing name.
if (!require("lightgbm")) install.packages("lightgbm")
library("lightgbm")
if (!require("yaml")) install.packages("yaml")
library("yaml")

# ---- Seed-independent preparation (runs once) ------------------------------

# Carga del dataset
cat("Cargando dataset...\n")
dataset <- fread(paste0("/content/datasets/", PARAM_GLOBAL$dataset))
cat("Dataset:", nrow(dataset), "x", ncol(dataset), "\n\n")

# Catastrophe Analysis: blank out these 13 variables for foto_mes 202006.
cat("Catastrophe Analysis (13 variables -> NA)...\n")
vars_catastrofe <- c(
  "internet", "mrentabilidad", "mrentabilidad_annual", "mcomisiones",
  "mactivos_margen", "mpasivos_margen", "mcuentas_saldo",
  "ctarjeta_visa_transacciones", "mtarjeta_visa_consumo",
  "mtarjeta_master_consumo", "ccallcenter_transacciones",
  "chomebanking_transacciones", "ctarjeta_master_transacciones"
)
for (vcat in vars_catastrofe) {
  dataset[foto_mes == 202006, (vcat) := NA]
}

# Data Drifting - IPC: multiply every column whose name starts with "m" by
# the IPC factor of its month (update join against tb_indices).
cat("Data Drifting (IPC)...\n")
campos_monetarios <- colnames(dataset)[colnames(dataset) %like% "^m"]
dataset[tb_indices, on = "foto_mes", (campos_monetarios) := .SD * i.IPC, .SDcols = campos_monetarios]

# FE Intra-mes
cat("FE Intra-mes...\n")
dataset[, kmes := foto_mes %% 100]
if ("mpayroll" %in% colnames(dataset) && "cliente_edad" %in% colnames(dataset)) {
  dataset[, mpayroll_sobre_edad := mpayroll / cliente_edad]
}

# FE Historico
cat("FE Historico (lags + deltas + trends)...\n")
inicio_fe <- Sys.time()
# shift() and frollapply() below work on row order within each client, so the
# rows must be sorted by client and month first.
setorder(dataset, numero_de_cliente, foto_mes)

cols_lagueables <- setdiff(colnames(dataset), c("numero_de_cliente", "foto_mes", "clase_ternaria"))

# Lags
dataset[, paste0(cols_lagueables, "_lag1") := shift(.SD, 1, NA, "lag"), by = numero_de_cliente, .SDcols = cols_lagueables]
dataset[, paste0(cols_lagueables, "_lag2") := shift(.SD, 2, NA, "lag"), by = numero_de_cliente, .SDcols = cols_lagueables]

# Deltas (delta1/delta2 are interleaved per variable on purpose: this keeps
# the original column order)
for (vcol in cols_lagueables) {
  dataset[, paste0(vcol, "_delta1") := get(vcol) - get(paste0(vcol, "_lag1"))]
  dataset[, paste0(vcol, "_delta2") := get(vcol) - get(paste0(vcol, "_lag2"))]
}

# Trends: rolling slope over the last 3 and the last 6 rows of each client
for (col in cols_lagueables) {
  dataset[, paste0(col, "_trend_3") := frollapply(get(col), 3, calc_slope_fast, align="right"), by = numero_de_cliente]
}

for (col in cols_lagueables) {
  dataset[, paste0(col, "_trend_6") := frollapply(get(col), 6, calc_slope_fast, align="right"), by = numero_de_cliente]
}

cat("FE completado en", round(difftime(Sys.time(), inicio_fe, units="mins"), 1), "min\n")
cat("Dataset:", ncol(dataset), "columnas\n\n")

# Training Strategy (only final_train and future are used below; validate and
# training are recorded in PARAM.yml)
cat("Training Strategy...\n")
trainingstrategy <- list()
trainingstrategy$validate <- c(202105)
trainingstrategy$training <- c(202104, 202103, 202102, 202101, 202012, 202011, 202010, 202009, 202008, 202007, 202006, 202005)
trainingstrategy$final_train <- c(202105, 202104, 202103, 202102, 202101, 202012, 202011, 202010, 202009, 202008, 202007, 202006, 202005)
trainingstrategy$future <- c(202107)

# Binary target: both BAJA+1 and BAJA+2 count as positives.
dataset[, clase01 := ifelse(clase_ternaria %in% c("BAJA+1", "BAJA+2"), 1, 0)]

# Model features: every column except the targets and the helper columns
# (azar and fold_final_train are created after this point and must never be
# picked up as features).
campos_buenos <- setdiff(colnames(dataset), c("clase_ternaria", "clase01", "azar", "fold_final_train"))

dataset[, fold_final_train := foto_mes %in% trainingstrategy$final_train]

cat("Features:", length(campos_buenos), "\n\n")

# ---- One model per seed ----------------------------------------------------

for (seed_idx in seq_along(PARAM_GLOBAL$semillas)) {

  cat("\n\n========================================\n")
  cat("SEMILLA", seed_idx, "de", length(PARAM_GLOBAL$semillas), "\n")
  cat("Semilla:", PARAM_GLOBAL$semillas[seed_idx], "\n")
  cat("========================================\n\n")

  inicio_seed <- Sys.time()

  PARAM <- list()
  PARAM$semilla_primigenia <- PARAM_GLOBAL$semillas[seed_idx]
  PARAM$experimento <- PARAM_GLOBAL$experimento_base + seed_idx - 1
  PARAM$dataset <- PARAM_GLOBAL$dataset
  PARAM$trainingstrategy <- trainingstrategy

  # Carpeta del Experimento: all output files below use relative names, so
  # the working directory is moved into the experiment folder.
  experimento_folder <- paste0("WF", PARAM$experimento, "_seed", seed_idx, "_v6_BO")
  ruta_experimento <- file.path("/content/buckets/b1/exp", experimento_folder)
  dir.create(ruta_experimento, showWarnings = FALSE, recursive = TRUE)
  setwd(ruta_experimento)

  cat("Carpeta:", experimento_folder, "\n\n")

  # Per-seed random column. It is excluded from the features and is not read
  # anywhere else in this block; kept so the R RNG is seeded as before.
  set.seed(PARAM$semilla_primigenia, kind = "L'Ecuyer-CMRG")
  dataset[, azar := runif(nrow(dataset))]

  # Hiperparametros optimizados
  cat("Hiperparametros (BO + ajustes):\n")
  cat("  num_leaves = 212 (de BO)\n")
  cat("  min_data_in_leaf = 1000 (ajustado)\n")
  cat("  num_iterations = 729 (de BO)\n")
  cat("  feature_fraction = 0.5\n")
  cat("  learning_rate = 0.03\n\n")

  param_final <- list(
    objective = "binary",
    metric = "auc",
    first_metric_only = TRUE,
    boost_from_average = TRUE,
    feature_pre_filter = FALSE,
    verbosity = -100,
    force_row_wise = TRUE,
    seed = PARAM$semilla_primigenia,
    max_bin = 31,
    # Hiperparametros de BO + ajustes
    num_leaves = 212,
    min_data_in_leaf = 1000,
    num_iterations = 729,
    feature_fraction = 0.5,
    learning_rate = 0.03
  )

  # Final Training
  cat("Entrenando modelo final...\n")

  dfinal_train <- lgb.Dataset(
    data = data.matrix(dataset[fold_final_train == TRUE, campos_buenos, with = FALSE]),
    label = dataset[fold_final_train == TRUE, clase01],
    free_raw_data = TRUE
  )

  inicio_train <- Sys.time()
  # "params" is spelled out in full (the previous "param" only worked through
  # partial argument matching).
  final_model <- lgb.train(params = param_final, data = dfinal_train, verbose = -100)
  cat("Modelo entrenado en", round(difftime(Sys.time(), inicio_train, units="mins"), 1), "min\n\n")

  lgb.save(final_model, "modelo.txt")
  fwrite(as.data.table(lgb.importance(final_model)), file = "impo.txt", sep = "\t")

  # Scoring
  cat("Scoring 202107...\n")
  dfuture <- dataset[foto_mes %in% PARAM$trainingstrategy$future]
  prediccion <- predict(final_model, data.matrix(dfuture[, campos_buenos, with = FALSE]))

  tb_prediccion <- dfuture[, list(numero_de_cliente)]
  tb_prediccion[, prob := prediccion]
  fwrite(tb_prediccion, file = "prediccion.txt", sep = "\t")

  # Curva de Ganancia: -3000 per mailing, 117000 when the client is BAJA+2.
  # NOTE(review): this assumes clase_ternaria is populated for 202107 - confirm.
  tb_prediccion[, clase_ternaria := dfuture$clase_ternaria]
  tb_prediccion[, ganancia := -3000.0]
  tb_prediccion[clase_ternaria == "BAJA+2", ganancia := 117000.0]

  # Highest probability first; cumulative gain smoothed with a centred
  # 400-row rolling mean.
  setorder(tb_prediccion, -prob)
  tb_prediccion[, gan_acum := cumsum(ganancia)]
  tb_prediccion[, gan_suavizada := frollmean(gan_acum, 400, align="center", na.rm=TRUE)]

  resultado <- list()
  resultado$ganancia_suavizada_max <- max(tb_prediccion$gan_suavizada, na.rm=TRUE)
  resultado$envios <- which.max(tb_prediccion$gan_suavizada)
  resultado$semilla <- PARAM$semilla_primigenia
  resultado$seed_idx <- seed_idx

  fwrite(tb_prediccion, file = "ganancias.txt", sep = "\t")

  # Grafico: the device is always closed, even if plot() fails, so a failed
  # plot cannot leave an open PDF device behind.
  tb_prediccion[, envios_num := .I]
  pdf("curva_de_ganancia.pdf")
  tryCatch(
    plot(x = tb_prediccion$envios_num, y = tb_prediccion$gan_acum, type = "l", col = "gray",
         xlim = c(0, 6000), ylim = c(0, 8000000),
         main = paste0("Seed ", seed_idx, " (v6 BO) - Gan=", as.integer(resultado$ganancia_suavizada_max)),
         xlab = "Envios", ylab = "Ganancia", panel.first = grid()),
    finally = dev.off()
  )

  # Guardar
  PARAM$resultado <- resultado
  write_yaml(PARAM, file = "PARAM.yml")

  resultados_totales[[seed_idx]] <- resultado

  fin_seed <- Sys.time()
  duracion <- as.numeric(difftime(fin_seed, inicio_seed, units = "mins"))

  # Release the per-seed objects; the prepared dataset is kept for the next
  # seed.
  rm(dfinal_train, final_model, tb_prediccion, dfuture, prediccion)
  gc(full = TRUE, verbose = FALSE)

  cat("\n========================================\n")
  cat("Semilla", seed_idx, "completada en", round(duracion, 1), "min\n")
  cat("Ganancia:", formatC(resultado$ganancia_suavizada_max, format="f", big.mark=",", digits=0), "\n")
  cat("Envios:", resultado$envios, "\n")
  cat("========================================\n")
}

# The prepared dataset is no longer needed once every seed has been processed.
rm(dataset)
invisible(gc(full = TRUE, verbose = FALSE))

cat("\n*** TODAS LAS SEMILLAS PROCESADAS ***\n")

## Resumen Final

In [None]:
# Final summary: collect the per-seed results stored in resultados_totales
# into one table, print basic statistics and save both the table and the raw
# result list under /content/buckets/b1/exp.
setwd("/content/buckets/b1/exp")

# Fail with a clear message when the seed loop has not produced any result.
if (length(resultados_totales) == 0) {
  stop("resultados_totales esta vacio: ejecutar primero el loop de semillas", call. = FALSE)
}

# vapply() guarantees one number per seed for every column; a missing or
# malformed entry raises an error here instead of silently producing a list
# column.
tb_resumen <- data.table(
  seed_idx = as.integer(vapply(resultados_totales, function(x) x$seed_idx, numeric(1))),
  semilla = vapply(resultados_totales, function(x) x$semilla, numeric(1)),
  ganancia = vapply(resultados_totales, function(x) x$ganancia_suavizada_max, numeric(1)),
  envios = as.integer(vapply(resultados_totales, function(x) x$envios, numeric(1)))
)

# rank 1 = seed with the highest gain
tb_resumen[, rank := rank(-ganancia)]

cat("\n========================================\n")
cat("RESUMEN FINAL - v6 (BO optimizado)\n")
cat("========================================\n\n")
print(tb_resumen)

# The mean gain is reported twice below, so it is formatted once.
ganancia_promedio_txt <- formatC(mean(tb_resumen$ganancia), format="f", big.mark=",", digits=0)

cat("\nESTADISTICAS:\n")
cat("Ganancia promedio:", ganancia_promedio_txt, "\n")
cat("Ganancia maxima:", formatC(max(tb_resumen$ganancia), format="f", big.mark=",", digits=0), "\n")
cat("Ganancia minima:", formatC(min(tb_resumen$ganancia), format="f", big.mark=",", digits=0), "\n")
cat("Desv estandar:", formatC(sd(tb_resumen$ganancia), format="f", big.mark=",", digits=0), "\n")

fwrite(tb_resumen, file = paste0("resumen_v6_exp", PARAM_GLOBAL$experimento_base, ".txt"), sep = "\t")
saveRDS(resultados_totales, file = paste0("resultados_v6_exp", PARAM_GLOBAL$experimento_base, ".rds"))

# The v2 and v4 figures are fixed reference values typed into this cell; only
# the v6 figure is computed from this run.
cat("\nCOMPARACION VERSIONES:\n")
cat("v2 (num_leaves=25, min_data=2764, iter=2009): 6,456,000\n")
cat("v4 (+rolling+RF):                             6,267,840\n")
cat("v6 (num_leaves=212, min_data=1000, iter=729):", ganancia_promedio_txt, "\n")

In [None]:
# Show the wall-clock time at which the notebook finishes.
format(x = Sys.time(), format = "%a %b %d %X %Y")