# 610 WorkFlow Gerencial - Nicolas Horn (KAGGLE v2 params + lag3/delta3)

Usando hiperparametros conservadores de v2 + lags/deltas extendidos

**Tecnicas:**
- Catastrophe Analysis (13 variables en 202006 -> NA)
- Data Drifting por IPC (deflactacion)
- FE Historico: lags (1,2,3) + deltas (1,2,3) + trends (3,6)
- **Hiperparametros v2 (conservadores):**
  - num_leaves = 25
  - min_data_in_leaf = 2764
  - num_iterations = 2009
  - feature_fraction = 0.5
  - learning_rate = 0.03

- **5 semillas:** 153929, 838969, 922081, 795581, 194609
- **Competencia:** labo-i-2025-virtual-gerencial
- **Cortes:** 800 a 1400 (step 50)

## Seteo Google Colab (Python3)

In [None]:
# Mount the user's Google Drive inside the Colab VM. The mount point is the
# hidden directory that the shell cell below links into /content/buckets/b1.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/.drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
%%shell

# Prepare the Colab workspace:
#   1. expose the Drive folder "labo1" as /content/buckets/b1,
#   2. install the Kaggle API credentials (best effort),
#   3. make sure the competition dataset exists both in the bucket (persistent)
#      and in /content/datasets (local copy read by the R cells).

mkdir -p "/content/.drive/My Drive/labo1"
mkdir -p "/content/buckets"
# -n: replace the link itself when the cell is re-run; without it, ln follows
# the existing b1 link and creates a stray "labo1" link inside the Drive folder.
ln -sfn "/content/.drive/My Drive/labo1" /content/buckets/b1

# Kaggle credentials are optional at this point, so failures are ignored.
mkdir -p ~/.kaggle
cp /content/buckets/b1/kaggle/kaggle.json ~/.kaggle 2>/dev/null || true
chmod 600 ~/.kaggle/kaggle.json 2>/dev/null || true

mkdir -p /content/buckets/b1/exp
mkdir -p /content/buckets/b1/datasets
mkdir -p /content/datasets

webfiles="https://storage.googleapis.com/open-courses/austral2025-af91/"
destino_local="/content/datasets"
destino_bucket="/content/buckets/b1/datasets"
archivo="gerencial_competencia_2025.csv.gz"

if ! test -f "$destino_bucket/$archivo"; then
  # Download to a temporary name and rename only on success. Writing straight
  # to the final path would leave a truncated file after a failed download,
  # and the existence check above would then accept it on every later run.
  if wget "$webfiles/$archivo" -O "$destino_bucket/$archivo.tmp"; then
    mv "$destino_bucket/$archivo.tmp" "$destino_bucket/$archivo"
  else
    rm -f "$destino_bucket/$archivo.tmp"
    echo "ERROR: no se pudo descargar $archivo" >&2
    exit 1
  fi
fi

if ! test -f "$destino_local/$archivo"; then
  cp "$destino_bucket/$archivo" "$destino_local/$archivo"
fi

ls -lh "$destino_local/$archivo"

## Inicializacion R

**Cambiar Runtime a R**

In [None]:
# Show the wall-clock time at which the R part of the notebook starts.
format(x = Sys.time(), format = "%a %b %d %X %Y")

In [None]:
# Start from a clean slate: drop every object (hidden ones included) from the
# global environment, then run a full garbage collection.
rm(
  list = ls(envir = globalenv(), all.names = TRUE),
  envir = globalenv()
)
gc(verbose = FALSE, full = TRUE)

In [None]:
# Load the packages used by the rest of the notebook.
# library() is used instead of require() so that a missing package stops the
# run here, instead of returning FALSE and failing later at the first call.
library("data.table")

# R.utils is installed on demand.
# NOTE(review): presumably needed so fread() can read the .gz dataset - confirm.
if (!requireNamespace("R.utils", quietly = TRUE)) {
  install.packages("R.utils")
}
library("R.utils")

## Parametros Globales

In [None]:
# Global configuration shared by every seed of the experiment.
PARAM_GLOBAL <- list(
  # Experiment number of the first seed; each later seed adds one.
  experimento_base = 6180,
  # File name of the competition dataset (looked up under /content/datasets).
  dataset = "gerencial_competencia_2025.csv.gz",
  # One full train/score/submit cycle is run per seed.
  semillas = c(153929, 838969, 922081, 795581, 194609),
  # Kaggle competition that receives the submissions.
  kaggle_competencia = "labo-i-2025-virtual-gerencial",
  # Cut points: number of top-scored clients flagged with Predicted = 1.
  kaggle_cortes = seq(from = 800, to = 1400, by = 50)
)

# Filled by the main loop with one summary entry per seed.
resultados_totales <- list()

## Indices IPC

In [None]:
# Months covered by the dataset, 202005 through 202109, kept as doubles.
vfoto_mes <- as.numeric(c(202005:202012, 202101:202109))

# IPC index for each month, positionally aligned with vfoto_mes.
# 202012 is the base month (index 1).
vIPC <- c(
  1.2118694724, # 202005
  1.1881073259, # 202006
  1.1693969743, # 202007
  1.1375456949, # 202008
  1.1065619600, # 202009
  1.0681100000, # 202010
  1.0370000000, # 202011
  1.0000000000, # 202012
  0.9680542110, # 202101
  0.9344152616, # 202102
  0.8882274350, # 202103
  0.8532444140, # 202104
  0.8251880213, # 202105
  0.8003763543, # 202106
  0.7763107219, # 202107
  0.7566381305, # 202108
  0.7289384687  # 202109
)

# Lookup table joined to the dataset by foto_mes in the main loop.
tb_indices <- data.table(
  foto_mes = vfoto_mes,
  IPC = vIPC
)
print(tb_indices)

## Funcion Tendencia

In [None]:
# Least-squares slope of `y` against its positions 1..length(y).
#
# NA entries are skipped, but the remaining points keep their original
# positions as x values.
#
# @param y Numeric vector (a rolling window of one variable).
# @return The slope as a single number, or NA_real_ when fewer than two
#   non-NA points are available or the x values have no spread.
calc_slope_fast <- function(y) {
  pos <- which(!is.na(y))
  k <- length(pos)
  if (k < 2) {
    return(NA_real_)
  }

  obs <- y[pos]

  # Closed-form simple linear regression built from running sums.
  s_x <- sum(pos)
  s_y <- sum(obs)
  s_xy <- sum(pos * obs)
  s_xx <- sum(pos^2)

  divisor <- k * s_xx - s_x^2
  if (divisor == 0) {
    return(NA_real_)
  }

  numerador <- k * s_xy - s_x * s_y
  numerador / divisor
}

## Loop Principal - 5 Semillas

In [None]:
# ---------------------------------------------------------------------------
# Main workflow.
#
# Stage 1 (run once): load the dataset and build every feature. Nothing in
#   this stage depends on the seed, so it is done a single time instead of
#   being repeated for each seed.
# Stage 2 (run per seed): create the experiment folder, train the final
#   LightGBM model, score the future month, write one Kaggle CSV per cut
#   point, submit them and save the parameters.
#
# Reads from other cells: PARAM_GLOBAL, tb_indices, calc_slope_fast.
# Writes to other cells:  resultados_totales (one entry per seed).
# ---------------------------------------------------------------------------

# Optional packages are installed on demand and then loaded, so a failed
# installation stops the run here rather than in the middle of a seed.
if (!requireNamespace("lightgbm", quietly = TRUE)) install.packages("lightgbm")
library("lightgbm")
if (!requireNamespace("yaml", quietly = TRUE)) install.packages("yaml")
library("yaml")

# Make sure the experiments root exists.
if (!dir.exists("/content/buckets/b1/exp")) {
  dir.create("/content/buckets/b1/exp", showWarnings = FALSE, recursive = TRUE)
}

# ---- Stage 1: seed-independent data preparation ---------------------------

# Load the dataset
cat("Cargando dataset...\n")
dataset <- fread(paste0("/content/datasets/", PARAM_GLOBAL$dataset))
cat("Dataset:", nrow(dataset), "x", ncol(dataset), "\n\n")

# Catastrophe Analysis: these 13 variables are blanked (set to NA) in 202006.
cat("Catastrophe Analysis (13 variables -> NA)...\n")
campos_catastrofe <- c(
  "internet", "mrentabilidad", "mrentabilidad_annual", "mcomisiones",
  "mactivos_margen", "mpasivos_margen", "mcuentas_saldo",
  "ctarjeta_visa_transacciones", "mtarjeta_visa_consumo",
  "mtarjeta_master_consumo", "ccallcenter_transacciones",
  "chomebanking_transacciones", "ctarjeta_master_transacciones"
)
dataset[foto_mes == 202006, (campos_catastrofe) := NA]

# Data Drifting: every column whose name starts with "m" is multiplied by the
# IPC index of its month (update-join against tb_indices on foto_mes).
cat("Data Drifting (IPC)...\n")
campos_monetarios <- colnames(dataset)[colnames(dataset) %like% "^m"]
dataset[tb_indices, on = "foto_mes", (campos_monetarios) := .SD * i.IPC, .SDcols = campos_monetarios]

# Intra-month feature engineering
cat("FE Intra-mes...\n")
dataset[, kmes := foto_mes %% 100]
if (all(c("mpayroll", "cliente_edad") %in% colnames(dataset))) {
  dataset[, mpayroll_sobre_edad := mpayroll / cliente_edad]
}

# Historical feature engineering (lags, deltas and trends per client)
cat("FE Historico (lags 1,2,3 + deltas 1,2,3 + trends 3,6)...\n")
inicio_fe <- Sys.time()
# shift() and frollapply() below rely on rows being in time order per client.
setorder(dataset, numero_de_cliente, foto_mes)

cols_lagueables <- setdiff(colnames(dataset), c("numero_de_cliente", "foto_mes", "clase_ternaria"))

# Lags 1, 2, 3 (all lag1 columns first, then lag2, then lag3)
for (n_orden in 1:3) {
  cat("  Lags orden ", n_orden, "...\n", sep = "")
  dataset[, paste0(cols_lagueables, "_lag", n_orden) := shift(.SD, n_orden, NA, "lag"),
    by = numero_de_cliente, .SDcols = cols_lagueables]
}

# Deltas 1, 2, 3: current value minus the corresponding lag.
# Columns are appended per variable (delta1, delta2, delta3), matching the
# original column order so the feature matrix layout does not change.
cat("  Deltas...\n")
for (vcol in cols_lagueables) {
  for (n_orden in 1:3) {
    dataset[, paste0(vcol, "_delta", n_orden) := get(vcol) - get(paste0(vcol, "_lag", n_orden))]
  }
}

# Trends: rolling regression slope over windows of 3 and 6 months.
for (n_ventana in c(3, 6)) {
  cat("  Trends ventana ", n_ventana, "...\n", sep = "")
  for (vcol in cols_lagueables) {
    dataset[, paste0(vcol, "_trend_", n_ventana) := frollapply(get(vcol), n_ventana, calc_slope_fast, align = "right"),
      by = numero_de_cliente]
  }
}

cat("FE completado en", round(difftime(Sys.time(), inicio_fe, units = "mins"), 1), "min\n")
cat("Dataset:", ncol(dataset), "columnas\n\n")

# Training Strategy - KAGGLE: train on 202005..202107, predict 202109.
cat("Training Strategy (KAGGLE)...\n")
trainingstrategy <- list()
trainingstrategy$final_train <- c(202107, 202106, 202105, 202104, 202103, 202102, 202101, 202012, 202011, 202010, 202009, 202008, 202007, 202006, 202005)
trainingstrategy$future <- c(202109)

# Binary target: BAJA+1 and BAJA+2 are the positive class.
dataset[, clase01 := ifelse(clase_ternaria %in% c("BAJA+1", "BAJA+2"), 1, 0)]

# The feature list is frozen here, before the helper columns
# (fold_final_train, azar) exist, so they can never leak in as features.
campos_buenos <- setdiff(colnames(dataset), c("clase_ternaria", "clase01", "azar"))
dataset[, fold_final_train := foto_mes %in% trainingstrategy$final_train]

cat("Features:", length(campos_buenos), "\n\n")

# ---- Stage 2: one train / score / submit cycle per seed -------------------

for (seed_idx in seq_along(PARAM_GLOBAL$semillas)) {

  cat("\n\n========================================\n")
  cat("SEMILLA", seed_idx, "de", length(PARAM_GLOBAL$semillas), "\n")
  cat("Semilla:", PARAM_GLOBAL$semillas[seed_idx], "\n")
  cat("========================================\n\n")

  inicio_seed <- Sys.time()

  # Per-seed parameters; saved to PARAM.yml at the end of the iteration.
  PARAM <- list()
  PARAM$semilla_primigenia <- PARAM_GLOBAL$semillas[seed_idx]
  PARAM$experimento <- PARAM_GLOBAL$experimento_base + seed_idx - 1
  PARAM$dataset <- PARAM_GLOBAL$dataset
  PARAM$trainingstrategy <- trainingstrategy

  # Experiment folder: every output below is written relative to it.
  setwd("/content/buckets/b1/exp")
  experimento_folder <- paste0("WF", PARAM$experimento, "_seed", seed_idx, "_v2params_lag3")
  dir.create(experimento_folder, showWarnings = FALSE)
  setwd(paste0("/content/buckets/b1/exp/", experimento_folder))
  dir.create("kaggle", showWarnings = FALSE)

  cat("Carpeta:", experimento_folder, "\n\n")

  # Seed R's RNG and draw the per-row random column. azar is excluded from
  # campos_buenos, so it is never used as a feature.
  set.seed(PARAM$semilla_primigenia, kind = "L'Ecuyer-CMRG")
  dataset[, azar := runif(nrow(dataset))]

  # v2 hyperparameters (conservative)
  cat("Hiperparametros v2 (conservadores):\n")
  cat("  num_leaves = 25\n")
  cat("  min_data_in_leaf = 2764\n")
  cat("  num_iterations = 2009\n")
  cat("  feature_fraction = 0.5\n")
  cat("  learning_rate = 0.03\n\n")

  param_final <- list(
    objective = "binary",
    metric = "auc",
    first_metric_only = TRUE,
    boost_from_average = TRUE,
    feature_pre_filter = FALSE,
    verbosity = -100,
    force_row_wise = TRUE,
    seed = PARAM$semilla_primigenia,
    max_bin = 31,
    num_leaves = 25,
    min_data_in_leaf = 2764,
    num_iterations = 2009,
    feature_fraction = 0.5,
    learning_rate = 0.03
  )

  # Final training
  cat("Entrenando modelo final...\n")

  dfinal_train <- lgb.Dataset(
    data = data.matrix(dataset[fold_final_train == TRUE, campos_buenos, with = FALSE]),
    label = dataset[fold_final_train == TRUE, clase01],
    free_raw_data = TRUE
  )

  inicio_train <- Sys.time()
  final_model <- lgb.train(data = dfinal_train, params = param_final, verbose = -100)
  cat("Modelo entrenado en", round(difftime(Sys.time(), inicio_train, units = "mins"), 1), "min\n\n")

  lgb.save(final_model, "modelo.txt")
  fwrite(as.data.table(lgb.importance(final_model)), file = "impo.txt", sep = "\t")

  # Scoring 202109 (KAGGLE)
  cat("Scoring 202109 (KAGGLE)...\n")
  dfuture <- dataset[foto_mes %in% PARAM$trainingstrategy$future]
  prediccion <- predict(final_model, data.matrix(dfuture[, campos_buenos, with = FALSE]))

  tb_prediccion <- dfuture[, list(numero_de_cliente)]
  tb_prediccion[, prob := prediccion]
  fwrite(tb_prediccion, file = "prediccion.txt", sep = "\t")

  # Highest probability first, so the first `envios` rows are the ones flagged.
  setorder(tb_prediccion, -prob)

  # Write one Kaggle CSV per cut point
  cat("Generando", length(PARAM_GLOBAL$kaggle_cortes), "archivos CSV para Kaggle...\n")
  for (envios in PARAM_GLOBAL$kaggle_cortes) {
    # Never index past the last row if there are fewer clients than the cut.
    n_envios <- min(envios, nrow(tb_prediccion))
    tb_prediccion[, Predicted := 0L]
    tb_prediccion[seq_len(n_envios), Predicted := 1L]
    archivo_kaggle <- paste0("./kaggle/KA", PARAM$experimento, "_", envios, ".csv")
    fwrite(tb_prediccion[, list(numero_de_cliente, Predicted)], file = archivo_kaggle, sep = ",")
  }
  cat("Archivos generados en carpeta kaggle/\n\n")

  # Submit to Kaggle
  cat("Enviando a Kaggle...\n")
  for (envios in PARAM_GLOBAL$kaggle_cortes) {
    archivo_kaggle <- paste0("./kaggle/KA", PARAM$experimento, "_", envios, ".csv")
    mensaje_kaggle <- paste0("v2params_lag3_seed_", PARAM$semilla_primigenia, "_corte_", envios)
    cmd <- paste0(
      "kaggle competitions submit -c ", PARAM_GLOBAL$kaggle_competencia,
      " -f ", shQuote(archivo_kaggle), " -m ", shQuote(mensaje_kaggle)
    )
    # system(intern = TRUE) raises an R error only when the command cannot be
    # run at all; a non-zero exit code is reported as a warning plus a
    # "status" attribute on the result. Both cases are checked explicitly so
    # that a failed submission is not reported as OK.
    salida <- tryCatch(
      suppressWarnings(system(cmd, intern = TRUE)),
      error = function(e) structure(conditionMessage(e), status = -1L)
    )
    estado <- attr(salida, "status")
    if (is.null(estado) || estado == 0) {
      cat("  -> corte", envios, "OK\n")
    } else {
      cat("  -> corte", envios, "ERROR\n")
    }
    # Pause between consecutive submissions.
    Sys.sleep(5)
  }

  # Save the parameters used by this seed
  write_yaml(PARAM, file = "PARAM.yml")

  resultados_totales[[seed_idx]] <- list(
    semilla = PARAM$semilla_primigenia,
    experimento = PARAM$experimento
  )

  fin_seed <- Sys.time()
  duracion <- as.numeric(difftime(fin_seed, inicio_seed, units = "mins"))

  # Free the per-seed objects; the dataset is kept for the next seed.
  rm(dfinal_train, final_model, tb_prediccion, dfuture)
  gc(full = TRUE, verbose = FALSE)

  cat("\n========================================\n")
  cat("Semilla", seed_idx, "completada en", round(duracion, 1), "min\n")
  cat("========================================\n")
}

# The dataset is no longer needed once every seed has been processed.
rm(dataset)
gc(full = TRUE, verbose = FALSE)

cat("\n*** TODAS LAS SEMILLAS PROCESADAS ***\n")

## Resumen Final

In [None]:
# Final summary: go back to the experiments root and print, for each processed
# seed, the seed value and the experiment number stored by the main loop.
setwd("/content/buckets/b1/exp")

cat("\n========================================\n")
cat("RESUMEN FINAL - KAGGLE\n")
cat("========================================\n\n")

# seq_along() yields no iterations when resultados_totales is empty, whereas
# 1:length() would iterate over c(1, 0) and fail on the first lookup.
for (i in seq_along(resultados_totales)) {
  resultado <- resultados_totales[[i]]
  cat("Semilla", i, ":", resultado$semilla,
      "- Exp:", resultado$experimento, "\n")
}

cat("\nArchivos CSV en cada carpeta WF*/kaggle/\n")
cat("Submits enviados a:", PARAM_GLOBAL$kaggle_competencia, "\n")

In [None]:
# Show the wall-clock time at which the notebook finishes.
format(x = Sys.time(), format = "%a %b %d %X %Y")