# 610 FINAL - Kaggle Competition Submit
## Nicolas Horn - Mejores Tecnicas de Data Science

**Este notebook es para SUBMIT A KAGGLE (mes 202109)**

**Tecnicas implementadas:**
- Catastrophe Analysis (13 variables en 202006 → NA)
- Data Drifting por IPC (deflacion)
- FE Historico: lags (1,2) + deltas (1,2) + trends (3,6)
- Rolling Statistics: media movil (3) + desviacion estandar (3)
- FE Random Forest simplificado (10 arboles x 8 hojas)
- 5 semillas con submit automatico a Kaggle

**Competencia:** labo-i-2025-virtual-gerencial

## Seteo del ambiente en Google Colab

Runtime en **Python 3** para esta celda

In [None]:
# Mount Google Drive at /content/.drive; the shell cell below links a folder
# from this mount point into /content/buckets.
from google.colab import drive
drive.mount('/content/.drive')

In [None]:
%%shell

# Wipe the whole /content/buckets directory so the symlink below can be
# recreated from scratch (rm -rf removes the link itself, not its target).
rm -rf /content/buckets

# Recreate the Drive-backed folder and the local buckets directory.
mkdir -p "/content/.drive/My Drive/labo1"
mkdir -p "/content/buckets"

# Expose the Drive folder as /content/buckets/b1.
ln -s "/content/.drive/My Drive/labo1" /content/buckets/b1

# Install the Kaggle API token if one is stored in the bucket.
mkdir -p ~/.kaggle
if [ -f "/content/buckets/b1/kaggle/kaggle.json" ]; then
  cp /content/buckets/b1/kaggle/kaggle.json ~/.kaggle/
  chmod 600 ~/.kaggle/kaggle.json
  echo "Kaggle configurado correctamente"
else
  echo "kaggle.json no encontrado"
fi

# Create the directories used by the rest of the notebook.
mkdir -p /content/buckets/b1/exp
mkdir -p /content/buckets/b1/datasets
mkdir -p /content/datasets

# Source URL and destination paths.
webfiles="https://storage.googleapis.com/open-courses/austral2025-af91"
destino_local="/content/datasets"
destino_bucket="/content/buckets/b1/datasets"

# Fetch gerencial_competencia_2025.csv.gz
archivo="gerencial_competencia_2025.csv.gz"
echo "Procesando $archivo..."

# `-s` is false for both a missing and an empty file, so one test is enough.
if [ ! -s "$destino_bucket/$archivo" ]; then
  echo "  Descargando desde web..."
  # Download to a temporary name and only move it into place on success, so
  # an interrupted transfer never leaves a truncated file that would pass
  # the size check on the next run.
  if wget "$webfiles/$archivo" -O "$destino_bucket/$archivo.tmp"; then
    mv "$destino_bucket/$archivo.tmp" "$destino_bucket/$archivo"
  else
    rm -f "$destino_bucket/$archivo.tmp"
    echo "ERROR: no se pudo descargar $archivo" >&2
    exit 1
  fi
fi

# Keep a copy on the local disk, which is where the R cells read from.
if [ ! -s "$destino_local/$archivo" ]; then
  echo "  Copiando a datasets locales..."
  cp "$destino_bucket/$archivo" "$destino_local/$archivo"
fi

ls -lh "$destino_local/$archivo"
echo "=== Configuracion completada ==="

## Inicializacion R

**Cambiar Runtime a R** antes de continuar

In [None]:
# Show the current date and time.
format(Sys.time(), "%a %b %d %X %Y")

In [None]:
# Remove every object from the current environment (names starting with a dot
# included) and then run a full garbage collection.
rm(list=ls(all.names=TRUE))
gc(full=TRUE, verbose=FALSE)

In [None]:
require("data.table")
if(!require("R.utils")) install.packages("R.utils")
require("R.utils")
if(!require("lightgbm")) install.packages("lightgbm")
require("lightgbm")
if(!require("yaml")) install.packages("yaml")
require("yaml")

## Parametros Globales

In [None]:
# Settings shared by every seed run.
PARAM_GLOBAL <- list(
  # Base experiment number (FINAL run for Kaggle).
  experimento_base = 6150,
  dataset = "gerencial_competencia_2025.csv.gz",
  # Five seeds, one full run each.
  semillas = c(153929, 838969, 922081, 795581, 194609),
  # Kaggle competition and the cutoffs (number of positives) to submit.
  kaggle_competencia = "labo-i-2025-virtual-gerencial",
  kaggle_cortes = seq(800, 1400, by = 50)
)

# Per-seed results are appended here.
resultados_totales <- list()

## Indices IPC para Data Drifting

In [None]:
# Months covered by the index table, 202005 through 202109 (17 entries).
vfoto_mes <- c(
  202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
  202101, 202102, 202103, 202104, 202105, 202106, 202107, 202108, 202109
)

# One multiplier per month, in the same order as vfoto_mes. The value for
# 202012 is exactly 1; earlier months are above 1 and later months below 1.
vIPC <- c(
  1.2118694724, 1.1881073259,
  1.1693969743, 1.1375456949, 1.1065619600,
  1.0681100000, 1.0370000000, 1.0000000000,
  0.9680542110, 0.9344152616, 0.8882274350,
  0.8532444140, 0.8251880213, 0.8003763543,
  0.7763107219, 0.7566381305, 0.7289384687
)

# Lookup table keyed by foto_mes; joined against the dataset further down to
# rescale the monetary columns.
tb_indices <- data.table(foto_mes = vfoto_mes, IPC = vIPC)
print(tb_indices)

## Funciones Auxiliares

In [None]:
# Least-squares slope of `y` regressed on its positions 1..length(y).
#
# Missing entries are dropped while the remaining points keep their original
# positions. Returns NA_real_ when fewer than two non-missing values are
# available or when the denominator of the slope formula is zero.
calc_slope_fast <- function(y) {
  pos <- which(!is.na(y))
  k <- length(pos)
  if (k < 2) {
    return(NA_real_)
  }

  vals <- y[pos]
  s_x <- sum(pos)
  s_y <- sum(vals)
  s_xy <- sum(pos * vals)
  s_xx <- sum(pos^2)

  bottom <- k * s_xx - s_x^2
  if (bottom == 0) {
    return(NA_real_)
  }

  (k * s_xy - s_x * s_y) / bottom
}

In [None]:
# Adds random-forest leaf-membership features to `dataset`.
#
# A small LightGBM model in "rf" boosting mode (10 trees, 8 leaves) is trained
# on months 202101-202103 against a binary target that is 1 for "BAJA+1" and
# "BAJA+2". Every row of every month is then routed through each tree, and one
# 0/1 indicator column named rf_<tree>_<leaf> is created for each (tree, leaf)
# pair that is reached.
#
# Args:
#   dataset:       data.table holding foto_mes, clase_ternaria and the columns
#                  listed in campos_buenos. It is modified by reference.
#   campos_buenos: character vector with the predictor column names.
#   semilla:       seed handed to LightGBM.
#
# Returns:
#   `dataset`, with the indicator columns appended.
AgregaVarRandomForest <- function(dataset, campos_buenos, semilla) {

  cat("  Iniciando FE Random Forest...\n")
  gc(verbose = FALSE)

  # Training rows and labels are computed as plain vectors, so no temporary
  # helper column is ever written into the caller's table.
  meses_train_rf <- c(202101, 202102, 202103)
  filas_train <- which(dataset$foto_mes %in% meses_train_rf)
  etiqueta_train <- as.integer(
    dataset$clase_ternaria[filas_train] %in% c("BAJA+2", "BAJA+1")
  )

  rf_param <- list(
    num_iterations = 10,
    num_leaves = 8,
    min_data_in_leaf = 500,
    feature_fraction_bynode = 0.3,
    boosting = "rf",
    bagging_fraction = (1.0 - 1.0/exp(1.0)),
    bagging_freq = 1,
    feature_fraction = 1.0,
    max_bin = 31L,
    objective = "binary",
    first_metric_only = TRUE,
    boost_from_average = TRUE,
    feature_pre_filter = FALSE,
    force_row_wise = TRUE,
    verbosity = -100,
    seed = semilla
  )

  dtrain_rf <- lgb.Dataset(
    data = data.matrix(dataset[filas_train, campos_buenos, with = FALSE]),
    label = etiqueta_train,
    free_raw_data = FALSE
  )

  # `params` is spelled out instead of relying on partial argument matching.
  modelo_rf <- lgb.train(data = dtrain_rf, params = rf_param, verbose = -100)

  cat("  RF entrenado, generando features...\n")

  qarbolitos <- rf_param$num_iterations
  periodos <- dataset[, unique(foto_mes)]

  # names() of a data.table can change when columns are added by reference,
  # so take a real copy to diff against afterwards.
  columnas_previas <- copy(names(dataset))

  for (periodo in periodos) {
    datamatrix <- data.matrix(dataset[foto_mes == periodo, campos_buenos, with = FALSE])
    prediccion <- predict(modelo_rf, datamatrix, type = "leaf")

    for (arbolito in seq_len(qarbolitos)) {
      hojas_fila <- prediccion[, arbolito]
      for (nodo_id in unique(hojas_fila)) {
        nombre_col <- paste0("rf_", sprintf("%02d", arbolito), "_", sprintf("%02d", nodo_id))
        dataset[foto_mes == periodo, (nombre_col) := as.integer(nodo_id == hojas_fila)]
      }
    }
    rm(prediccion, datamatrix)
  }

  # A (tree, leaf) pair that is never reached within a given month leaves that
  # month's rows as NA, because the column was only assigned for other months.
  # Fill those gaps with 0 so every indicator is a consistent 0/1 flag.
  columnas_rf <- setdiff(names(dataset), columnas_previas)
  if (length(columnas_rf) > 0) {
    setnafill(dataset, type = "const", fill = 0L, cols = columnas_rf)
  }

  rm(modelo_rf, dtrain_rf)
  gc(verbose = FALSE)

  cat("  FE Random Forest completado\n")
  dataset
}

## Loop Principal - 5 Semillas con Submit a Kaggle

In [None]:
# Main loop: one complete run per seed.
#
# Each iteration loads the dataset, applies the feature engineering steps,
# trains the final LightGBM model, scores month 202109 and submits one file
# per cutoff in PARAM_GLOBAL$kaggle_cortes to Kaggle.
#
# NOTE(review): loading the dataset and every FE step before the random
# forest do not use the seed, yet they are recomputed on every iteration.
# Caching that result would save time at the cost of holding an extra copy
# of the dataset in memory.
for (seed_idx in seq_along(PARAM_GLOBAL$semillas)) {

  cat("\n\n========================================\n")
  cat("SEMILLA ", seed_idx, " de ", length(PARAM_GLOBAL$semillas), "\n")
  cat("Semilla: ", PARAM_GLOBAL$semillas[seed_idx], "\n")
  cat("========================================\n\n")

  inicio_seed <- Sys.time()

  PARAM <- list()
  PARAM$semilla_primigenia <- PARAM_GLOBAL$semillas[seed_idx]
  PARAM$experimento <- PARAM_GLOBAL$experimento_base + seed_idx - 1
  PARAM$dataset <- PARAM_GLOBAL$dataset

  # ==========================================================================
  # Experiment folder
  # ==========================================================================

  # Every output of this seed is written relative to its own folder.
  setwd("/content/buckets/b1/exp")
  experimento_folder <- paste0("WF", PARAM$experimento, "_seed", seed_idx, "_FINAL_kaggle")
  dir.create(experimento_folder, showWarnings = FALSE)
  setwd(paste0("/content/buckets/b1/exp/", experimento_folder))
  dir.create("kaggle", showWarnings = FALSE)

  cat("Carpeta: ", experimento_folder, "\n\n")

  # ==========================================================================
  # Dataset load
  # ==========================================================================

  cat("Cargando dataset...\n")
  dataset <- fread(paste0("/content/datasets/", PARAM$dataset))
  cat("Dataset:", nrow(dataset), "x", ncol(dataset), "\n\n")

  # ==========================================================================
  # Catastrophe Analysis (13 variables)
  # ==========================================================================

  # These 13 variables are blanked out for month 202006 only.
  cat("Catastrophe Analysis...\n")
  vars_catastrofe <- c(
    "internet", "mrentabilidad", "mrentabilidad_annual", "mcomisiones",
    "mactivos_margen", "mpasivos_margen", "mcuentas_saldo",
    "ctarjeta_visa_transacciones", "mtarjeta_visa_consumo",
    "mtarjeta_master_consumo", "ccallcenter_transacciones",
    "chomebanking_transacciones", "ctarjeta_master_transacciones"
  )
  dataset[foto_mes == 202006, (vars_catastrofe) := NA]

  # ==========================================================================
  # Data Drifting - IPC
  # ==========================================================================

  # Every column whose name starts with "m" is multiplied by the IPC factor
  # of its month (update join against tb_indices).
  cat("Data Drifting (IPC)...\n")
  campos_monetarios <- colnames(dataset)[colnames(dataset) %like% "^m"]
  dataset[tb_indices, on = "foto_mes", (campos_monetarios) := .SD * i.IPC, .SDcols = campos_monetarios]

  # ==========================================================================
  # Intra-month FE
  # ==========================================================================

  cat("FE Intra-mes...\n")
  dataset[, kmes := foto_mes %% 100]
  # Scalar condition, so use the short-circuit `&&` rather than elementwise `&`.
  if ("mpayroll" %in% colnames(dataset) && "cliente_edad" %in% colnames(dataset)) {
    dataset[, mpayroll_sobre_edad := mpayroll / cliente_edad]
  }

  # ==========================================================================
  # Historical FE
  # ==========================================================================

  cat("FE Historico...\n")
  inicio_fe <- Sys.time()
  # Lags and rolling windows below work on row order within each client.
  setorder(dataset, numero_de_cliente, foto_mes)

  cols_lagueables <- setdiff(colnames(dataset), c("numero_de_cliente", "foto_mes", "clase_ternaria"))

  # Lags of order 1 and 2
  dataset[, paste0(cols_lagueables, "_lag1") := shift(.SD, 1, NA, "lag"), by = numero_de_cliente, .SDcols = cols_lagueables]
  dataset[, paste0(cols_lagueables, "_lag2") := shift(.SD, 2, NA, "lag"), by = numero_de_cliente, .SDcols = cols_lagueables]

  # The three loops below are intentionally kept separate: merging them would
  # change the order in which columns are created, and column order can
  # influence LightGBM's feature sampling.

  # Deltas against each lag
  for (vcol in cols_lagueables) {
    dataset[, paste0(vcol, "_delta1") := get(vcol) - get(paste0(vcol, "_lag1"))]
    dataset[, paste0(vcol, "_delta2") := get(vcol) - get(paste0(vcol, "_lag2"))]
  }

  # Trends: regression slope over the last 3 and 6 rows of each client
  for (col in cols_lagueables) {
    dataset[, paste0(col, "_trend_3") := frollapply(get(col), 3, calc_slope_fast, align = "right"), by = numero_de_cliente]
    dataset[, paste0(col, "_trend_6") := frollapply(get(col), 6, calc_slope_fast, align = "right"), by = numero_de_cliente]
  }

  # Rolling mean and standard deviation over the last 3 rows of each client
  for (col in cols_lagueables) {
    dataset[, paste0(col, "_rmean_3") := frollmean(get(col), 3, align = "right", na.rm = TRUE), by = numero_de_cliente]
    dataset[, paste0(col, "_rsd_3") := frollapply(get(col), 3, sd, na.rm = TRUE, align = "right"), by = numero_de_cliente]
  }

  cat("  FE Historico en", round(difftime(Sys.time(), inicio_fe, units="mins"), 1), "min\n")
  cat("  Dataset:", ncol(dataset), "columnas\n")

  # ==========================================================================
  # Random Forest FE
  # ==========================================================================

  cat("FE Random Forest...\n")
  campos_para_rf <- setdiff(colnames(dataset), c("clase_ternaria", "numero_de_cliente", "foto_mes"))
  dataset <- AgregaVarRandomForest(dataset, campos_para_rf, PARAM$semilla_primigenia)
  cat("  Dataset final:", ncol(dataset), "columnas\n\n")

  # ==========================================================================
  # Training strategy for Kaggle (202109)
  # ==========================================================================

  cat("Training Strategy...\n")

  # Train on every month from 202005 through 202107 and predict 202109.
  PARAM$trainingstrategy$final_train <- c(
    202107, 202106, 202105, 202104, 202103, 202102, 202101,
    202012, 202011, 202010, 202009, 202008, 202007, 202006, 202005
  )
  PARAM$trainingstrategy$future <- c(202109)

  # Binary target: both "BAJA+1" and "BAJA+2" count as positives.
  dataset[, clase01 := ifelse(clase_ternaria %in% c("BAJA+1", "BAJA+2"), 1, 0)]
  # The feature list is fixed before fold_final_train is added, so that
  # helper column never becomes a predictor.
  campos_buenos <- setdiff(colnames(dataset), c("clase_ternaria", "clase01", "azar"))

  dataset[, fold_final_train := foto_mes %in% PARAM$trainingstrategy$final_train]

  cat("  Features:", length(campos_buenos), "\n\n")

  # ==========================================================================
  # Final training
  # ==========================================================================

  cat("Entrenando modelo final...\n")

  dfinal_train <- lgb.Dataset(
    data = data.matrix(dataset[fold_final_train == TRUE, campos_buenos, with = FALSE]),
    label = dataset[fold_final_train == TRUE, clase01],
    free_raw_data = TRUE
  )

  param_final <- list(
    objective = "binary",
    metric = "auc",
    first_metric_only = TRUE,
    boost_from_average = TRUE,
    feature_pre_filter = FALSE,
    verbosity = -100,
    force_row_wise = TRUE,
    seed = PARAM$semilla_primigenia,
    max_bin = 31,
    num_leaves = 25,
    min_data_in_leaf = 2764,
    num_iterations = 2009,
    feature_fraction = 0.5,
    learning_rate = 0.03
  )

  inicio_train <- Sys.time()
  # `params` is spelled out instead of relying on partial argument matching.
  final_model <- lgb.train(data = dfinal_train, params = param_final, verbose = -100)
  cat("  Modelo entrenado en", round(difftime(Sys.time(), inicio_train, units="mins"), 1), "min\n\n")

  lgb.save(final_model, "modelo.txt")
  fwrite(as.data.table(lgb.importance(final_model)), file = "impo.txt", sep = "\t")

  # ==========================================================================
  # Scoring (202109)
  # ==========================================================================

  cat("Scoring 202109...\n")
  dfuture <- dataset[foto_mes %in% PARAM$trainingstrategy$future]
  prediccion <- predict(final_model, data.matrix(dfuture[, campos_buenos, with = FALSE]))

  tb_prediccion <- dfuture[, list(numero_de_cliente)]
  tb_prediccion[, prob := prediccion]
  fwrite(tb_prediccion, file = "prediccion.txt", sep = "\t")

  # ==========================================================================
  # Kaggle submission
  # ==========================================================================

  cat("\nSubmit a Kaggle...\n")
  # Highest probability first, so the first `envios` rows are the positives.
  setorder(tb_prediccion, -prob)
  filas_prediccion <- nrow(tb_prediccion)

  for (envios in PARAM_GLOBAL$kaggle_cortes) {
    # Cap the cutoff at the number of scored rows; indexing past the end of
    # the table would raise an error.
    n_positivos <- min(envios, filas_prediccion)
    tb_prediccion[, Predicted := 0L]
    tb_prediccion[seq_len(n_positivos), Predicted := 1L]

    archivo_kaggle <- paste0("./kaggle/KA", PARAM$experimento, "_", envios, ".csv")
    fwrite(tb_prediccion[, list(numero_de_cliente, Predicted)], file = archivo_kaggle, sep = ",")

    # shQuote() keeps the file name and the message as single shell arguments.
    mensaje <- paste0("seed", seed_idx, "_envios=", envios, "_semilla=", PARAM$semilla_primigenia)
    comando <- paste(
      "kaggle competitions submit",
      "-c", PARAM_GLOBAL$kaggle_competencia,
      "-f", shQuote(archivo_kaggle),
      "-m", shQuote(mensaje)
    )

    salida <- system(comando, intern = TRUE)
    cat("  ", envios, ": ", salida, "\n")

    # With intern = TRUE a non-zero exit code is reported through the
    # "status" attribute; surface it instead of ignoring it.
    estado <- attr(salida, "status")
    if (!is.null(estado) && estado != 0) {
      cat("  ADVERTENCIA: kaggle devolvio estado", estado, "para", archivo_kaggle, "\n")
    }
  }

  # ==========================================================================
  # Save and clean up
  # ==========================================================================

  resultado <- list(
    seed_idx = seed_idx,
    semilla = PARAM$semilla_primigenia,
    experimento = PARAM$experimento
  )
  resultados_totales[[seed_idx]] <- resultado

  write_yaml(PARAM, file = "PARAM.yml")

  fin_seed <- Sys.time()
  duracion <- as.numeric(difftime(fin_seed, inicio_seed, units = "mins"))

  # Release the large objects (score vector included) before the next seed.
  rm(dataset, dfinal_train, final_model, tb_prediccion, dfuture, prediccion)
  gc(full = TRUE, verbose = FALSE)

  cat("\n========================================\n")
  cat("Semilla", seed_idx, "completada en", round(duracion, 1), "min\n")
  cat("========================================\n\n")

}

cat("\n***************************************\n")
cat("TODOS LOS SUBMITS COMPLETADOS\n")
cat("***************************************\n")

## Resumen Final

In [None]:
# Final summary: one row per seed run, printed and saved in the experiments
# folder together with the raw results list.
setwd("/content/buckets/b1/exp")

# vapply() fixes the type of every column, so an empty results list yields
# empty typed columns instead of the list() that sapply() would return.
tb_resumen <- data.table(
  seed_idx = vapply(resultados_totales, function(x) as.integer(x$seed_idx), integer(1)),
  semilla = vapply(resultados_totales, function(x) as.numeric(x$semilla), numeric(1)),
  experimento = vapply(resultados_totales, function(x) as.numeric(x$experimento), numeric(1))
)

cat("\n========================================\n")
cat("RESUMEN - SUBMITS A KAGGLE\n")
cat("Competencia:", PARAM_GLOBAL$kaggle_competencia, "\n")
cat("========================================\n\n")
print(tb_resumen)

cat("\nCortes enviados:", paste(PARAM_GLOBAL$kaggle_cortes, collapse=", "), "\n")
cat("Total submits:", length(PARAM_GLOBAL$semillas) * length(PARAM_GLOBAL$kaggle_cortes), "\n")

# Tab-separated summary plus an RDS copy of the raw results list.
fwrite(tb_resumen, file = paste0("resumen_kaggle_exp", PARAM_GLOBAL$experimento_base, ".txt"), sep = "\t")
saveRDS(resultados_totales, file = paste0("resultados_kaggle_exp", PARAM_GLOBAL$experimento_base, ".rds"))

cat("\nFINALIZADO\n")

In [None]:
# Show the current date and time.
format(Sys.time(), "%a %b %d %X %Y")