# Workflow 618 - ONLY TREND Features

Feature Engineering con SOLO features de tendencia (trend_3, trend_6).

Lógica idéntica a z610 baseline.

In [None]:
# CELL 1: Mount Google Drive
from google.colab import drive

# Mount point under which the Drive contents become visible to this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# CELL 2: Download the dataset into Google Drive (skipped when it is already there)
import os
import urllib.request

# Create the datasets directory
os.makedirs('/content/drive/MyDrive/labo2025v/datasets', exist_ok=True)

# Dataset URL and local destination
dataset_url = 'https://storage.googleapis.com/open-courses/austral2025-af91/gerencial_competencia_2025.csv.gz'
dataset_path = '/content/drive/MyDrive/labo2025v/datasets/gerencial_competencia_2025.csv.gz'

# Download only if the file does not exist.
# The download goes to a temporary ".part" file that is renamed onto the final
# path only after it completes: an interrupted transfer must never leave a
# truncated file at dataset_path, because the existence check above would
# accept it as a complete dataset on the next run.
if not os.path.exists(dataset_path):
    print(f'Descargando dataset desde {dataset_url}...')
    partial_path = dataset_path + '.part'
    try:
        urllib.request.urlretrieve(dataset_url, partial_path)
        os.replace(partial_path, dataset_path)
    finally:
        # After a successful rename the partial file no longer exists; this only
        # cleans up the leftovers of a failed or interrupted download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f'‚úì Dataset descargado: {dataset_path}')
else:
    print(f'‚úì Dataset ya existe: {dataset_path}')

# Report the file size in MB
size_mb = os.path.getsize(dataset_path) / (1024**2)
print(f'  Tama√±o: {size_mb:.2f} MB')

In [None]:
%%writefile /content/workflow_618.R

# ============================================================================
# WORKFLOW 618 ONLY TREND - Implementaci√≥n completa en Google Colab
# ============================================================================

# Timestamp of the script start.
format(Sys.time(), "%a %b %d %X %Y")

# ============================================================================
# INITIAL CONFIGURATION
# ============================================================================

# Banner
cat(
  "\n",
  "========================================\n",
  " WORKFLOW 618 ONLY TREND\n",
  "========================================\n\n",
  sep = ""
)

# Directories (all located under the mounted Google Drive folder)
BASE_DIR <- "/content/drive/MyDrive/labo2025v"
DATASETS_DIR <- file.path(BASE_DIR, "datasets")
EXP_DIR <- file.path(BASE_DIR, "exp", "exp_only_trend_colab")

dir.create(EXP_DIR, showWarnings = FALSE, recursive = TRUE)

# Seeds: the whole workflow is repeated once per seed
SEMILLAS <- c(153929, 838969, 922081, 795581, 194609)

# Echo the configuration
cat("Configuraci√≥n:\n")
cat("  Base dir:", BASE_DIR, "\n")
cat("  Datasets:", DATASETS_DIR, "\n")
cat("  Experimentos:", EXP_DIR, "\n")
cat("  Semillas:", length(SEMILLAS), "\n\n")

# ============================================================================
# CARGAR PAQUETES
# ============================================================================

cat("Cargando paquetes...\n")

# Attach the required packages, in this order.
paquetes <- c("data.table", "lightgbm", "DiceKriging", "mlr", "mlrMBO", "ParamHelpers")
for (paquete in paquetes) {
  library(paquete, character.only = TRUE)
}

# Restrict data.table to a single thread (Colab has limited CPUs).
setDTthreads(1)

cat("‚úì Paquetes cargados\n\n")

# ============================================================================
# VERIFICAR DATASET
# ============================================================================

# Path of the raw dataset inside the datasets directory.
dataset_file <- file.path(DATASETS_DIR, "gerencial_competencia_2025.csv.gz")

cat("Verificando dataset...\n")

# Abort early when the dataset has not been downloaded.
dataset_presente <- file.exists(dataset_file)
if (!dataset_presente) stop("ERROR: Dataset no encontrado en ", dataset_file)

# Report the size on disk in MB.
file_size <- file.size(dataset_file) / (1024^2)
cat(sprintf("‚úì Dataset encontrado: %s MB\n\n", round(file_size, 2)))

# ============================================================================
# LOOP DE SEMILLAS
# ============================================================================

resultados_finales <- list()

for (seed_idx in 1:length(SEMILLAS)) {
  
  # Seed value for this iteration.
  semilla <- SEMILLAS[seed_idx]

  # Header for this seed.
  linea_separadora <- strrep("=", 80)
  cat("\n")
  cat(linea_separadora, "\n")
  cat(sprintf("SEMILLA %s / %s - Valor: %s \n", seed_idx, length(SEMILLAS), semilla))
  cat(linea_separadora, "\n\n")

  # Start of the per-seed timer.
  inicio_seed <- Sys.time()

  # Output folder for this seed (created if missing).
  exp_folder <- sprintf("WF618%s_seed%s_ONLY_TREND", seed_idx - 1, seed_idx)
  seed_dir <- file.path(EXP_DIR, exp_folder)
  dir.create(seed_dir, showWarnings = FALSE, recursive = TRUE)

  cat("Directorio:", seed_dir, "\n\n")
  
  # ==========================================================================
  # CARGA DE DATASET O CACHE
  # ==========================================================================
  
  # Feature engineering does not depend on the seed, so a single cache shared by
  # every seed is kept in EXP_DIR. The file name is deliberately distinct from
  # the legacy per-seed "dataset_con_FE_ONLY_TREND.rds": those files hold trends
  # computed from each client's LAST rows (future information) and must never
  # be reused.
  dataset_cache_file <- file.path(EXP_DIR, "dataset_con_FE_ONLY_TREND_rolling.rds")

  if (file.exists(dataset_cache_file)) {
    cat("üì¶ Cargando dataset desde cache...\n")
    inicio_cache <- Sys.time()
    dataset <- readRDS(dataset_cache_file)
    # Re-establish data.table's internal state after loading from disk, as
    # data.table recommends, before columns are added by reference.
    setDT(dataset)
    fin_cache <- Sys.time()
    tiempo_cache <- as.numeric(difftime(fin_cache, inicio_cache, units = "secs"))

    cat(paste("‚úì Cache cargado en", round(tiempo_cache, 1), "seg\n"))
    cat(paste("  Dimensiones:", nrow(dataset), "filas x", ncol(dataset), "cols\n\n"))

  } else {

    # ========================================================================
    # LOAD ORIGINAL DATASET
    # ========================================================================

    cat("üìÇ Cargando dataset original...\n")
    inicio_carga <- Sys.time()
    dataset <- fread(dataset_file)
    fin_carga <- Sys.time()
    tiempo_carga <- as.numeric(difftime(fin_carga, inicio_carga, units = "secs"))

    cat(paste("‚úì Dataset cargado en", round(tiempo_carga, 1), "seg\n"))
    cat(paste("  Dimensiones:", nrow(dataset), "filas x", ncol(dataset), "cols\n\n"))

    # ========================================================================
    # CATASTROPHE ANALYSIS
    # ========================================================================

    cat("üî• Aplicando Catastrophe Analysis...\n")

    # The 13 variables that are set to NA for foto_mes 202006.
    vars_catastrofe <- c(
      "internet",
      "mrentabilidad",
      "mrentabilidad_annual",
      "mcomisiones",
      "mactivos_margen",
      "mpasivos_margen",
      "mcuentas_saldo",
      "ctarjeta_visa_transacciones",
      "mtarjeta_visa_consumo",
      "mtarjeta_master_consumo",
      "ccallcenter_transacciones",
      "chomebanking_transacciones",
      "ctarjeta_master_transacciones"
    )
    dataset[foto_mes == 202006, (vars_catastrofe) := NA]

    cat("‚úì 13 variables en 202006 ‚Üí NA\n\n")

    # ========================================================================
    # FEATURE ENGINEERING - TRENDS ONLY
    # ========================================================================

    cat("‚öôÔ∏è  Feature Engineering - SOLO TRENDS...\n")
    inicio_fe <- Sys.time()

    # Base variables (everything except id, period and class)
    cols_lagueables <- setdiff(
      colnames(dataset),
      c("numero_de_cliente", "foto_mes", "clase_ternaria")
    )

    cat(paste("  Variables base:", length(cols_lagueables), "\n"))

    # Sort by client and month: the rolling windows below rely on each client's
    # rows being contiguous and in chronological order.
    setorder(dataset, numero_de_cliente, foto_mes)

    # 1-based position of every row inside its client's history. A window of
    # `ventana` rows ending at a given row lies entirely inside one client
    # exactly when that row's position is >= ventana.
    pos_cliente <- rowid(dataset$numero_de_cliente)

    # Rolling least-squares slope of `y` over the current row and the previous
    # `ventana - 1` rows, regressing on x = 1..ventana (oldest to newest).
    #   y       : numeric vector, sorted by client and month
    #   pos     : position of each row inside its client's history
    #   ventana : window length in rows
    # Returns a numeric vector of length(y). A value is NA when the client has
    # fewer than `ventana` rows up to that point or when fewer than 2 values in
    # the window are non-missing; missing values inside the window are skipped.
    # Only the current and earlier rows of the same client are used, so a
    # feature never contains information from later months.
    calcular_trend_movil <- function(y, pos, ventana) {
      y <- as.numeric(y)
      n_filas <- length(y)

      # Running sums over the window, counting valid (non-NA) points only.
      n_validos <- numeric(n_filas)
      suma_x <- numeric(n_filas)
      suma_xx <- numeric(n_filas)
      suma_y <- numeric(n_filas)
      suma_xy <- numeric(n_filas)

      for (desfase in seq_len(ventana) - 1L) {
        # The value `desfase` rows back sits at abscissa ventana - desfase.
        x <- ventana - desfase
        y_desfasada <- if (desfase == 0L) y else shift(y, n = desfase)
        es_valido <- !is.na(y_desfasada)
        y_desfasada[!es_valido] <- 0

        n_validos <- n_validos + es_valido
        suma_x <- suma_x + x * es_valido
        suma_xx <- suma_xx + x^2 * es_valido
        suma_y <- suma_y + y_desfasada
        suma_xy <- suma_xy + x * y_desfasada
      }

      # Closed-form simple-regression slope.
      pendiente <- (n_validos * suma_xy - suma_x * suma_y) /
        (n_validos * suma_xx - suma_x^2)

      # Mask windows that cross a client boundary (not enough history) and
      # windows with fewer than two valid points (slope undefined).
      pendiente[pos < ventana | n_validos < 2] <- NA_real_
      pendiente
    }

    # GENERATE TRENDS (windows 3 and 6)
    cat("  Generando trends...\n")
    inicio_trends <- Sys.time()
    cols_antes_trends <- ncol(dataset)

    for (ventana in c(3, 6)) {
      cat(paste("    Ventana", ventana, "..."))

      for (col in cols_lagueables) {
        trend_col <- paste0(col, "_trend_", ventana)

        dataset[
          ,
          (trend_col) := calcular_trend_movil(.SD[[1L]], pos_cliente, ventana),
          .SDcols = col
        ]
      }

      cat(" OK\n")
    }

    fin_trends <- Sys.time()
    cols_trends <- ncol(dataset) - cols_antes_trends
    tiempo_trends <- as.numeric(difftime(fin_trends, inicio_trends, units = "mins"))

    cat(paste("  ‚úì Trends generadas:", cols_trends, "variables en",
              round(tiempo_trends, 1), "min\n"))

    fin_fe <- Sys.time()
    tiempo_fe <- as.numeric(difftime(fin_fe, inicio_fe, units = "mins"))

    cat(paste("‚úì Feature Engineering completado en", round(tiempo_fe, 1), "min\n"))
    cat(paste("  Dataset final:", nrow(dataset), "filas x", ncol(dataset), "cols\n\n"))

    # Save to cache
    cat("üíæ Guardando cache...\n")
    inicio_save <- Sys.time()
    saveRDS(dataset, dataset_cache_file, compress = "xz")
    fin_save <- Sys.time()
    tiempo_save <- as.numeric(difftime(fin_save, inicio_save, units = "secs"))

    file_size_mb <- file.info(dataset_cache_file)$size / (1024^2)
    cat(paste("‚úì Cache guardado:", round(file_size_mb, 1), "MB en",
              round(tiempo_save, 1), "seg\n\n"))
  }
  
  # ==========================================================================
  # TRAINING STRATEGY
  # ==========================================================================
  
  cat("üéØ Configurando Training Strategy...\n")

  # Binary target: 1 for BAJA+1 / BAJA+2, 0 for everything else.
  dataset[, clase01 := as.numeric(clase_ternaria %in% c("BAJA+1", "BAJA+2"))]

  # Periods
  validate_month <- 202105
  future_month <- 202107

  training_months <- c(
    202104, 202103, 202102, 202101,
    202012, 202011, 202010, 202009, 202008, 202007,
    202006, 202005
  )

  # The final model is trained on the validation month plus the training months.
  final_train_months <- c(validate_month, training_months)

  cat(sprintf("  Training: %s \n", paste(range(training_months), collapse = " a ")))
  cat(sprintf("  Validation: %s \n", validate_month))
  cat(sprintf("  Final train: %s \n", paste(range(final_train_months), collapse = " a ")))
  cat(sprintf("  Future: %s \n\n", future_month))

  # Model features: every column except ids and targets.
  columnas_excluidas <- c("numero_de_cliente", "foto_mes", "clase_ternaria", "clase01")
  campos_buenos <- setdiff(colnames(dataset), columnas_excluidas)

  cat("  Features para modelo:", length(campos_buenos), "\n\n")

  # Row indices of each partition, computed once and reused below.
  filas_train <- which(dataset$foto_mes %in% training_months)
  filas_validate <- which(dataset$foto_mes == validate_month)

  # LightGBM datasets
  dtrain <- lgb.Dataset(
    data = data.matrix(dataset[filas_train, campos_buenos, with = FALSE]),
    label = dataset$clase01[filas_train],
    free_raw_data = FALSE
  )

  dvalidate <- lgb.Dataset(
    data = data.matrix(dataset[filas_validate, campos_buenos, with = FALSE]),
    label = dataset$clase01[filas_validate],
    free_raw_data = FALSE
  )

  cat("  Train set:", length(filas_train), "filas\n")
  cat("  Validation set:", length(filas_validate), "filas\n\n")
  
  # ==========================================================================
  # BAYESIAN OPTIMIZATION
  # ==========================================================================
  
  cat("üîç Bayesian Optimization (10 iteraciones)...\n")
  inicio_bo <- Sys.time()

  set.seed(semilla)

  # Fixed LightGBM parameters shared by every optimisation trial.
  param_fijos <- list(
    # objective and evaluation
    objective = "binary",
    metric = "auc",
    first_metric_only = TRUE,
    boost_from_average = TRUE,
    feature_pre_filter = FALSE,
    # runtime behaviour
    verbosity = -100,
    force_row_wise = TRUE,
    seed = semilla,
    # model capacity and training schedule
    max_bin = 31,
    learning_rate = 0.03,
    feature_fraction = 0.5,
    num_iterations = 2048,
    early_stopping_rounds = 200
  )

  # Search space: the two hyperparameters that are optimised.
  hiperparametros_a_optimizar <- list(
    makeIntegerParam("num_leaves", lower = 2L, upper = 256L),
    makeIntegerParam("min_data_in_leaf", lower = 2L, upper = 8192L)
  )
  configuracion_bo <- makeParamSet(params = hiperparametros_a_optimizar)
  
  # Funci√≥n objetivo
  # Objective function for the Bayesian optimisation.
  #   x : named list with the candidate hyperparameters; its entries are merged
  #       over param_fijos (candidate values win on name clashes).
  # Trains on dtrain, evaluating on dvalidate, and returns the validation AUC
  # recorded at the model's best iteration. The best iteration is attached as
  # the "extras" attribute under the name num_iterations.
  EstimarGanancia_AUC_lightgbm <- function(x) {
    parametros_trial <- modifyList(param_fijos, x)

    modelo_trial <- lgb.train(
      data = dtrain,
      valids = list(valid = dvalidate),
      eval = "auc",
      params = parametros_trial,
      verbose = -100
    )

    mejor_iteracion <- modelo_trial$best_iter
    auc_validacion <- modelo_trial$record_evals$valid$auc$eval[[mejor_iteracion]]
    resultado <- structure(
      auc_validacion,
      extras = list("num_iterations" = mejor_iteracion)
    )

    # Drop the model and collect garbage before the next trial.
    rm(modelo_trial)
    gc(full = TRUE, verbose = FALSE)

    resultado
  }
  
  # Configure mlr: hide learner output.
  configureMlr(show.learner.output = FALSE)

  # Wrap the objective function; the AUC is maximised.
  funcion_objetivo <- makeSingleObjectiveFunction(
    fn = EstimarGanancia_AUC_lightgbm,
    minimize = FALSE,
    noisy = FALSE,
    par.set = configuracion_bo,
    has.simple.signature = FALSE
  )

  # Optimisation control: 10 iterations, expected-improvement infill criterion.
  control_mbo <- makeMBOControl()
  control_mbo <- setMBOControlTermination(control_mbo, iters = 10L)
  control_mbo <- setMBOControlInfill(control_mbo, crit = makeMBOInfillCritEI())

  # Kriging surrogate model.
  surrogate_km <- makeLearner(
    "regr.km",
    predict.type = "se",
    covtype = "matern3_2",
    control = list(trace = FALSE)
  )

  # Run the optimisation.
  bayesiana_salida <- mbo(funcion_objetivo, learner = surrogate_km, control = control_mbo)

  fin_bo <- Sys.time()
  tiempo_bo <- as.numeric(difftime(fin_bo, inicio_bo, units = "mins"))

  # Optimisation log, best trial first (highest y, ties broken by more iterations).
  tb_bayesiana <- as.data.table(bayesiana_salida$opt.path)
  setorder(tb_bayesiana, -y, -num_iterations)

  fwrite(tb_bayesiana, file.path(seed_dir, "BO_log.txt"), sep = "\t")

  # Bookkeeping columns of the log that are not hyperparameters.
  columnas_no_hiper <- c(
    "y", "dob", "eol", "error.message", "exec.time", "ei", "error.model",
    "train.time", "prop.type", "propose.time", "se", "mean", "iter"
  )
  columnas_hiper <- setdiff(colnames(tb_bayesiana), columnas_no_hiper)

  # Best trial: its hyperparameters and its AUC.
  mejores_hiperparametros <- tb_bayesiana[1, columnas_hiper, with = FALSE]
  mejor_auc <- tb_bayesiana[1, y]

  cat(sprintf("‚úì BO completado en %s min\n", round(tiempo_bo, 1)))
  cat(sprintf("  Mejor AUC: %s \n", round(mejor_auc, 6)))
  cat(paste("  num_leaves:", mejores_hiperparametros$num_leaves, "\n"))
  cat(paste("  min_data_in_leaf:", mejores_hiperparametros$min_data_in_leaf, "\n"))
  cat(paste("  num_iterations:", mejores_hiperparametros$num_iterations, "\n\n"))
  
  # ==========================================================================
  # ENTRENAMIENTO MODELO FINAL
  # ==========================================================================
  
  cat("üöÄ Entrenando modelo final...\n")
  inicio_train_final <- Sys.time()

  # Rows used for the final fit, computed once and reused below.
  filas_final <- which(dataset$foto_mes %in% final_train_months)

  # Final training dataset
  dfinal_train <- lgb.Dataset(
    data = data.matrix(dataset[filas_final, campos_buenos, with = FALSE]),
    label = dataset$clase01[filas_final],
    free_raw_data = FALSE
  )

  cat("  Final train set:", length(filas_final), "filas\n")

  # Final parameters: the fixed ones without num_iterations and
  # early_stopping_rounds, followed by the best hyperparameters found.
  nombres_a_quitar <- c("num_iterations", "early_stopping_rounds")
  param_fijos_final <- param_fijos[setdiff(names(param_fijos), nombres_a_quitar)]
  param_final <- c(param_fijos_final, mejores_hiperparametros)

  set.seed(semilla)

  final_model <- lgb.train(
    data = dfinal_train,
    params = param_final,
    verbose = -100
  )

  fin_train_final <- Sys.time()
  tiempo_train_final <- as.numeric(
    difftime(fin_train_final, inicio_train_final, units = "mins")
  )

  cat(sprintf("‚úì Modelo final entrenado en %s min\n\n", round(tiempo_train_final, 1)))

  # Persist the model
  lgb.save(final_model, file.path(seed_dir, "modelo.txt"))

  # Persist the variable importance table
  tb_importancia <- as.data.table(lgb.importance(final_model))
  fwrite(tb_importancia, file.path(seed_dir, "impo.txt"), sep = "\t")
  
  # ==========================================================================
  # SCORING
  # ==========================================================================
  
  cat("üìä Generando predicciones...\n")

  # Rows of the month being scored.
  dfuture <- dataset[foto_mes == future_month]

  cat("  Future set:", nrow(dfuture), "filas\n")

  # Predictions of the final model on the future month.
  matriz_future <- data.matrix(dfuture[, campos_buenos, with = FALSE])
  prediccion <- predict(final_model, matriz_future)

  # Prediction table: client id plus predicted probability.
  tb_prediccion <- dfuture[, list(numero_de_cliente)]
  tb_prediccion[, prob := prediccion]

  fwrite(tb_prediccion, file.path(seed_dir, "prediccion.txt"), sep = "\t")

  # Gain per row (z610: 117000 for BAJA+2, -3000 for the rest).
  tb_prediccion[, clase_ternaria := dfuture$clase_ternaria]
  tb_prediccion[, ganancia := fifelse(clase_ternaria %in% "BAJA+2", 117000.0, -3000.0)]

  # Sort by descending probability and accumulate the gain.
  setorder(tb_prediccion, -prob)
  tb_prediccion[, gan_acum := cumsum(ganancia)]

  # Centered moving average of width 400 over the accumulated gain.
  ancho_suavizado <- 400
  tb_prediccion[, gan_suavizada := frollmean(
    x = gan_acum,
    n = ancho_suavizado,
    align = "center",
    na.rm = TRUE,
    hasNA = TRUE
  )]

  # Maximum smoothed gain and the row position where it is reached.
  curva_suavizada <- tb_prediccion$gan_suavizada
  ganancia_suavizada_max <- max(curva_suavizada, na.rm = TRUE)
  envios_optimos <- which.max(curva_suavizada)

  ganancia_formateada <- formatC(
    ganancia_suavizada_max, format = "f", big.mark = ",", digits = 0
  )
  cat("  Ganancia m√°xima suavizada:", ganancia_formateada, "\n")
  cat(paste("  Env√≠os √≥ptimos:", envios_optimos, "\n"))

  # Persist the gain table
  fwrite(tb_prediccion, file.path(seed_dir, "ganancias.txt"), sep = "\t")

  # Submission: the first envios_optimos clients in descending probability order.
  tb_prediccion[, envios := seq_len(.N)]
  submission <- tb_prediccion[envios <= envios_optimos, .(numero_de_cliente)]
  archivo_submission <- file.path(seed_dir, paste0("submission_", seed_idx, ".csv"))
  fwrite(submission, archivo_submission)

  cat("‚úì Submission generado:", nrow(submission), "env√≠os\n\n")
  
  # ==========================================================================
  # FIN SEMILLA
  # ==========================================================================
  
  # Total wall-clock time of this seed, in minutes.
  fin_seed <- Sys.time()
  duracion_total <- as.numeric(difftime(fin_seed, inicio_seed, units = "mins"))

  # Closing banner for this seed.
  separador_fin <- strrep("=", 80)
  ganancia_texto <- formatC(ganancia_suavizada_max, format = "f", big.mark = ",", digits = 0)

  cat(separador_fin, "\n")
  cat(sprintf("‚úÖ SEMILLA %s COMPLETADA en %s min\n", seed_idx, round(duracion_total, 1)))
  cat("   Ganancia:", ganancia_texto, "\n")
  cat("   Env√≠os:", nrow(submission), "\n")
  cat(separador_fin, "\n\n")

  # Store this seed's result for the final summary.
  resultados_finales[[seed_idx]] <- list(
    seed_idx = seed_idx,
    semilla = semilla,
    ganancia = ganancia_suavizada_max,
    envios = nrow(submission),
    envios_optimos = envios_optimos,
    duracion_min = duracion_total,
    mejor_auc = mejor_auc
  )

  # Free memory before the next seed.
  objetos_a_liberar <- c(
    "dataset", "dtrain", "dvalidate", "dfinal_train",
    "final_model", "dfuture", "tb_prediccion", "submission"
  )
  rm(list = objetos_a_liberar)
  gc(full = TRUE, verbose = FALSE)
}

# ============================================================================
# RESUMEN FINAL
# ============================================================================

# Banner
separador_resumen <- strrep("=", 80)
cat("\n")
cat(separador_resumen, "\n")
cat("RESUMEN FINAL\n")
cat(separador_resumen, "\n\n")

# Turn one seed's result list into a one-row data.frame.
resultado_a_fila <- function(res) {
  data.frame(
    seed_idx = res$seed_idx,
    semilla = res$semilla,
    ganancia = res$ganancia,
    envios = res$envios,
    envios_optimos = res$envios_optimos,
    duracion_min = round(res$duracion_min, 1),
    mejor_auc = round(res$mejor_auc, 6)
  )
}

# Summary table: one row per seed.
filas_resumen <- lapply(resultados_finales, resultado_a_fila)
resumen_df <- do.call(rbind, filas_resumen)

# Sort by descending gain and add the rank.
orden_por_ganancia <- order(-resumen_df$ganancia)
resumen_df <- resumen_df[orden_por_ganancia, ]
resumen_df$rank <- rank(-resumen_df$ganancia)

# Persist the summary table and the raw per-seed results.
fwrite(resumen_df, file.path(EXP_DIR, "resumen_only_trend_colab.txt"), sep = "\t")
saveRDS(resultados_finales, file.path(EXP_DIR, "resultados_only_trend_colab.rds"))

cat("Resultados por semilla:\n\n")
print(resumen_df)

# Best seed: the row with the highest gain.
mejor <- resumen_df[which.max(resumen_df$ganancia), ]
mejor_ganancia_texto <- formatC(mejor$ganancia, format = "f", big.mark = ",", digits = 0)

cat("\nüèÜ MEJOR SEMILLA:\n")
cat(paste("  Semilla:", mejor$semilla, "\n"))
cat("  Ganancia:", mejor_ganancia_texto, "\n")
cat(paste("  Env√≠os:", mejor$envios, "\n"))
cat(paste("  Duraci√≥n:", mejor$duracion_min, "min\n"))

cat("\n‚ú® WORKFLOW COMPLETADO ‚ú®\n\n")

In [None]:
# CELL 3: Run the workflow. Executes with Rscript the file /content/workflow_618.R,
# which the %%writefile cell of this notebook writes to disk.
!Rscript /content/workflow_618.R