In [None]:
# Install CatBoost into the notebook environment (IPython shell escape).
!pip install catboost

import pandas as pd
import numpy as np
import os
import time
from catboost import CatBoostClassifier

# --- Kaggle configuration ---
# Point the Kaggle CLI at the current directory for its configuration
# (presumably kaggle.json has been placed here — confirm).
os.environ['KAGGLE_CONFIG_DIR'] = "."

# --- Download and unzip the data ---
# IPython shell escapes: fetch the competition archive, then extract it,
# overwriting any previously extracted files (-o).
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
!unzip -o udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

# --- Load datasets ---
# Tokens that the CSV reader should treat as missing values in both files.
MISSING_TOKENS = ["", " ", "NO SABE", "SIN INFORMACION", "NS/NR"]
train = pd.read_csv("train.csv", na_values=MISSING_TOKENS)
test = pd.read_csv("test.csv", na_values=MISSING_TOKENS)

# --- Clean and rename columns ---
# Normalise headers identically on both frames: trim surrounding whitespace,
# turn inner spaces into underscores and upper-case the result.
for frame in (train, test):
    frame.columns = [
        header.strip().replace(" ", "_").upper() for header in frame.columns
    ]

# --- Fill missing values ---
# Numeric columns: impute with the median computed on the training set only,
# and reuse that same value for the matching test column when it exists.
numeric_medians = train.select_dtypes(include=[np.number]).median()
for column_name, fill_value in numeric_medians.items():
    for frame in (train, test):
        if column_name in frame.columns:
            frame[column_name] = frame[column_name].fillna(fill_value)

# Text columns: replace missing entries with an explicit placeholder category.
text_columns = train.select_dtypes(include="object").columns
for column_name in text_columns:
    for frame in (train, test):
        if column_name in frame.columns:
            frame[column_name] = frame[column_name].fillna("DESCONOCIDO")

# --- Feature engineering ---
# Each new feature is the academic program joined by "_" with one partner
# column; the mapping below is {new column: partner column}.
PROGRAM_INTERACTIONS = {
    'PROG_VALOR': 'E_VALORMATRICULAUNIVERSIDAD',
    'PROG_DEPT': 'E_PRGM_DEPARTAMENTO',
}
for frame in (train, test):
    program = frame['E_PRGM_ACADEMICO']
    for new_column, partner in PROGRAM_INTERACTIONS.items():
        frame[new_column] = program + "_" + frame[partner]

# --- Target variable ---
# Ordinal encoding of the four performance levels.
target_map = {'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3}

# Use Series.map rather than Series.replace: replace() with a dict emits a
# pandas FutureWarning about implicit downcasting and silently leaves any
# label missing from target_map untouched. map() turns such labels into NaN,
# which is checked explicitly so bad labels fail here instead of during
# training or when decoding predictions.
encoded_target = train['RENDIMIENTO_GLOBAL'].map(target_map)
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    unexpected = sorted(
        train.loc[unmapped_rows, 'RENDIMIENTO_GLOBAL'].astype(str).unique()
    )
    raise ValueError(f"Unexpected RENDIMIENTO_GLOBAL labels: {unexpected}")
train['RENDIMIENTO_GLOBAL'] = encoded_target.astype(int)

# --- Split features / target ---
# y is the encoded target; X is every remaining column except the row ID.
y = train["RENDIMIENTO_GLOBAL"]
X = train.drop(columns=["ID", "RENDIMIENTO_GLOBAL"])

# --- Categorical columns ---
# Names of all object-dtype feature columns, passed to CatBoost as cat_features.
categorical_features = list(X.select_dtypes(include=["object"]).columns)

# --- Build an internal validation set ---
# Hold out 8000 random rows for evaluation and remove them from the training
# data. Fitting on the full X while evaluating on a subset of it leaks the
# validation rows into training, so the reported accuracy, early stopping and
# use_best_model selection would all be based on rows the model has seen.
idx_val = X.sample(8000, random_state=42).index
X_val = X.loc[idx_val]
y_val = y.loc[idx_val]
X_train = X.drop(index=idx_val)
y_train = y.drop(index=idx_val)

# Start of the timed section (training + prediction + submission writing).
start = time.time()

# --- Define the CatBoost model ---
model = CatBoostClassifier(
    iterations=1200,
    learning_rate=0.045,
    depth=7,
    l2_leaf_reg=6,
    loss_function='MultiClass',
    bootstrap_type='Bayesian',
    bagging_temperature=1,
    random_strength=1.0,
    auto_class_weights='Balanced',
    task_type='CPU',
    eval_metric='Accuracy',
    verbose=200
)

# --- Train the model ---
# The eval set is disjoint from the training rows; the best iteration on it
# is kept (use_best_model) and training stops after 300 rounds without
# improvement.
model.fit(
    X_train, y_train,
    cat_features=categorical_features,
    eval_set=(X_val, y_val),
    use_best_model=True,
    early_stopping_rounds=300
)

# --- Predict on the test set ---
# Select test features by the training column list so the model receives
# exactly the columns it was fitted on, in the same order, regardless of how
# test.csv happens to be laid out (a missing column fails loudly here).
test_features = test[X.columns]
pred_test = model.predict(test_features).astype(int).ravel()

# Decode the integer class ids back to the original string labels.
inv_target_map = {v: k for k, v in target_map.items()}
pred_labels = [inv_target_map[p] for p in pred_test]

# --- Write the submission file ---
submission_path = "submission_catboost_bayesian_fast_v2.csv"
submission = pd.DataFrame({
    "ID": test["ID"],
    "RENDIMIENTO_GLOBAL": pred_labels
})

submission.to_csv(submission_path, index=False)

print(f"\nArchivo generado: {submission_path}")
print(f"Tiempo total: {time.time()-start:.2f} segundos")



udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


  train['RENDIMIENTO_GLOBAL'] = train['RENDIMIENTO_GLOBAL'].replace(target_map)


0:	learn: 0.4081003	test: 0.4140590	best: 0.4140590 (0)	total: 14.1s	remaining: 4h 42m 12s
200:	learn: 0.4433608	test: 0.4544146	best: 0.4544146 (200)	total: 54m 11s	remaining: 4h 29m 21s
400:	learn: 0.4481324	test: 0.4575419	best: 0.4580457 (393)	total: 1h 50m 24s	remaining: 3h 40m
600:	learn: 0.4508203	test: 0.4579184	best: 0.4599156 (492)	total: 2h 46m 36s	remaining: 2h 46m 3s
800:	learn: 0.4531933	test: 0.4604391	best: 0.4615648 (772)	total: 3h 42m 59s	remaining: 1h 51m 4s
1000:	learn: 0.4552361	test: 0.4609439	best: 0.4615648 (772)	total: 4h 39m 12s	remaining: 55m 30s
1199:	learn: 0.4570113	test: 0.4624481	best: 0.4627017 (1189)	total: 5h 36m 35s	remaining: 0us

bestTest = 0.462701664
bestIteration = 1189

Shrink model to first 1190 iterations.

Archivo generado: submission_catboost_bayesian_fast_v2.csv
Tiempo total: 20231.49 segundos
