In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

# --- Configuración Kaggle ---
os.environ['KAGGLE_CONFIG_DIR'] = "."

# --- Descargar y descomprimir datos ---
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
!unzip -o udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

# --- Cargar datasets ---
train = pd.read_csv("train.csv", na_values=["", " ", "NO SABE", "SIN INFORMACION", "NS/NR"])
test = pd.read_csv("test.csv", na_values=["", " ", "NO SABE", "SIN INFORMACION", "NS/NR"])

# --- Limpieza y renombrado de columnas ---
train.columns = train.columns.str.strip().str.replace(" ", "_").str.upper()
test.columns = test.columns.str.strip().str.replace(" ", "_").str.upper()

# --- Rellenar valores faltantes ---
for col in train.select_dtypes(include=[np.number]).columns:
    med = train[col].median()
    train[col] = train[col].fillna(med)
    if col in test.columns:
        test[col] = test[col].fillna(med)

for col in train.select_dtypes(include="object").columns:
    train[col] = train[col].fillna("DESCONOCIDO")
    if col in test.columns:
        test[col] = test[col].fillna("DESCONOCIDO")

# --- Feature engineering ---
train['PROG_VALOR'] = train['E_PRGM_ACADEMICO'] + "_" + train['E_VALORMATRICULAUNIVERSIDAD']
train['PROG_DEPT'] = train['E_PRGM_ACADEMICO'] + "_" + train['E_PRGM_DEPARTAMENTO']
test['PROG_VALOR'] = test['E_PRGM_ACADEMICO'] + "_" + test['E_VALORMATRICULAUNIVERSIDAD']
test['PROG_DEPT'] = test['E_PRGM_ACADEMICO'] + "_" + test['E_PRGM_DEPARTAMENTO']

# --- Variable objetivo ---
target_map = {'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3}
train['RENDIMIENTO_GLOBAL'] = train['RENDIMIENTO_GLOBAL'].replace(target_map)

# --- Separar X / y ---
X = train.drop(columns=["ID", "RENDIMIENTO_GLOBAL"])
y = train["RENDIMIENTO_GLOBAL"]

# --- Convertir categóricas a 'category' ---
for col in X.select_dtypes(include="object").columns:
    X[col] = X[col].astype('category').cat.codes
    test[col] = test[col].astype('category').cat.codes

# --- Split entrenamiento / validación ---
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Modelo XGBoost ---
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softmax",
    num_class=4,
    random_state=42,
    tree_method="hist"
)

# --- Entrenamiento ---
model.fit(xtr, ytr)

# --- Evaluación ---
y_pred = model.predict(xts)
print("Accuracy validación:", accuracy_score(yts, y_pred))

# --- Predicción para Kaggle ---
X_test = test.drop(columns=["ID"])
pred_test = model.predict(X_test)

# --- Convertir predicciones a etiquetas ---
inv_target_map = {v: k for k, v in target_map.items()}
pred_labels = [inv_target_map[p] for p in pred_test]


submission = pd.DataFrame({
    "ID": test["ID"],
    "RENDIMIENTO_GLOBAL": pred_labels
})

submission.to_csv("submission_xgboost.csv", index=False)
print("Archivo submission_xgboost.csv generado correctamente.")



udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


  train['RENDIMIENTO_GLOBAL'] = train['RENDIMIENTO_GLOBAL'].replace(target_map)


Accuracy validación: 0.4364115523465704
Archivo submission_xgboost.csv generado correctamente.
