In [2]:
import os
import json
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold

In [3]:
os.environ['KAGGLE_CONFIG_DIR'] = "."


In [3]:
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 2.06GB/s]


In [4]:
!unzip udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip


Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

TARGET = "RENDIMIENTO_GLOBAL"
ID = "ID"

X = train.drop(columns=[TARGET])
y = train[TARGET]
X_test = test.copy()

print("Datos cargados. Shapes:", X.shape, X_test.shape)


Datos cargados. Shapes: (692500, 20) (296786, 20)


In [5]:
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numéricas: {len(num_cols)} | Categóricas: {len(cat_cols)}")


Numéricas: 6 | Categóricas: 14


In [6]:
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

print("Imputación completada.")



Imputación completada.


In [7]:
label_enc = LabelEncoder()
label_cols = [c for c in cat_cols if X[c].nunique() <= 10]

for col in label_cols:
    X[col] = label_enc.fit_transform(X[col])
    X_test[col] = label_enc.transform(X_test[col])

print("Columnas codificadas con LabelEncoder:", label_cols)



Columnas codificadas con LabelEncoder: ['E_VALORMATRICULAUNIVERSIDAD', 'E_HORASSEMANATRABAJA', 'F_ESTRATOVIVIENDA', 'F_TIENEINTERNET', 'F_TIENELAVADORA', 'F_TIENEAUTOMOVIL', 'E_PRIVADO_LIBERTAD', 'E_PAGOMATRICULAPROPIO', 'F_TIENECOMPUTADOR', 'F_TIENEINTERNET.1']


In [8]:
onehot_cols = [c for c in cat_cols if c not in label_cols]

X = pd.get_dummies(X, columns=onehot_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=onehot_cols, drop_first=True)

X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

print("OHE + alineación completadas. Nuevas shapes:")
print(X.shape, X_test.shape)


OHE + alineación completadas. Nuevas shapes:
(692500, 1015) (296786, 1015)


In [9]:
# =============================================
# ELIMINAR COLUMNAS CONSTANTES Y CUASI-CONSTANTES
# =============================================

# 1. Eliminar columnas constantes
cols_before = X.shape[1]
constant_cols = [c for c in X.columns if X[c].nunique() == 1]

X = X.drop(columns=constant_cols)
X_test = X_test.drop(columns=constant_cols, errors="ignore")

print(f"Columnas eliminadas por ser constantes: {len(constant_cols)}")

# 2. Eliminar columnas cuasi-constantes (más del 99.9% iguales)
quasi_constant_cols = []

for col in X.columns:
    top_freq = X[col].value_counts(normalize=True, dropna=False).iloc[0]
    if top_freq > 0.999:
        quasi_constant_cols.append(col)

X = X.drop(columns=quasi_constant_cols)
X_test = X_test.drop(columns=quasi_constant_cols, errors="ignore")

print(f"Columnas cuasi-constantes eliminadas (>99.9% igual): {len(quasi_constant_cols)}")
print(f"Shape final después de reducción ligera: {X.shape}")


Columnas eliminadas por ser constantes: 0
Columnas cuasi-constantes eliminadas (>99.9% igual): 830
Shape final después de reducción ligera: (692500, 185)


In [10]:
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("Escalado completado.")



Escalado completado.


In [11]:
model = RandomForestClassifier(
    n_estimators=120,       # <--- menos árboles (= velocidad)
    max_depth=20,           # <--- limita crecimiento
    min_samples_split=5,    # <--- menos divisiones pequeñas
    n_jobs=-1,
    random_state=42
)

print("Entrenando modelo...")
model.fit(X, y)
print("Entrenamiento completado.")


Entrenando modelo...
Entrenamiento completado.


In [12]:
test_pred = model.predict(X_test)

submission = pd.DataFrame({
    ID: test[ID],
    TARGET: test_pred
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,alto
1,98545,medio-alto
2,499179,alto
3,782980,bajo
4,785185,bajo


In [13]:
!kaggle competitions submit -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia -f submission.csv -m "Modelo solución optimizado"


100% 3.95M/3.95M [00:00<00:00, 8.81MB/s]
Successfully submitted to UDEA/ai4eng 20252 - Pruebas Saber Pro Colombia