In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC

In [2]:
os.environ['KAGGLE_CONFIG_DIR'] = "."

In [None]:
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.33GB/s]


In [None]:
!unzip udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip

Archive:  udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


In [3]:
TARGET = "RENDIMIENTO_GLOBAL"
ID = "ID"

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]
X_test = test.copy()

print("Datos cargados correctamente.")
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Datos cargados correctamente.
Train shape: (692500, 20)
Test shape: (296786, 20)


In [4]:
num_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"Columnas numéricas: {len(num_cols)}")
print(f"Columnas categóricas: {len(cat_cols)}")


Columnas numéricas: 6
Columnas categóricas: 14


In [5]:
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

print("Imputación completada.")


Imputación completada.


In [6]:
label_enc = LabelEncoder()
label_cols = [c for c in cat_cols if X_train[c].nunique() <= 10]

for col in label_cols:
    X_train[col] = label_enc.fit_transform(X_train[col])
    X_test[col] = label_enc.transform(X_test[col])

print("Columnas con LabelEncoder:", label_cols)

Columnas con LabelEncoder: ['E_VALORMATRICULAUNIVERSIDAD', 'E_HORASSEMANATRABAJA', 'F_ESTRATOVIVIENDA', 'F_TIENEINTERNET', 'F_TIENELAVADORA', 'F_TIENEAUTOMOVIL', 'E_PRIVADO_LIBERTAD', 'E_PAGOMATRICULAPROPIO', 'F_TIENECOMPUTADOR', 'F_TIENEINTERNET.1']


In [7]:
onehot_cols = [c for c in cat_cols if c not in label_cols]

X_train = pd.get_dummies(X_train, columns=onehot_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=onehot_cols, drop_first=True)

# Alinear columnas
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

print("One-Hot Encoding y alineación completados.")
print("Shape final de X_train:", X_train.shape)


One-Hot Encoding y alineación completados.
Shape final de X_train: (692500, 1015)


In [8]:
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("Escalado de variables numéricas completado.")


Escalado de variables numéricas completado.


In [9]:
X_sample = X_train.sample(frac=0.02, random_state=42)
y_sample = y_train.loc[X_sample.index]

print("Subset creado.")
print("Tamaño subset:", X_sample.shape)


Subset creado.
Tamaño subset: (13850, 1015)


In [10]:
X_tr, X_val, y_tr, y_val = train_test_split(
    X_sample, y_sample,
    test_size=0.2,
    random_state=42,
    stratify=y_sample
)

print("Split completado.")

Split completado.


In [11]:
svm_model = SVC(kernel="rbf", probability=False)

print("Entrenando SVM (RBF) con subset...")
svm_model.fit(X_tr, y_tr)

y_pred = svm_model.predict(X_val)
acc = accuracy_score(y_val, y_pred)

print(f"Accuracy (SVM con subset 2%): {acc:.4f}")

Entrenando SVM (RBF) con subset...
Accuracy (SVM con subset 2%): 0.4065


In [12]:
print("Entrenando SVM (RBF) con subset del 2%...")
svm_model.fit(X_tr, y_tr)
print("Modelo entrenado.")

# Evaluación SOLO en la validación
y_pred = svm_model.predict(X_val)
acc = accuracy_score(y_val, y_pred)

print(f"Accuracy final (SVM, subset 2%): {acc:.4f}")

# Mostrar primeras predicciones
pd.DataFrame({
    "y_real": y_val.values[:10],
    "y_pred": y_pred[:10]
})

Entrenando SVM (RBF) con subset del 2%...
Modelo entrenado.
Accuracy final (SVM, subset 2%): 0.4065


Unnamed: 0,y_real,y_pred
0,alto,alto
1,medio-bajo,medio-bajo
2,alto,alto
3,medio-bajo,medio-bajo
4,medio-bajo,medio-bajo
5,bajo,bajo
6,alto,alto
7,medio-alto,bajo
8,alto,alto
9,bajo,bajo
