<a href="https://colab.research.google.com/github/mateor32/modelos1-20252-competenciaKaggle/blob/main/99%20-%20modelo_soluci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
# Mount Google Drive at /content/drive so the competition files can be read.
# Requires the `google.colab` package (i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

print("¬°Drive conectado!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
¬°Drive conectado!


In [77]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split # Necesario para Early Stopping
import warnings

In [78]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and data-quality warnings
# raised by the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

In [79]:
# Drive folder that holds train.csv and test.csv. File names are appended directly
# to this string when loading, so the trailing slash is required.
BASE_PATH = "/content/drive/MyDrive/Competencia_Saber_Pro_Final/"

print("Cargando datos...")

Cargando datos...


In [80]:
# Original parameters that gave the 0.43 score (position 23)
LGBM_BASE_PARAMS = dict(
    objective='multiclass',
    num_class=4,
    n_estimators=800,
    learning_rate=0.04,
    random_state=42,
    verbose=-1,
    n_jobs=-1,
)

In [82]:
# Load the training and test CSVs from BASE_PATH.
print("1. Cargando datos y limpieza inicial...")
try:
    df_train = pd.read_csv(f"{BASE_PATH}train.csv")
    df_test = pd.read_csv(f"{BASE_PATH}test.csv")
except FileNotFoundError:
    print("Error: Aseg√∫rate de que BASE_PATH sea correcta y Drive est√© montado.")
    # Re-raise after the hint: otherwise df_train/df_test stay undefined and the
    # next cells fail with an unrelated NameError far from the real cause.
    raise

1. Cargando datos y limpieza inicial...


In [83]:
def clean_cols(df):
    """Normalize the column names of ``df`` in place and return ``df``.

    Every name is stripped of surrounding whitespace, has spaces and literal
    dots replaced by underscores, and is upper-cased. According to the original
    docstring this replicates the process of ``02 - preprocesado.ipynb``.
    """
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(' ', '_')
        .str.replace('.', '_', regex=False)
        .str.upper()
    )
    return df


In [84]:
# Normalize the column names of both frames with the same rules.
df_train, df_test = (clean_cols(frame) for frame in (df_train, df_test))

In [85]:
TARGET_COL = 'RENDIMIENTO_GLOBAL'

# Separate the target column from the training features.
X = df_train.drop(columns=TARGET_COL)
y = df_train[TARGET_COL]

# Work on a copy of the test frame and keep its IDs for the submission file.
X_test = df_test.copy()
test_ids = df_test['ID']

In [86]:
# Label mapping (numeric codes for the model): labels are sorted and numbered from 0.
unique_labels = sorted(y.unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))
reverse_label_map = dict(enumerate(unique_labels))
y_encoded = y.map(label_map)

In [87]:
# ----------------------------------------------------------------------
# 2. ROBUST PREPROCESSING (Imputation + OHE + Normalization)
# ----------------------------------------------------------------------

print("2. Aplicando preprocesamiento OHE y MinMax Scaling...")


2. Aplicando preprocesamiento OHE y MinMax Scaling...


In [88]:
# Identify columns: int64/float64 columns (except ID) are numeric, object columns are categorical.
numeric_candidates = X.select_dtypes(include=['int64', 'float64']).columns
num_cols = [name for name in numeric_candidates if name != 'ID']
cat_cols = list(X.select_dtypes(include=['object']).columns)

In [90]:
# Imputation (mode for the categorical columns, mean for the numeric columns).
# Each imputer is fitted on the training frame only and then applied to the test frame.
imputer_cte = SimpleImputer(strategy='most_frequent')
X[cat_cols] = imputer_cte.fit_transform(X[cat_cols])
X_test[cat_cols] = imputer_cte.transform(X_test[cat_cols])

imputer_num = SimpleImputer(strategy='mean')
X[num_cols] = imputer_num.fit_transform(X[num_cols])
X_test[num_cols] = imputer_num.transform(X_test[num_cols])

In [91]:
# One-Hot Encoding (OHE): train and test are stacked under the keys 'train'/'test'
# and encoded together, so both halves end up with the same dummy columns.
combined_encoded = pd.concat(
    [X.drop(columns='ID'), X_test.drop(columns='ID')],
    keys=['train', 'test'],
)
combined_encoded = pd.get_dummies(
    combined_encoded.drop(columns=TARGET_COL, errors='ignore'),
    drop_first=True,
    dtype=bool,
)

# Split the encoded frame back into its two halves with a fresh positional index.
X_train_features, X_test_features = (
    combined_encoded.loc[part].reset_index(drop=True) for part in ('train', 'test')
)


In [92]:
# FIX: sanitize column names and make them unique (crucial for LightGBM)
print("Sanitizando nombres de columnas...")
def sanitize_and_uniquify_columns(df):
    """Sanitize the column names of ``df`` in place and make them unique.

    Each name is NFKD-normalized, stripped of non-ASCII characters, and every
    remaining non-alphanumeric character is replaced by ``_``. Names that
    collide after cleaning get a numeric suffix (``name``, ``name_1``,
    ``name_2``, ...). The suffix is advanced until the candidate is not already
    taken, so the result is unique even when a suffixed name coincides with
    another cleaned column (e.g. ``['A', 'A', 'A_1']``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string column labels are rewritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns replaced.
    """
    # Decompose accented characters and drop whatever is not ASCII.
    ascii_names = (
        df.columns.str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )
    # One pass maps every non-alphanumeric character (punctuation, spaces, ...)
    # to '_', so no separate regex replacement of special characters is needed.
    cleaned = [
        ''.join(ch if ch.isalnum() else '_' for ch in name)
        for name in ascii_names
    ]

    occurrences = {}   # cleaned name -> next suffix to try for that name
    taken = set()      # every final name emitted so far
    unique_names = []
    for name in cleaned:
        suffix = occurrences.get(name, 0)
        candidate = name if suffix == 0 else f'{name}_{suffix}'
        # Skip suffixes that would duplicate a name that is already in use.
        while candidate in taken:
            suffix += 1
            candidate = f'{name}_{suffix}'
        occurrences[name] = suffix + 1
        taken.add(candidate)
        unique_names.append(candidate)

    df.columns = unique_names
    return df

# Apply the same column-name sanitization to the train and test feature frames.
X_train_features, X_test_features = (
    sanitize_and_uniquify_columns(frame)
    for frame in (X_train_features, X_test_features)
)

Sanitizando nombres de columnas...


In [93]:
# MinMax normalization (described by the original author as needed for the original version)
print("Aplicando MinMax Scaling...")
# Only columns whose name contains 'INDICADOR_' or 'PERIODO_ACADEMICO' are scaled.
column_normalizar = [col for col in X_train_features.columns if 'INDICADOR_' in col or 'PERIODO_ACADEMICO' in col]
scaler = MinMaxScaler()
# The scaler is fitted on the training features and reused unchanged on the test features.
X_train_features[column_normalizar] = scaler.fit_transform(X_train_features[column_normalizar])
X_test_features[column_normalizar] = scaler.transform(X_test_features[column_normalizar])

Aplicando MinMax Scaling...


In [94]:
# ----------------------------------------------------------------------
# 3. MODELING: BASE LIGHTGBM (STABLE SCORE)
# ----------------------------------------------------------------------

print("3. Entrenando LightGBM base (800 estimadores) con Early Stopping...")

3. Entrenando LightGBM base (800 estimadores) con Early Stopping...


In [95]:
# Create a validation set (10% of the training rows, stratified on the encoded
# target) to be used for early stopping.
X_t, X_val, y_t, y_val = train_test_split(
    X_train_features,
    y_encoded,
    test_size=0.1,
    random_state=42,
    stratify=y_encoded
)

# Classifier configured entirely from LGBM_BASE_PARAMS.
model = lgb.LGBMClassifier(**LGBM_BASE_PARAMS)

In [96]:
# Training with early stopping: multi_logloss is evaluated on (X_val, y_val) and the
# callback is configured with a patience of 100 rounds.
# NOTE(review): in the recorded run early stopping was never met (best iteration 800),
# so the model trained for all configured estimators.
model.fit(
    X_t, y_t,
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(100, verbose=True)]
)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[800]	valid_0's multi_logloss: 1.19726


In [97]:
# Prediction on the test features; the values are decoded with reverse_label_map afterwards.
predictions_numeric = model.predict(X_test_features)

In [98]:
# Map the numeric predictions back to the text labels
final_predictions = pd.Series(predictions_numeric).map(reverse_label_map)



In [99]:
# Generate the submission file: test IDs paired with the decoded predictions.
submission_filename = 'submission_99_ORIGINAL_REVERSION.csv'

submission_df = pd.DataFrame({'ID': test_ids, 'RENDIMIENTO_GLOBAL': final_predictions})
submission_df.to_csv(submission_filename, index=False)

# Report the generated file between two separator lines.
print("\n=========================================================")
print(f"üéâ √âXITO: Revertido a la versi√≥n original. Archivo generado: {submission_filename}")
print("=========================================================")


üéâ √âXITO: Revertido a la versi√≥n original. Archivo generado: submission_99_ORIGINAL_REVERSION.csv
