<a href="https://colab.research.google.com/github/mateor32/modelos1-20252-competenciaKaggle/blob/main/04%20-%20modelo_con_preprocesado_Random_Forest_sin_Normalizacion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only module); the data files
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

print("¡Drive conectado!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
¡Drive conectado!


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier # CAMBIO DE MODELO
from sklearn.impute import SimpleImputer
import warnings

In [None]:
# Suppress every warning for the rest of the session. Note that this blanket
# filter also hides potentially useful diagnostics from the libraries in use.
warnings.filterwarnings('ignore')

In [None]:
# ----------------------------------------------------------------------
# 1. DATA LOADING AND INITIAL CLEANUP
# ----------------------------------------------------------------------

# Drive folder that holds train.csv and test.csv. The trailing slash matters:
# file names are appended to it by plain string formatting.
BASE_PATH = "/content/drive/MyDrive/Competencia_Saber_Pro_Final/"

print("1. Cargando datos...")
try:
    df_train = pd.read_csv(f"{BASE_PATH}train.csv")
    df_test = pd.read_csv(f"{BASE_PATH}test.csv")
except FileNotFoundError:
    print("Error: Asegúrate de que BASE_PATH sea correcta y Drive esté montado.")
    # Re-raise after the hint: swallowing the error would let the cell keep
    # running and fail a few lines later with an unrelated NameError on
    # df_train / df_test, hiding the real cause.
    raise

def clean_cols(df):
    """Normalise the column labels of ``df`` in place and return the same frame.

    Each label is stripped of surrounding whitespace, has its spaces and
    literal dots replaced by underscores, and is converted to upper case.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The very same ``df`` object, with its ``columns`` attribute replaced.
    """
    normalised = (
        df.columns.str.strip()
        .str.replace(' ', '_')
        .str.replace('.', '_', regex=False)  # literal dot, not the regex wildcard
        .str.upper()
    )
    df.columns = normalised
    return df

# Apply the same header normalisation to both frames (train first, then test).
df_train, df_test = clean_cols(df_train), clean_cols(df_test)

TARGET_COL = 'RENDIMIENTO_GLOBAL'

# Separate the training frame into the feature matrix and the label vector.
X = df_train.drop(columns=[TARGET_COL])
y = df_train[TARGET_COL]

# Work on a copy of the test frame so df_test itself is not modified by the
# preprocessing below; the ID column is kept aside for the submission file.
X_test = df_test.copy()
test_ids = df_test['ID']

1. Cargando datos...


In [None]:
# ----------------------------------------------------------------------
# 2. PREPROCESSING (imputation + one-hot encoding)
# ----------------------------------------------------------------------

# Progress banner; the second message records that feature scaling is skipped
# because a tree-based model is used.
print("2. Aplicando preprocesamiento: Imputación y OHE.")
print("    NOTA: Se omite la normalización por usar un modelo de árboles.")

2. Aplicando preprocesamiento: Imputación y OHE.
    NOTA: Se omite la normalización por usar un modelo de árboles.


In [None]:
# Group the feature columns by dtype: int64/float64 columns (excluding the
# 'ID' key) are treated as numeric, object columns as categorical.
num_cols = (
    X.select_dtypes(include=['int64', 'float64'])
    .columns.drop('ID', errors='ignore')
    .tolist()
)
cat_cols = list(X.select_dtypes(include=['object']).columns)


In [None]:
# Imputation: most frequent value for categorical columns, mean for numeric ones.
# Both imputers are fitted on the training features only and then reused on the
# test features, so the fill values come exclusively from the training data.
# Note: X and X_test are overwritten in place by these assignments.
imputer_cte = SimpleImputer(strategy='most_frequent')
X[cat_cols] = imputer_cte.fit_transform(X[cat_cols])
X_test[cat_cols] = imputer_cte.transform(X_test[cat_cols])

imputer_num = SimpleImputer(strategy='mean')
X[num_cols] = imputer_num.fit_transform(X[num_cols])
X_test[num_cols] = imputer_num.transform(X_test[num_cols])

In [None]:
# One-hot encoding (OHE).
# Train and test are stacked (without 'ID') before encoding so that both end up
# with exactly the same dummy columns; the 'train'/'test' keys form the outer
# index level used to split them apart again afterwards.
combined_encoded = pd.concat([X.drop('ID', axis=1), X_test.drop('ID', axis=1)], keys=['train', 'test'])
# errors='ignore' makes dropping the target a no-op when the column is absent;
# drop_first=True omits the first level of every encoded categorical column.
combined_encoded = pd.get_dummies(combined_encoded.drop(TARGET_COL, axis=1, errors='ignore'), drop_first=True, dtype=bool)

# Recover the two partitions by their outer key and give each a fresh 0..n-1 index.
X_train_features = combined_encoded.loc['train'].reset_index(drop=True)
X_test_features = combined_encoded.loc['test'].reset_index(drop=True)

In [None]:
# Column-name sanitisation and uniqueness fix (flagged as mandatory by the author).
print("Sanitizando nombres de columnas para evitar errores...")
def sanitize_and_uniquify_columns(df):
    """Rewrite the column labels of ``df`` as unique, ASCII, alphanumeric/underscore names.

    Steps applied to every label:
      1. NFKD-normalise and drop anything that cannot be encoded as ASCII
         (so an accented letter keeps only its base letter).
      2. Replace every remaining non-alphanumeric character with ``_``.
      3. De-duplicate: the first occurrence of a name is kept as is, later
         occurrences receive ``_1``, ``_2``, ... suffixes.

    Unlike a plain occurrence counter, step 3 also checks the names already
    emitted, so a generated ``name_1`` can never collide with a column that
    was literally called ``name_1``; the suffix is bumped until it is free.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same ``df`` object, with its ``columns`` replaced in place.
    """
    ascii_names = (
        df.columns.str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )
    # A separate punctuation-regex pass is unnecessary: every punctuation
    # character is non-alphanumeric ASCII and is therefore replaced here.
    cleaned = [
        ''.join(ch if ch.isalnum() else '_' for ch in name)
        for name in ascii_names
    ]

    occurrences = {}   # base name -> next suffix to try for that name
    taken = set()      # final names already assigned
    unique_names = []
    for name in cleaned:
        suffix = occurrences.get(name, 0)
        candidate = name if suffix == 0 else f'{name}_{suffix}'
        # Skip over any candidate that is already in use, whether it was an
        # original label or a previously generated one.
        while candidate in taken:
            suffix += 1
            candidate = f'{name}_{suffix}'
        occurrences[name] = suffix + 1
        taken.add(candidate)
        unique_names.append(candidate)

    df.columns = unique_names
    return df

# Sanitise both feature matrices with the same function. They are processed
# independently, so their columns must already be in the same order for the
# resulting names to match.
X_train_features = sanitize_and_uniquify_columns(X_train_features)
X_test_features = sanitize_and_uniquify_columns(X_test_features)

Sanitizando nombres de columnas para evitar errores...


In [None]:
# ----------------------------------------------------------------------
# 3. MODELLING AND SUBMISSION GENERATION (Random Forest)
# ----------------------------------------------------------------------

print("3. Entrenando Random Forest y generando predicciones...")

# Encode the target labels as consecutive integers following their sorted
# order, and keep the inverse mapping to decode predictions later.
unique_labels = sorted(y.unique())
reverse_label_map = dict(enumerate(unique_labels))
label_map = {label: code for code, label in reverse_label_map.items()}
y_encoded = y.map(label_map)

3. Entrenando Random Forest y generando predicciones...


In [None]:
# Alternative model: Random Forest (configured by the author for a large dataset).
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,          # Limits the complexity of each tree
    random_state=42,
    n_jobs=-1,             # Use all available CPU cores
    min_samples_split=5
)

In [None]:
# Training: fit the forest on the encoded training features and integer labels.
model.fit(X_train_features, y_encoded)

In [None]:
# Prediction and mapping: predict integer class codes for the test features,
# then translate them back to the original labels via reverse_label_map.
predictions_numeric = model.predict(X_test_features)
final_predictions = pd.Series(predictions_numeric).map(reverse_label_map)

In [None]:
# Build the submission file.
# Both inputs are Series, so the DataFrame constructor aligns them on their
# index labels. NOTE(review): this assumes test_ids carries the same default
# 0..n-1 index as final_predictions - confirm if df_test is ever re-indexed.
submission_df = pd.DataFrame({
    'ID': test_ids,
    'RENDIMIENTO_GLOBAL': final_predictions
})

# Relative path: the CSV is written to the kernel's current working directory.
submission_filename = 'submission_04_Random_Forest.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"\n¡Notebook 04 completado! Archivo: {submission_filename}")


¡Notebook 04 completado! Archivo: submission_04_Random_Forest.csv
