<a href="https://colab.research.google.com/github/mateor32/modelos1-20252-competenciaKaggle/blob/main/03%20-%20modelo%20con%20preprocesado%20SVC%20con%20Normalizaci%C3%B3n%20por%20Estandarizaci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Printed once drive.mount() has returned.
print("¡Drive conectado!")

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC # CAMBIO DE MODELO: SVC Lineal
from sklearn.preprocessing import StandardScaler # CAMBIO DE SCALER: StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.multiclass import OneVsRestClassifier
import warnings

In [None]:
# Ignore every warning raised through the `warnings` module from this point on.
warnings.filterwarnings('ignore')

In [None]:
# ----------------------------------------------------------------------
# 1. DATA LOADING AND INITIAL CLEANUP
# ----------------------------------------------------------------------

# Change this path to your own Drive folder (keep the trailing slash: the
# file names are appended to it directly).
BASE_PATH = "/content/drive/MyDrive/Competencia_Saber_Pro_Final/"

print("1. Cargando datos...")
try:
    df_train = pd.read_csv(f"{BASE_PATH}train.csv")
    df_test = pd.read_csv(f"{BASE_PATH}test.csv")
except FileNotFoundError:
    print("Error: Asegúrate de que BASE_PATH sea correcta y Drive esté montado.")
    # Stop here: without both frames every later cell would fail with a
    # NameError that hides the real cause (the missing file).
    raise

def clean_cols(df):
    """Normalize the column labels of *df* in place and return the same frame.

    Surrounding whitespace is stripped, spaces and literal dots are replaced
    by underscores, and the result is upper-cased.
    """
    df.columns = (
        df.columns.str.strip()
        .str.replace(' ', '_')
        .str.replace('.', '_', regex=False)
        .str.upper()
    )
    return df

# Normalize the column labels of both frames with the same rules.
df_train, df_test = clean_cols(df_train), clean_cols(df_test)

# Separate the target column from the training features.
TARGET_COL = 'RENDIMIENTO_GLOBAL'
X = df_train.drop(columns=TARGET_COL)
y = df_train[TARGET_COL]

# Work on a copy of the test frame; its IDs are kept aside for the submission.
X_test = df_test.copy()
test_ids = df_test['ID']

In [None]:
# ----------------------------------------------------------------------
# 2. PREPROCESSING (imputation + one-hot encoding + standard scaling)
# ----------------------------------------------------------------------

print("2. Aplicando preprocesamiento: Imputación, OHE y Standard Scaling...")

# Split the feature columns by dtype; 'ID' is left out of the numeric list.
num_cols = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col != 'ID'
]
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Categorical gaps are filled with the most frequent training value.
imputer_cte = SimpleImputer(strategy='most_frequent').fit(X[cat_cols])
X[cat_cols] = imputer_cte.transform(X[cat_cols])
X_test[cat_cols] = imputer_cte.transform(X_test[cat_cols])

# Numeric gaps are filled with the training mean.
imputer_num = SimpleImputer(strategy='mean').fit(X[num_cols])
X[num_cols] = imputer_num.transform(X[num_cols])
X_test[num_cols] = imputer_num.transform(X_test[num_cols])

# One-hot encode train and test stacked together so both end up with the
# same dummy columns; the 'train'/'test' keys allow splitting them again.
combined_encoded = pd.concat(
    [X.drop(columns='ID'), X_test.drop(columns='ID')],
    keys=['train', 'test'],
)
combined_encoded = pd.get_dummies(
    combined_encoded.drop(columns=TARGET_COL, errors='ignore'),
    drop_first=True,
    dtype=bool,
)

X_train_features = combined_encoded.loc['train'].reset_index(drop=True)
X_test_features = combined_encoded.loc['test'].reset_index(drop=True)

In [None]:
# Mandatory fix: sanitize the feature names and make them unique.
print("Sanitizando nombres de columnas para evitar errores de LightGBM/scikit-learn...")
def sanitize_and_uniquify_columns(df):
    """Rewrite the column labels of *df* as unique ASCII identifiers, in place.

    Each label is NFKD-normalized and stripped of non-ASCII characters, and
    every remaining non-alphanumeric character becomes an underscore. Labels
    that end up identical are disambiguated with a numeric suffix (``name``,
    ``name_1``, ``name_2``, ...). A suffix that is already taken by another
    column is skipped, so the result never contains duplicates.

    Args:
        df: DataFrame whose (string) column labels are rewritten.

    Returns:
        The same DataFrame object, with its ``columns`` replaced.
    """
    # Nothing to rename; also avoids using .str on a non-string empty index.
    if len(df.columns) == 0:
        return df

    # The hyphen is escaped so it is a literal character: unescaped, `"-/`
    # is read as a range inside the character class.
    labels = df.columns.str.replace(r'[<>()\[\]{},.:;\'\"\-/]', '_', regex=True)
    # Decompose accented characters, then drop whatever is not ASCII.
    labels = labels.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    cleaned = [''.join(ch if ch.isalnum() else '_' for ch in label) for label in labels]

    taken = set()        # names already assigned to an earlier column
    last_suffix = {}     # base name -> highest suffix tried so far
    new_cols = []
    for base in cleaned:
        candidate = base
        suffix = last_suffix.get(base, 0)
        # Keep bumping the suffix until the name is free, so a generated
        # "a_1" can never collide with a column that is literally "a_1".
        while candidate in taken:
            suffix += 1
            candidate = f'{base}_{suffix}'
        last_suffix[base] = suffix
        taken.add(candidate)
        new_cols.append(candidate)

    df.columns = new_cols
    return df

# Rename the columns of both feature matrices with the same function.
X_train_features = sanitize_and_uniquify_columns(X_train_features)
X_test_features = sanitize_and_uniquify_columns(X_test_features)

In [None]:
# DIFFERENT STEP: standard scaling (essential for SVC)
print("Aplicando Standard Scaling (diferente a MinMaxScaler)...")

# Only columns whose name contains one of these markers are standardized.
column_normalizar = [
    col
    for col in X_train_features.columns
    if any(marker in col for marker in ('INDICADOR_', 'PERIODO_ACADEMICO'))
]

# Fit on the training matrix only, then apply the same scaler to both matrices.
scaler = StandardScaler().fit(X_train_features[column_normalizar])
X_train_features[column_normalizar] = scaler.transform(X_train_features[column_normalizar])
X_test_features[column_normalizar] = scaler.transform(X_test_features[column_normalizar])

In [None]:
# ----------------------------------------------------------------------
# 3. MODELING AND SUBMISSION GENERATION (LinearSVC)
# ----------------------------------------------------------------------

print("3. Entrenando LinearSVC y generando predicciones...")

In [None]:
# Label mapping: the model is trained on integer codes, one per class,
# assigned in the sorted order of the original labels.
unique_labels = sorted(y.unique())
reverse_label_map = dict(enumerate(unique_labels))
label_map = {label: code for code, label in reverse_label_map.items()}
y_encoded = y.map(label_map)

In [None]:
# DIFFERENT MODEL: LinearSVC (a linear SVC that scales better to large data).
# It is wrapped in OneVsRestClassifier to handle multi-class classification.
# C and max_iter are set explicitly because of the large dataset; no `tol`
# argument is passed, so the tolerance stays at the library default.
model_svc = LinearSVC(
    penalty='l2',
    loss='hinge',       # 'hinge' is the loss of the traditional SVC
    C=0.1,              # Regularization parameter
    dual=True,          # NOTE(review): per the scikit-learn docs, penalty='l2' with loss='hinge' is only supported with dual=True, so this looks required rather than a tuning choice for n_samples > n_features — confirm
    max_iter=5000,      # More iterations to help convergence
    random_state=42
)

In [None]:
# Wrap the base estimator in a one-vs-rest scheme.
# NOTE(review): per the scikit-learn docs LinearSVC already handles multi-class
# targets with a one-vs-rest strategy by default, so this wrapper presumably
# only adds the n_jobs=-1 parallelism across the per-class models — confirm.
model = OneVsRestClassifier(model_svc, n_jobs=-1)

In [None]:
# Training: fit on the training feature matrix and the integer-encoded labels.
model.fit(X_train_features, y_encoded)

In [None]:
# Prediction and mapping: predict integer class codes for the test matrix,
# then translate them back to the original labels with reverse_label_map.
predictions_numeric = model.predict(X_test_features)
final_predictions = pd.Series(predictions_numeric).map(reverse_label_map)

In [None]:
# Build the submission file: the test IDs next to their predicted labels.
submission_filename = 'submission_03_SVM.csv'
submission_df = pd.DataFrame(
    {
        'ID': test_ids,
        'RENDIMIENTO_GLOBAL': final_predictions,
    }
)

# index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv(submission_filename, index=False)

print(f"\n¡Notebook 03 completado! Archivo: {submission_filename}")