In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, RobustScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings

# Silence every warning for the rest of the session. Note that this is a
# blanket filter: it also hides deprecation/convergence warnings from the
# modelling libraries used below.
warnings.filterwarnings('ignore')
# Random seed passed to each base learner (XGBoost, LightGBM, CatBoost).
SEED = 42

def load_and_enrich_data(train_path, test_path):
    """Load the train/test CSVs, stack them and add engineered features.

    Both files are read with ``pd.read_csv``, tagged with an ``is_train``
    column (1 for train rows, 0 for test rows) and concatenated into one
    frame with a fresh 0..n-1 index, so train rows come first.

    Engineered columns (each only when its source column exists):
      * ``MULTIPLE_BIRTH`` -- 1 when ``DPLURAL > 1``, else 0.
      * ``IS_PREMATURE``   -- 1 when ``GESTREC3 == 1``, else 0.
      * ``IS_MALE``        -- 1 when ``SEX == 'M'``, else 0.
      * ``TOTAL_CIGS``     -- row sum of ``CIG_0..CIG_3`` (needs all four),
        NaN when every component is missing.
      * ``RISK_SCORE``     -- row sum of the binarised ``RF_*`` columns.

    Args:
        train_path: Path of the training CSV (expected to hold ``DBWT``).
        test_path: Path of the test CSV.

    Returns:
        ``(df, target)``: the combined, imputed frame and the target
        column name (``'DBWT'``).

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file is
            missing.
    """
    print("--- 1. Carga y Enriquecimiento de Datos (Focus: Extremos) ---")
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    target = 'DBWT'

    train['is_train'] = 1
    test['is_train'] = 0

    # Align columns: any non-target train column absent from test is created
    # there and filled with 0.
    missing_cols = set(train.columns) - set(test.columns) - {target}
    for c in missing_cols:
        test[c] = 0

    df = pd.concat([train, test], axis=0, ignore_index=True)

    # --- A. Extreme-weight indicators ---
    # The comparisons are vectorised; NaN compares False and therefore maps
    # to 0, exactly as the row-wise version did. The dtype is pinned to int64
    # so the dtype-based selections further down still include the columns.

    # Plurality (original author's note: 1=single, 2=twin, 3=triplet, ...).
    # Only "single vs. anything else" is kept.
    if 'DPLURAL' in df.columns:
        df['MULTIPLE_BIRTH'] = (df['DPLURAL'] > 1).astype('int64')

    # Gestation recode (original author's note: 1=under 37 weeks, 2=37+,
    # 3=unknown). Unknown is folded into 0 together with full-term.
    if 'GESTREC3' in df.columns:
        df['IS_PREMATURE'] = (df['GESTREC3'] == 1).astype('int64')

    if 'SEX' in df.columns:
        df['IS_MALE'] = (df['SEX'] == 'M').astype('int64')

    # --- B. Standard cleaning ---
    # 99 / 999 are treated as "unknown" sentinels in the cigarette counts.
    cig_cols = ['CIG_0', 'CIG_1', 'CIG_2', 'CIG_3']
    for col in cig_cols:
        if col in df.columns:
            df[col] = df[col].replace([99, 999], np.nan)

    if set(cig_cols).issubset(df.columns):
        # min_count=1 keeps the total NaN when all four components are NaN.
        df['TOTAL_CIGS'] = df[cig_cols].sum(axis=1, min_count=1)

    # Risk-factor flags -> 0/1, then summed into a count.
    rf_cols = [c for c in df.columns if c.startswith('RF_')]
    if rf_cols:
        for col in rf_cols:
            as_text = df[col].astype(str).str.upper()
            # A numeric flag stored as float stringifies to '1.0', which the
            # text test alone would silently encode as 0; compare the numeric
            # value as well.
            as_number = pd.to_numeric(df[col], errors='coerce')
            is_set = as_text.isin(['Y', '1', 'YES']) | (as_number == 1)
            df[col] = is_set.astype('int64')
        df['RISK_SCORE'] = df[rf_cols].sum(axis=1)

    # Imputation: column medians for numeric data, 'Missing' for categoricals.
    # NOTE(review): medians are computed over train and test rows together,
    # and a numeric target column is imputed too (test rows receive the
    # median target) -- confirm that callers always discard it for test rows.
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    df[cat_cols] = df[cat_cols].fillna('Missing')

    return df, target

def get_aggressive_stacking():
    """Build the (unfitted) lightly-regularised stacking regressor.

    Structure, outermost first:
      * ``TransformedTargetRegressor`` -- fits on ``log1p(y)`` and maps
        predictions back with ``expm1``.
      * ``Pipeline`` -- dtype-based preprocessing followed by the stack.
      * ``StackingRegressor`` -- XGBoost, LightGBM and CatBoost base
        learners (5-fold out-of-fold predictions) blended by ``RidgeCV``.

    Returns:
        The configured ``TransformedTargetRegressor``; call ``fit`` with a
        pandas DataFrame so the dtype-based column selection works.
    """
    # Imported here so the block is self-contained with respect to the
    # file's import section.
    from sklearn.compose import make_column_selector

    print("--- 2. Configurando Stacking 'Agresivo' ---")

    # Preprocessing: robust scaling for int64/float64 columns, one-hot for
    # object/category columns; any other dtype is dropped (ColumnTransformer
    # default). make_column_selector picks the same columns the former lambda
    # selectors did, but keeps the fitted model picklable.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(),
             make_column_selector(dtype_include=['int64', 'float64'])),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False),
             make_column_selector(dtype_include=['object', 'category'])),
        ])

    # Base learners: settings chosen by the original author to let the trees
    # fit the tails of the weight distribution.
    xgb = XGBRegressor(
        n_estimators=1500,
        learning_rate=0.03,   # small steps, many trees
        max_depth=7,          # deeper trees = more complex interactions
        min_child_weight=1,   # allows leaves backed by very few samples
        reg_lambda=1,         # L2 penalty, set explicitly
        n_jobs=-1,
        random_state=SEED
    )

    lgbm = LGBMRegressor(
        n_estimators=1500,
        learning_rate=0.03,
        num_leaves=40,
        min_child_samples=10,  # allows small leaf groups
        n_jobs=-1,
        random_state=SEED,
        verbose=-1
    )

    cat = CatBoostRegressor(
        iterations=1500,
        learning_rate=0.03,
        depth=7,
        l2_leaf_reg=1,
        verbose=0,
        random_seed=SEED,
        allow_writing_files=False
    )

    # Meta learner: RidgeCV chooses its alpha among the listed candidates.
    meta_learner = RidgeCV(alphas=[0.1, 1.0, 10.0])

    stacking = StackingRegressor(
        estimators=[('xgb', xgb), ('lgbm', lgbm), ('cat', cat)],
        final_estimator=meta_learner,
        cv=5,
        n_jobs=-1
    )

    final_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', stacking)
    ])

    # The target is modelled on the log1p scale and inverted on predict.
    return TransformedTargetRegressor(regressor=final_pipeline, func=np.log1p, inverse_func=np.expm1)

def post_process_variance(y_pred, y_train_sample, *, max_scale=None, min_value=100):
    """Rescale predictions so their mean and std match the training target.

    Applies ``y_new = (y - mu_pred) * (sigma_train / sigma_pred) + mu_train``
    and then clips the result from below at ``min_value``.

    Args:
        y_pred: Raw predictions (array-like).
        y_train_sample: Training target values providing the reference
            mean and standard deviation.
        max_scale: Optional upper bound on the stretch factor. ``None``
            (default) applies the full ``sigma_train / sigma_pred`` ratio.
        min_value: Lower bound applied to the calibrated predictions
            (default 100).

    Returns:
        The calibrated predictions. When ``y_pred`` has zero (or non-finite)
        spread there is nothing to stretch, so the factor falls back to 1.0
        and the values are only re-centred on the training mean.
    """
    print("--- 4. Post-Procesamiento: Calibración de Varianza ---")

    # Plain sequences do not support the arithmetic below.
    if isinstance(y_pred, (list, tuple)):
        y_pred = np.asarray(y_pred, dtype=float)

    mu_train = np.mean(y_train_sample)
    sigma_train = np.std(y_train_sample)

    mu_pred = np.mean(y_pred)
    sigma_pred = np.std(y_pred)

    print(f"   Original Pred Std: {sigma_pred:.2f} | Target Train Std: {sigma_train:.2f}")

    # Guard the division: constant predictions would otherwise yield
    # 0 * inf = NaN for every row.
    if np.isfinite(sigma_pred) and sigma_pred > 0:
        scale = sigma_train / sigma_pred
    else:
        scale = 1.0

    # Optional cap so a very narrow prediction spread cannot be over-stretched.
    if max_scale is not None:
        scale = min(scale, max_scale)

    # Re-centre on the training mean (this also corrects any mean bias).
    y_calibrated = (y_pred - mu_pred) * scale + mu_train

    print(f"   Predicciones re-escaladas (Factor: {scale:.4f})")

    # Safety floor: stretching can push low predictions to implausible values.
    y_calibrated = np.maximum(y_calibrated, min_value)

    return y_calibrated

def main():
    """Run the cell end to end: load, fit, predict, calibrate and save.

    Reads ``train.csv`` / ``test.csv`` from the working directory, fits the
    stacking model on the train rows, predicts the test rows, calibrates the
    predictions against the training target and writes
    ``final_calibrated_submission.csv``. Returns early with a message when an
    input file is missing.
    """
    train_csv, test_csv = 'train.csv', 'test.csv'

    try:
        combined, target_name = load_and_enrich_data(train_csv, test_csv)
    except FileNotFoundError:
        print("Error: Faltan archivos.")
        return

    # Split the stacked frame back into its labelled and unlabelled parts.
    labelled = combined.loc[combined['is_train'] == 1].drop(columns=['is_train'])
    holdout = combined.loc[combined['is_train'] == 0].drop(
        columns=['is_train', target_name])

    features = labelled.drop(columns=target_name)
    labels = labelled[target_name]

    # Fit on the labelled rows.
    regressor = get_aggressive_stacking()
    print("--- 3. Entrenando Stacking Final ---")
    regressor.fit(features, labels)

    # Predict, then match the prediction spread to the training target.
    calibrated = post_process_variance(regressor.predict(holdout), labels)

    # NOTE(review): 'id' is the row position in the stacked train+test frame,
    # so it starts at the number of train rows -- confirm this is the id the
    # submission format expects.
    pd.DataFrame({'id': holdout.index, 'DBWT': calibrated}).to_csv(
        'final_calibrated_submission.csv', index=False)
    print("¡Hecho! Generado 'final_calibrated_submission.csv'")

    # Short summary of the calibrated predictions.
    print("\n--- Estadísticas Finales ---")
    summary = (
        ("Min", calibrated.min()),
        ("Max", calibrated.max()),
        ("Std", np.std(calibrated)),
    )
    for label, value in summary:
        print(f"{label}: {value:.2f}")

# Entry-point guard: run the pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

--- 1. Carga y Enriquecimiento de Datos (Focus: Extremos) ---
--- 2. Configurando Stacking 'Agresivo' ---
--- 3. Entrenando Stacking Final ---
--- 4. Post-Procesamiento: Calibración de Varianza ---
   Original Pred Std: 367.98 | Target Train Std: 579.88
   Predicciones re-escaladas (Factor: 1.5758)
¡Hecho! Generado 'final_calibrated_submission.csv'

--- Estadísticas Finales ---
Min: 505.73
Max: 4735.06
Std: 579.88


In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings

# Silence every warning for the rest of the session. Note that this is a
# blanket filter: it also hides deprecation/convergence warnings from the
# modelling libraries used below.
warnings.filterwarnings('ignore')
# Random seed passed to each base learner (XGBoost, LightGBM, CatBoost).
SEED = 42

# --- Same loading logic as the previous cell ---
def load_and_enrich_data(train_path, test_path):
    """Load the train/test CSVs, stack them and add engineered features.

    Both files are read with ``pd.read_csv``, tagged with an ``is_train``
    column (1 for train rows, 0 for test rows) and concatenated into one
    frame with a fresh 0..n-1 index, so train rows come first.

    Engineered columns (each only when its source column exists):
      * ``MULTIPLE_BIRTH`` -- 1 when ``DPLURAL > 1``, else 0.
      * ``IS_PREMATURE``   -- 1 when ``GESTREC3 == 1``, else 0.
      * ``IS_MALE``        -- 1 when ``SEX == 'M'``, else 0.
      * ``TOTAL_CIGS``     -- row sum of ``CIG_0..CIG_3`` (needs all four),
        NaN when every component is missing.
      * ``RISK_SCORE``     -- row sum of the binarised ``RF_*`` columns.

    Args:
        train_path: Path of the training CSV (expected to hold ``DBWT``).
        test_path: Path of the test CSV.

    Returns:
        ``(df, target)``: the combined, imputed frame and the target
        column name (``'DBWT'``).

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file is
            missing.
    """
    print("--- 1. Carga y Enriquecimiento (Fase Semi-Supervisada) ---")
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    target = 'DBWT'

    train['is_train'] = 1
    test['is_train'] = 0

    # Align columns: any non-target train column absent from test is created
    # there and filled with 0.
    missing_cols = set(train.columns) - set(test.columns) - {target}
    for c in missing_cols:
        test[c] = 0

    df = pd.concat([train, test], axis=0, ignore_index=True)

    # Extreme-weight indicators. The comparisons are vectorised; NaN compares
    # False and therefore maps to 0, exactly as the row-wise version did. The
    # dtype is pinned to int64 so the dtype-based selections further down
    # still include the columns.
    if 'DPLURAL' in df.columns:
        df['MULTIPLE_BIRTH'] = (df['DPLURAL'] > 1).astype('int64')

    if 'GESTREC3' in df.columns:
        df['IS_PREMATURE'] = (df['GESTREC3'] == 1).astype('int64')

    if 'SEX' in df.columns:
        df['IS_MALE'] = (df['SEX'] == 'M').astype('int64')

    # Cleaning: 99 / 999 are treated as "unknown" sentinels in the cigarette
    # counts.
    cig_cols = ['CIG_0', 'CIG_1', 'CIG_2', 'CIG_3']
    for col in cig_cols:
        if col in df.columns:
            df[col] = df[col].replace([99, 999], np.nan)

    if set(cig_cols).issubset(df.columns):
        # min_count=1 keeps the total NaN when all four components are NaN.
        df['TOTAL_CIGS'] = df[cig_cols].sum(axis=1, min_count=1)

    # Risk-factor flags -> 0/1, then summed into a count.
    rf_cols = [c for c in df.columns if c.startswith('RF_')]
    if rf_cols:
        for col in rf_cols:
            as_text = df[col].astype(str).str.upper()
            # A numeric flag stored as float stringifies to '1.0', which the
            # text test alone would silently encode as 0; compare the numeric
            # value as well.
            as_number = pd.to_numeric(df[col], errors='coerce')
            is_set = as_text.isin(['Y', '1', 'YES']) | (as_number == 1)
            df[col] = is_set.astype('int64')
        df['RISK_SCORE'] = df[rf_cols].sum(axis=1)

    # Imputation: column medians for numeric data, 'Missing' for categoricals.
    # NOTE(review): medians are computed over train and test rows together,
    # and a numeric target column is imputed too (test rows receive the
    # median target) -- confirm that callers always discard it for test rows.
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    df[cat_cols] = df[cat_cols].fillna('Missing')

    return df, target

def get_model():
    """Build a fresh, unfitted stacking regressor.

    Structure, outermost first: ``TransformedTargetRegressor`` (fits on
    ``log1p(y)``, predicts through ``expm1``) -> ``Pipeline`` (dtype-based
    preprocessing, then the stack) -> ``StackingRegressor`` (XGBoost,
    LightGBM and CatBoost blended by ``RidgeCV`` over 5-fold out-of-fold
    predictions).

    Returns:
        The configured ``TransformedTargetRegressor``; call ``fit`` with a
        pandas DataFrame so the dtype-based column selection works.
    """
    # Imported here so the block is self-contained with respect to the
    # file's import section.
    from sklearn.compose import make_column_selector

    # Robust scaling for int64/float64 columns, one-hot for object/category
    # columns; any other dtype is dropped (ColumnTransformer default).
    # make_column_selector picks the same columns the former lambda selectors
    # did, but keeps the fitted model picklable.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', RobustScaler(),
             make_column_selector(dtype_include=['int64', 'float64'])),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False),
             make_column_selector(dtype_include=['object', 'category'])),
        ])

    # Base learners share the same tree count, learning rate and seed.
    xgb = XGBRegressor(
        n_estimators=1500, learning_rate=0.03, max_depth=7,
        min_child_weight=1, reg_lambda=1, n_jobs=-1, random_state=SEED,
    )
    lgbm = LGBMRegressor(
        n_estimators=1500, learning_rate=0.03, num_leaves=40,
        min_child_samples=10, n_jobs=-1, random_state=SEED, verbose=-1,
    )
    cat = CatBoostRegressor(
        iterations=1500, learning_rate=0.03, depth=7, l2_leaf_reg=1,
        verbose=0, random_seed=SEED, allow_writing_files=False,
    )
    # RidgeCV chooses its alpha among the listed candidates.
    meta = RidgeCV(alphas=[0.1, 1.0, 10.0])

    stacking = StackingRegressor(
        estimators=[('xgb', xgb), ('lgbm', lgbm), ('cat', cat)],
        final_estimator=meta,
        cv=5, n_jobs=-1
    )

    pipeline = Pipeline([('preprocessor', preprocessor), ('model', stacking)])
    return TransformedTargetRegressor(regressor=pipeline, func=np.log1p, inverse_func=np.expm1)

def _rescale_to_train_moments(values, mu_train, sigma_train):
    """Shift and stretch *values* so their mean/std equal the given ones."""
    values = np.asarray(values, dtype=float)
    sigma = np.std(values)
    # Constant (or non-finite) input has no spread to stretch; dividing by it
    # would turn every value into NaN, so only re-centre in that case.
    scale = sigma_train / sigma if np.isfinite(sigma) and sigma > 0 else 1.0
    return (values - np.mean(values)) * scale + mu_train


def main():
    """Run the two-stage pseudo-labelling pipeline and write the submission.

    Stage 1 fits the stacking model on the real training rows and predicts
    the test rows. Those predictions are calibrated to the training target's
    mean/std and used as pseudo-labels. Stage 2 refits a fresh model on
    train + pseudo-labelled test rows, predicts the test rows again,
    calibrates once more and writes ``pseudo_labeling_submission.csv``.
    Returns early with a message when an input file is missing.
    """
    TRAIN_FILE = 'train.csv'
    TEST_FILE = 'test.csv'
    # Lower bound for any weight used as a label or written to the submission.
    min_weight = 100

    # 1. Initial load
    try:
        df, target_col = load_and_enrich_data(TRAIN_FILE, TEST_FILE)
    except FileNotFoundError:
        # Report the problem instead of exiting silently.
        print("Error: Faltan archivos.")
        return

    train_df = df[df['is_train'] == 1].drop(['is_train'], axis=1)
    test_df = df[df['is_train'] == 0].drop(['is_train', target_col], axis=1)

    X = train_df.drop(target_col, axis=1)
    y = train_df[target_col]

    # 2. Stage 1: fit on the real labels only
    print("--- STAGE 1: Entrenamiento Inicial ---")
    model = get_model()
    model.fit(X, y)

    # 3. Pseudo-labels for the test rows
    print("--- Generando Pseudo-Etiquetas para el Test Set ---")
    pseudo_preds = model.predict(test_df)

    # Calibrate the pseudo-labels against the real training target before
    # they are used for training.
    mu_train, sigma_train = np.mean(y), np.std(y)
    pseudo_preds_calibrated = _rescale_to_train_moments(pseudo_preds, mu_train, sigma_train)
    # The model trains on log1p(y); stretching could otherwise push a
    # pseudo-label to -1 or below, where log1p is undefined.
    pseudo_preds_calibrated = np.maximum(pseudo_preds_calibrated, min_weight)

    # 4. Augmented dataset: real train rows followed by pseudo-labelled test
    # rows (their indices are disjoint because both come from the same
    # stacked frame).
    print("--- STAGE 2: Re-Entrenamiento con Pseudo-Labeling ---")
    pseudo_y = pd.Series(pseudo_preds_calibrated, index=test_df.index, name=target_col)
    X_aug = pd.concat([X, test_df], axis=0)
    y_aug = pd.concat([y, pseudo_y], axis=0)

    # 5. Stage 2: refit a brand-new model on the augmented data
    final_model = get_model()
    final_model.fit(X_aug, y_aug)

    # 6. Final prediction
    final_preds = final_model.predict(test_df)

    # Final calibration uses the moments of the real training target, not
    # those of the augmented target.
    final_preds_calibrated = _rescale_to_train_moments(final_preds, mu_train, sigma_train)
    final_preds_calibrated = np.maximum(final_preds_calibrated, min_weight)

    # NOTE(review): 'id' is the row position in the stacked train+test frame,
    # so it starts at the number of train rows -- confirm this is the id the
    # submission format expects.
    submission = pd.DataFrame({'id': test_df.index, 'DBWT': final_preds_calibrated})
    submission.to_csv('pseudo_labeling_submission.csv', index=False)
    print("¡Generado 'pseudo_labeling_submission.csv'!")
    print(f"Stats Finales -> Min: {final_preds_calibrated.min():.2f}, Max: {final_preds_calibrated.max():.2f}, Std: {np.std(final_preds_calibrated):.2f}")

# Entry-point guard: run the pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

--- 1. Carga y Enriquecimiento (Fase Semi-Supervisada) ---
--- STAGE 1: Entrenamiento Inicial ---
--- Generando Pseudo-Etiquetas para el Test Set ---
--- STAGE 2: Re-Entrenamiento con Pseudo-Labeling ---
¡Generado 'pseudo_labeling_submission.csv'!
Stats Finales -> Min: 437.70, Max: 4993.67, Std: 579.88
