# Starter notebook
Copy (fork) and edit as many copies of this notebook as you require 

In [None]:
# venv\Scripts\activate

In [None]:
# Data handling: show every column, but cap the number of rows and the cell width that get printed
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 36)
pd.set_option("display.max_colwidth", 72)

# Single seed value; applied here to NumPy's global random generator
seed = 42
import numpy as np
np.random.seed(seed)

# graphics
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 20})
plt.rcParams["figure.figsize"] = (11, 6.8)
#plt.style.use('fivethirtyeight')

import seaborn as sns
sns.set(font_scale=1)
#sns.set_style("whitegrid")

import plotly.io as pio
# for use in JupyterLab 4
pio.renderers.default = 'iframe'
# for use in Google Colab
#pio.renderers.default = 'colab'
import plotly as py
import plotly.express as px

## Read in the training data

In [None]:
# Load the training data and show it as the cell output
train = pd.read_csv("train.csv")
train

## This is the test data that you are asked to make predictions for

In [None]:
# Load the competition test data (the rows to predict) and show it as the cell output
test = pd.read_csv("test.csv")
test

In [None]:
# Structural cleaning before splitting
def structural_cleaning(df, is_train=True):
    """Replace sentinel "unknown" codes with NaN and drop rows with an unknown target.

    Only value-level recoding happens here (no statistics are learned), so it
    is safe to apply before the train/validation/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is never modified; a cleaned copy is returned.
    is_train : bool, default True
        When True and a 'DBWT' column is present, rows whose DBWT equals the
        sentinel 9999 are removed.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``. Columns that are absent are silently skipped.
    """
    # A. Filter target (only train).
    # Filter first and take ONE explicit copy afterwards: copying and then
    # boolean-indexing leaves pandas (< 3) treating the result as a slice, so
    # every later column assignment emits a SettingWithCopyWarning.
    if is_train and 'DBWT' in df.columns:
        df = df.loc[df['DBWT'] != 9999]
    df = df.copy()

    # B. Null codes: each group of columns shares one sentinel that is mapped to NaN
    unknown_codes = (
        (('CIG_0', 'FAGECOMB', 'M_Ht_In', 'PREVIS'), 99),
        (('FEDUC', 'MEDUC', 'BFACIL', 'ATTEND'), 9),
        (('BMI',), 99.9),
    )
    for columns, sentinel in unknown_codes:
        for col in columns:
            if col in df.columns:
                df[col] = df[col].replace(sentinel, np.nan)

    # C. Structural nullity: 888 is recoded to 0, while 999 is treated as missing
    for col in ('ILLB_R', 'ILOP_R', 'ILP_R'):
        if col in df.columns:
            df[col] = df[col].replace(888, 0).replace(999, np.nan)

    return df

In [None]:
# Apply structural cleaning to both frames.
# is_train=True additionally drops rows whose DBWT is 9999; the test frame is only recoded.
train_cleaned = structural_cleaning(train, is_train=True)
test_cleaned = structural_cleaning(test, is_train=False)

The data is split into three sets:
1. Training set: used by the model to learn its parameters.
2. Validation set: used to select the best model and tune hyperparameters, estimating the generalization error during development.
3. Test set: untouched until the end. It gives an unbiased estimate of how the model performs in real-world use.

In [None]:
# The target variable is 'Delivery Birth Weight' (DBWT)
from sklearn.model_selection import train_test_split

# Separate the target (y) from the features (X).
# train_cleaned itself is left intact (no in-place pop), so this cell can be
# re-run without raising a KeyError on the second execution.
y = train_cleaned['DBWT']
X = train_cleaned.drop(columns='DBWT')

# 80/20 ratio
# Hold out 20% of the data for final testing; `seed` is the notebook-wide seed
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

# From the remaining 80%, split again into training and validation sets (75/25 ratio),
# which yields roughly 60% / 20% / 20% of the original rows
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=seed)

print(f"Train shape: {X_train.shape} (aprox 60%)")
print(f"Val shape:   {X_val.shape}   (aprox 20%)")
print(f"Test shape:  {X_test.shape}  (aprox 20%)")

Implementing a baseline means that every model submitted after this must beat this result, no matter what.

In [None]:
from sklearn.metrics import root_mean_squared_error

# Define the prediction: a single constant, the mean of the training target
baseline_pred = y_train.mean()
print(f"Baseline value: {baseline_pred:.2f}")

# Create a dummy prediction vector for validation: the same constant repeated once per validation row
y_val_pred = np.full(len(y_val), baseline_pred)

# Evaluate RMSE of the constant prediction against the validation target
rmse = root_mean_squared_error(y_val, y_val_pred)
print(f"Baseline RMSE on validation set: {rmse:.2f}")

Statistical imputation is the process of replacing missing data points in a dataset with plausible substitute values to maintain data integrity. I did the splitting before filling in data to avoid data leakage.

Using the information in `UserGuide2018-508`, shared in the Kaggle description, I did a full data cleanup so that a standard scaler could be tried.

In [None]:
# Columns that do not give predictive value; process_dataset drops them
drop_cols = ['id', 'DLMP_MM', 'DOB_TT'] # Exact dates that give noise

# Numerical columns -> Candidates for median + scaling
# (median imputation in learn_imputation_stats, StandardScaler in process_dataset)
# Education has order: We try as a numerical
num_cols = [
    'BMI', 'CIG_0', 'FAGECOMB', 'ILLB_R', 'ILOP_R', 'ILP_R', 'MAGER',
    'M_Ht_In', 'PRECARE', 'PREVIS', 'PRIORDEAD', 'PRIORLIVE', 'PRIORTERM',
    'PWgt_R', 'RF_CESARN', 'WTGAIN', 'FEDUC', 'MEDUC'
]

# Categorical columns -> Candidates for mode + one-hot encoding
# (mode imputation in learn_imputation_stats, get_dummies in process_dataset)
# Variables treated as unordered categories
cat_cols = [
    'ATTEND', 'BFACIL', 'DMAR', 'DOB_MM', 'DOB_WK', 'LD_INDL',
    'MBSTATE_REC', 'PAY', 'PAY_REC', 'RDMETH_REC', 'RESTATUS',
    'RF_CESAR', 'SEX', 'NO_INFEC', 'NO_MMORB', 'NO_RISKS'
]

In [None]:
# Training stats learning
def learn_imputation_stats(df):
    """Learn one fill value per column from ``df``.

    Numerical columns (``num_cols``) get their median, categorical columns
    (``cat_cols``) their most frequent value. Columns missing from ``df`` are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to learn from.

    Returns
    -------
    dict
        Maps column name -> fill value. A categorical column with no
        non-missing value has no mode and is left out of the dict instead of
        raising on the empty result.
    """
    stats = {}

    # A. Numerical -> Median
    for col in num_cols:
        if col in df.columns:
            stats[col] = df[col].median()

    # B. Categorical -> Mode
    for col in cat_cols:
        if col in df.columns:
            # dropna=True to avoid NaN in mode calculation
            modes = df[col].mode(dropna=True)
            # mode() returns an empty Series for an all-NaN column; indexing
            # it would raise, so such a column simply gets no fill value
            if not modes.empty:
                stats[col] = modes.iloc[0]

    return stats
# Fill values are learned from the training split only (X_train)
imputation_stats = learn_imputation_stats(X_train)

Step B (missing indicators) is especially important for FAGECOMB: a missing father's age (no father recorded) is itself predictive.

In [None]:
from sklearn.preprocessing import StandardScaler

def process_dataset(df, stats, scaler=None, is_train=False):
    """Turn a cleaned split into a fully numeric, model-ready feature matrix.

    Steps, in order: drop ``drop_cols``, flag missing FAGECOMB, impute with
    ``stats``, derive engineered features, one-hot encode ``cat_cols`` and
    standardise ``num_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. It is copied, never modified.
    stats : dict
        Column name -> fill value (see ``learn_imputation_stats``).
    scaler : StandardScaler, optional
        Already-fitted scaler, used only when ``is_train`` is False. When it
        is None the numeric columns are returned unscaled.
    is_train : bool, default False
        When True a new StandardScaler is fitted on this frame and returned.

    Returns
    -------
    (pandas.DataFrame, StandardScaler) when ``is_train`` is True, otherwise
    just the pandas.DataFrame.
    """
    df = df.copy()

    # A. Drop useless columns
    df = df.drop([c for c in drop_cols if c in df.columns], axis=1)

    # B. Missing indicators (must run before imputation erases the NaNs)
    cols_with_heavy_nan = ['FAGECOMB']
    for col in cols_with_heavy_nan:
        if col in df.columns:
            # Creates a binary column: 1 if missing, 0 if not
            df[f'{col}_is_missing'] = df[col].isna().astype(int)

    # C. Imputation (fill in the blanks)
    for col, value in stats.items():
        if col in df.columns:
            df[col] = df[col].fillna(value)

    # Security check: if any NaN remains
    df = df.fillna(0)

    # D. Feature engineering.
    # Done AFTER imputation: computed earlier, a NaN input produced a NaN
    # feature that the safety net above turned into 0 (e.g. a total weight of
    # 0), inconsistent with the imputed inputs themselves.
    # 1. Mother's total weight (pre-pregnancy weight + weight gain)
    if 'PWgt_R' in df.columns and 'WTGAIN' in df.columns:
        df['Mother_Total_Weight'] = df['PWgt_R'] + df['WTGAIN']

    # 2. Smoking intensity (interaction of cigarettes and mother's age)
    if 'CIG_0' in df.columns and 'MAGER' in df.columns:
        df['Smoking_Intensity'] = df['CIG_0'] * df['MAGER']

    # E. One-hot encoding for categorical variables to dummy
    # Change to string first to secure numerical codes are treated as categories.
    # Whole-number float codes are cast to int first: a column that held NaN is
    # float ('2.0') while the same column in a frame without NaN is int ('2'),
    # which would give different dummy names for the same category.
    current_cat_cols = [c for c in cat_cols if c in df.columns]
    for col in current_cat_cols:
        codes = df[col]
        if pd.api.types.is_float_dtype(codes) and (codes % 1 == 0).all():
            codes = codes.astype('int64')
        df[col] = codes.astype(str)

    # drop_first avoids collinearity.
    # NOTE(review): the dropped level is the first one present in THIS frame;
    # a split lacking the level dropped in train would drop a different one.
    # Verify every split contains each column's first level.
    df = pd.get_dummies(df, columns=current_cat_cols, drop_first=True)

    # F. Scaling (StandardScaler)
    current_num_cols = [c for c in num_cols if c in df.columns]

    if is_train:
        scaler = StandardScaler()
        df[current_num_cols] = scaler.fit_transform(df[current_num_cols])
        return df, scaler

    # Val/Test: reuse the scaler already fitted on train (NOT re-fit)
    if scaler is not None:
        df[current_num_cols] = scaler.transform(df[current_num_cols])
    return df


Pipeline execution
1. Process TRAIN & obtain adjusted scaler
2. Process VALIDATION using stats & Train scaler
3. Process TEST
4. PROCESS TEST for Submission

In [None]:
# 1. Train split: is_train=True also fits and returns the StandardScaler
X_train_ready, scaler_fitted = process_dataset(X_train, imputation_stats, is_train=True)
# 2. Validation split: reuses the fill values and the scaler fitted in step 1
X_val_ready = process_dataset(X_val, imputation_stats, scaler=scaler_fitted, is_train=False)
# 3. Internal test split: same treatment as the validation split
X_test_ready = process_dataset(X_test, imputation_stats, scaler=scaler_fitted, is_train=False)

One-hot encoding in step **D** can produce different columns when a split is missing some rare category, so I force every split to have EXACTLY the columns of `X_train_ready`:
`expected_cols = X_train_ready.columns`

In [None]:
# Reference column set and order, taken from the processed training split
expected_cols = X_train_ready.columns

def align_columns(df, target_cols):
    """Return a copy of ``df`` that has exactly ``target_cols``, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to align. It is not modified (adding the missing columns in
        place would silently change the caller's frame as a side effect).
    target_cols : sequence of column labels
        Desired columns, e.g. the columns of the processed training frame.

    Returns
    -------
    pandas.DataFrame
        Columns missing from ``df`` are added filled with 0, columns that are
        not in ``target_cols`` are dropped, and the rest are reordered to
        match ``target_cols`` position by position.
    """
    # reindex performs all three steps (add, drop, reorder) in one pass
    return df.reindex(columns=target_cols, fill_value=0)

# Force the validation and internal test matrices onto the training column layout
X_val_ready = align_columns(X_val_ready, expected_cols)
X_test_ready = align_columns(X_test_ready, expected_cols)

# All three shapes should report the same number of columns
print(f"Train final shape: {X_train_ready.shape}")
print(f"Val final shape:   {X_val_ready.shape}")
print(f"Test final shape:  {X_test_ready.shape}")

In [None]:
from sklearn.linear_model import LinearRegression

# Train
model = LinearRegression()
model.fit(X_train_ready, y_train)

# Predict and evaluate on the validation split
preds = model.predict(X_val_ready)
rmse = root_mean_squared_error(y_val, preds)

print(f"Linear Regression RMSE: {rmse:.2f}")

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from scipy.stats import randint, uniform

# 1. Define the search space (hyperparameter space)
# These are the "knobs to twiddle" mentioned in Ch. 10
# Note: scipy's uniform(loc, scale) samples from [loc, loc + scale],
# e.g. uniform(0.6, 0.4) covers 0.6 to 1.0
param_dist = {
    'n_estimators': randint(500, 3000),      # Number of trees (Section 10.4)
    'learning_rate': uniform(0.01, 0.2),     # Learning rate (Section 10.5)
    'max_depth': randint(3, 10),             # Tree depth (Section 10.3)
    'subsample': uniform(0.6, 0.4),          # Avoid overfitting by sampling rows
    'colsample_bytree': uniform(0.6, 0.4),   # Avoid overfitting by sampling columns
    'reg_alpha': uniform(0, 10),             # L1 regularization (Section 7.13.1)
    'reg_lambda': uniform(0, 10)             # L2 regularization
}

# 2. Initialize the base regressor
xgb_model = xgb.XGBRegressor(
    n_jobs=-1,
    random_state=42,
    objective='reg:squarederror'
)

# 3. Configure the randomized search
# n_iter=50 means it will try 50 different combinations
# NOTE(review): both the estimator and the search use n_jobs=-1; confirm this
# nested parallelism does not oversubscribe the machine
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_root_mean_squared_error', # Kaggle uses RMSE
    cv=3,                                  # 3-fold cross-validation (Section 5.2)
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# 4. Train (this searches for the best model it can find)
print("Buscando los mejores hiperparámetros (paciencia)...")
random_search.fit(X_train_ready, y_train)

# 5. Results
best_model = random_search.best_estimator_
print(f"\nMejores parámetros encontrados: {random_search.best_params_}")
# best_score_ is negated below, so the printed number is the positive CV RMSE
print(f"Mejor Score (RMSE CV negativo): {-random_search.best_score_:.2f}")

# 6. Final check on the hold-out validation set
preds_val = best_model.predict(X_val_ready)
rmse_val = root_mean_squared_error(y_val, preds_val)
print(f"RMSE Final en Validación: {rmse_val:.2f}")

In [None]:
# Process the true Kaggle test set with the train-derived fill values and scaler,
# then align it to the training column layout
test_kaggle_ready = process_dataset(test_cleaned, imputation_stats, scaler=scaler_fitted, is_train=False)
test_kaggle_ready = align_columns(test_kaggle_ready, expected_cols)

print(f"Test Kaggle shape: {test_kaggle_ready.shape}") # Should be (2000, 74)

In [None]:
# [NUEVA CELDA PARA ENSEMBLING Y SUBMISSION - VERSIÓN XGBOOST MODERNO]
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

print("--- 1. ENTRENANDO XGBOOST ---")

# FIX: 'early_stopping_rounds' is now set when the model is created
model_xgb = xgb.XGBRegressor(
    n_estimators=1500,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    n_jobs=-1,
    random_state=42,
    objective='reg:squarederror',
    early_stopping_rounds=50  # <--- this is where it must go in newer versions
)

# early_stopping_rounds is no longer passed to .fit(), but eval_set is kept
# so the model has data against which to measure when to stop.
model_xgb.fit(
    X_train_ready, y_train,
    eval_set=[(X_val_ready, y_val)],
    verbose=False
)

print(f"XGBoost entrenado. Mejor iteración: {model_xgb.best_iteration}")


print("\n--- 2. ENTRENANDO LIGHTGBM ---")
model_lgb = lgb.LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.02,
    num_leaves=31,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=1,
    n_jobs=-1,
    random_state=42,
    verbose=-1
)

# LightGBM uses callbacks (this is unchanged, it is correct)
callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False)]

model_lgb.fit(
    X_train_ready, y_train,
    eval_set=[(X_val_ready, y_val)],
    eval_metric='rmse',
    callbacks=callbacks
)
print(f"LightGBM entrenado. Mejor iteración: {model_lgb.best_iteration_}")


print("\n--- 3. EVALUACIÓN INTERNA (BLENDING) ---")
# Predict (the best iteration is used automatically thanks to early stopping)
# NOTE(review): X_val_ready / y_val also served as the early-stopping eval_set
# above, so the RMSE values below are measured on data that influenced training
# length; the untouched X_test_ready / y_test split is not used here
val_preds_xgb = model_xgb.predict(X_val_ready)
val_preds_lgb = model_lgb.predict(X_val_ready)

# SIMPLE AVERAGE
val_preds_ensemble = (val_preds_xgb * 0.5) + (val_preds_lgb * 0.5)

rmse_xgb = np.sqrt(mean_squared_error(y_val, val_preds_xgb))
rmse_lgb = np.sqrt(mean_squared_error(y_val, val_preds_lgb))
rmse_ens = np.sqrt(mean_squared_error(y_val, val_preds_ensemble))

print(f"RMSE XGBoost:  {rmse_xgb:.2f}")
print(f"RMSE LightGBM: {rmse_lgb:.2f}")
print(f"RMSE ENSAMBLE: {rmse_ens:.2f} (¡Tu referencia real!)")


print("\n--- 4. GENERANDO SUBMISSION PARA KAGGLE ---")
# Final predictions on the competition test set
kaggle_preds_xgb = model_xgb.predict(test_kaggle_ready)
kaggle_preds_lgb = model_lgb.predict(test_kaggle_ready)

# Final blending: equal-weight average of both models
final_predictions = (kaggle_preds_xgb * 0.5) + (kaggle_preds_lgb * 0.5)

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

print("--- CONFIGURANDO EL APILAMIENTO (STACKING) ---")

# 1. Define the base models (base learners)
# A robust configuration is used for each one
estimators = [
    ('xgb', xgb.XGBRegressor(
        n_estimators=2000,
        learning_rate=0.01,  # Slower = more precision
        max_depth=6,
        subsample=0.7,
        colsample_bytree=0.7,
        n_jobs=-1,
        random_state=42,
        objective='reg:squarederror',
        # Note: StackingRegressor handles fit internally, so early stopping is not used here
        # to keep compatibility simple; the low learning_rate compensates.
    )),
    ('lgb', lgb.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.01,
        num_leaves=31,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=1,
        n_jobs=-1,
        random_state=42,
        verbose=-1
    )),
    ('cat', CatBoostRegressor(
        iterations=2000,
        learning_rate=0.01,
        depth=6,
        l2_leaf_reg=3,       # CatBoost-specific regularization
        verbose=0,           # Silent
        random_state=42
    ))
]

# 2. Define the meta-model (final estimator)
# RidgeCV is a linear regression with built-in L2 regularization (Section 7.13.1)
# It is well suited to combining predictions because it handles collinearity.
meta_model = RidgeCV()

# 3. Build the stacking regressor
# cv=5 ensures the intermediate predictions are generated with cross-validation (Section 9.8)
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
    passthrough=False # False = the meta-model only sees the predictions, not the original data
)

print("Entrenando Stacking Regressor (esto tardará un poco)...")
# Fit on the training split only (X_train_ready); the validation rows stay
# held out for the evaluation below
stacking_model.fit(X_train_ready, y_train)

print("¡Entrenamiento completado!")

# --- EVALUATION ---
print("\n--- EVALUACIÓN EN VALIDACIÓN ---")
val_preds = stacking_model.predict(X_val_ready)
rmse_val = np.sqrt(mean_squared_error(y_val, val_preds))
print(f"RMSE Stacking (Train/Val split): {rmse_val:.2f}")

# --- SUBMISSION GENERATION ---
print("\n--- GENERANDO SUBMISSION FINAL ---")
kaggle_preds = stacking_model.predict(test_kaggle_ready)

In [None]:
# Competition-test predictions from the linear regression (`model`) and from the
# best estimator found by the randomized search (`best_model`)
kaggle_predictions = model.predict(test_kaggle_ready)
kaggle_predictions_xgb = best_model.predict(test_kaggle_ready)

## Submit your predictions in a `submission.csv` file for scoring on the [leaderboard](https://www.kaggle.com/competitions/u-tad-birth-weight-point-prediction-2025/leaderboard)
To submit your notebook click on **Submit to competition** and then **Submit**.

In [None]:
# do not modify this code
submission = pd.read_csv("sample_submission.csv")
submission["DBWT"] = final_predictions
submission.to_csv('submission.csv',index=False)

In [None]:
# Preview the first rows of the submission as a quick sanity check
submission.head()