# In [None]:
# pipeline.py
# ============================================================
# Santander Customer Satisfaction - End-to-End ML Pipeline
# ============================================================

import argparse
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# ------------------------------------------------------------
# 1. Data Loading
# ------------------------------------------------------------
def load_data(train_path, test_path):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Source of the training CSV, handed to ``pd.read_csv``.
        test_path: Source of the test CSV, handed to ``pd.read_csv``.

    Returns:
        A ``(train, test)`` tuple of DataFrames, read in that order.
    """
    train_frame, test_frame = (
        pd.read_csv(source) for source in (train_path, test_path)
    )
    return train_frame, test_frame


# ------------------------------------------------------------
# 2. Basic Feature Cleaning
# ------------------------------------------------------------
def basic_cleaning(train, test):
    """Drop uninformative feature columns from both frames, in place.

    Two passes are made over the columns of ``train`` (``TARGET`` is never
    considered for removal):

    1. columns whose variance in ``train`` is exactly zero;
    2. of the remaining columns, those whose 99th percentile in ``train``
       is exactly zero.

    Every column selected from ``train`` is dropped from ``train`` and
    ``test`` alike.

    Args:
        train: Training DataFrame; modified in place.
        test: Test DataFrame; modified in place.

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """

    def _drop_where(is_useless):
        # Column selection is always driven by the training data.
        doomed = [
            name for name in train.columns
            if name != "TARGET" and is_useless(train[name])
        ]
        for frame in (train, test):
            frame.drop(columns=doomed, inplace=True)

    # Pass 1: zero-variance features.
    _drop_where(lambda column: column.var() == 0)
    # Pass 2: sparse features (99% quantile == 0).
    _drop_where(lambda column: np.percentile(column, 99) == 0)

    return train, test


# ------------------------------------------------------------
# 3. Domain Feature Engineering
# ------------------------------------------------------------
def _log1p_positive(df, columns):
    """Apply ``log1p`` in place to the strictly positive entries of *columns*.

    Non-positive and missing entries are left untouched. A column with no
    positive entry is skipped entirely, so its dtype is preserved.
    """
    for col in columns:
        values = df[col]
        positive = values > 0
        if not positive.any():
            continue
        # Cast up front: writing floats into an integer column through
        # ``.loc`` is a deprecated implicit upcast in recent pandas.
        if not pd.api.types.is_float_dtype(values):
            values = values.astype(float)
        # clip() only keeps log1p away from values <= -1; those positions
        # are not selected by the mask, so the clipped result is discarded.
        df[col] = values.mask(positive, np.log1p(values.clip(lower=0)))


def feature_engineering(train, test):
    """Add domain-specific features to both frames, in place.

    Steps, applied to ``train`` and ``test`` alike:

    * ``var3``: the sentinel ``-999999`` is replaced by ``2``.
    * ``var15_below_23``: 1 where ``var15 < 23``, else 0.
    * ``var38_clipped`` / ``var38_log``: ``var38`` capped at the 97.5%
      quantile of ``train["var38"]``, and ``log1p`` of the capped value.
    * every column whose name contains ``"imp"`` or ``"saldo"`` has its
      strictly positive entries replaced by ``log1p`` of themselves
      (each column is transformed exactly once).
    * ``no_zeros`` / ``no_nonzeros``: per-row counts of zero and non-zero
      cells over all columns except ``ID`` and ``TARGET`` (including the
      features created above).

    Column lists and the clipping threshold are derived from ``train`` only.

    Args:
        train: Training DataFrame; modified in place.
        test: Test DataFrame; modified in place.

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    frames = (train, test)

    for df in frames:
        # Assign the result back instead of calling replace(inplace=True)
        # on the column selection: under pandas Copy-on-Write the in-place
        # form operates on a temporary and leaves the frame unchanged.
        df["var3"] = df["var3"].replace(-999999, 2)
        df["var15_below_23"] = (df["var15"] < 23).astype(int)

    # The clipping threshold comes from the training data only.
    q975 = np.quantile(train["var38"], 0.975)
    for df in frames:
        df["var38_clipped"] = df["var38"].clip(upper=q975)
        df["var38_log"] = np.log1p(df["var38_clipped"])

    # Substring match (not a strict prefix). Collect the columns in a single
    # pass so a name containing both tokens is not transformed twice.
    log_cols = [c for c in train.columns if "imp" in c or "saldo" in c]
    for df in frames:
        _log1p_positive(df, log_cols)

    # Freeze the column list before adding the count features so that
    # "no_zeros" is not itself counted when computing "no_nonzeros".
    feature_cols = [c for c in train.columns if c not in ["ID", "TARGET"]]
    for df in frames:
        df["no_zeros"] = (df[feature_cols] == 0).sum(axis=1)
        df["no_nonzeros"] = (df[feature_cols] != 0).sum(axis=1)

    return train, test


# ------------------------------------------------------------
# 4. Prepare Train / Test Matrix
# ------------------------------------------------------------
def prepare_matrix(train, test):
    """Split the frames into model inputs, labels and test identifiers.

    Neither input frame is modified; the feature matrices are new frames
    produced by ``DataFrame.drop``.

    Args:
        train: DataFrame containing ``ID`` and ``TARGET`` columns.
        test: DataFrame containing an ``ID`` column.

    Returns:
        ``(X, y, X_test, test_id)`` where ``X`` is ``train`` without ``ID``
        and ``TARGET``, ``y`` is the ``TARGET`` values array, ``X_test`` is
        ``test`` without ``ID`` and ``test_id`` is the ``ID`` values array
        of ``test``.
    """
    features = train.drop(["ID", "TARGET"], axis=1)
    labels = train["TARGET"].values
    test_features = test.drop("ID", axis=1)
    return features, labels, test_features, test["ID"].values


# ------------------------------------------------------------
# 5. Scaling
# ------------------------------------------------------------
def scale_features(X, X_test):
    """Standardize both feature matrices with statistics learned from ``X``.

    A ``StandardScaler`` is fitted on ``X`` only and then applied to ``X``
    and ``X_test``.

    Args:
        X: Training feature matrix used to fit the scaler.
        X_test: Test feature matrix, transformed with the fitted scaler.

    Returns:
        ``(X_scaled, X_test_scaled)`` as produced by the scaler's
        ``transform``.
    """
    fitted = StandardScaler().fit(X)
    return fitted.transform(X), fitted.transform(X_test)


# ------------------------------------------------------------
# 6. Model Training
# ------------------------------------------------------------
def train_model(X, y):
    """Fit an XGBoost classifier with early stopping on a hold-out split.

    15% of the data is held out (stratified on ``y``, ``random_state=42``)
    and used both as the early-stopping evaluation set and to report a
    validation AUC, which is printed.

    Args:
        X: Feature matrix.
        y: Binary target vector aligned with ``X``.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    import inspect  # local import: only needed for the version check below

    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.15, stratify=y, random_state=42
    )

    model = xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.5,
        gamma=5,
        reg_alpha=0.3,
        objective="binary:logistic",
        eval_metric="auc",
        random_state=42,
        n_jobs=-1,
    )

    # XGBoost deprecated passing ``early_stopping_rounds`` to ``fit()`` in
    # 1.6 and later removed it from the scikit-learn wrapper; newer releases
    # expect it as an estimator parameter. Use whichever form the installed
    # version supports so the call does not raise TypeError.
    fit_kwargs = {"eval_set": [(X_val, y_val)], "verbose": False}
    if "early_stopping_rounds" in inspect.signature(model.fit).parameters:
        fit_kwargs["early_stopping_rounds"] = 50
    else:
        model.set_params(early_stopping_rounds=50)

    model.fit(X_tr, y_tr, **fit_kwargs)

    val_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    print(f"‚úÖ Validation AUC: {auc:.4f}")

    return model


# ------------------------------------------------------------
# 7. Main Pipeline
# ------------------------------------------------------------
def main(args):
    """Run the whole pipeline: load, clean, engineer, scale, train, predict.

    Progress messages are printed before each stage, and the final
    predictions are written as a two-column CSV (``ID``, ``TARGET``).

    Args:
        args: Object exposing ``train`` and ``test`` (input CSV paths) and
            ``output`` (destination CSV path) attributes.
    """
    print("üöÄ Loading data...")
    raw_train, raw_test = load_data(args.train, args.test)

    print("üßπ Basic cleaning...")
    clean_train, clean_test = basic_cleaning(raw_train, raw_test)

    print("üß† Feature engineering...")
    feat_train, feat_test = feature_engineering(clean_train, clean_test)

    print("üìê Preparing matrices...")
    features, labels, test_features, test_ids = prepare_matrix(feat_train, feat_test)

    print("‚öñÔ∏è Scaling features...")
    scaled_features, scaled_test_features = scale_features(features, test_features)

    print("ü§ñ Training model...")
    classifier = train_model(scaled_features, labels)

    print("üì§ Predicting test set...")
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = classifier.predict_proba(scaled_test_features)[:, 1]

    pd.DataFrame({"ID": test_ids, "TARGET": positive_proba}).to_csv(
        args.output, index=False
    )

    print(f"‚úÖ Submission saved to {args.output}")


# ------------------------------------------------------------
# Entry
# ------------------------------------------------------------
if __name__ == "__main__":
    # Command-line interface: two required input paths and an optional
    # output path, then hand the parsed namespace to the pipeline.
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--train", "Path to train.csv"),
        ("--test", "Path to test.csv"),
    ):
        cli.add_argument(flag, type=str, required=True, help=description)
    cli.add_argument("--output", type=str, default="submission.csv")

    main(cli.parse_args())
