# Tabular Super-Ensemble: 4 CatBoost + Linear + 2 LightGBM + XGBoost

Этот ноутбук:
1. Читает табличные данные (`train.csv`, `test.csv`, `sample_submission.csv`)
2. Обучает **4 CatBoost** с разными гиперпараметрами
3. Делает сабмит из ансамбля этих 4 CatBoost
4. Обучает **одну линейную модель** (LinearRegression / LogisticRegression) и делает сабмит
5. Обучает **2 LightGBM**
6. Обучает **1 XGBoost**
7. Делает финальный сабмит из ансамбля **всех моделей**

In [None]:
# ============================
# CONFIG
# ============================
# Central configuration for the notebook: input/output locations, column
# names, and the task type that later cells branch on.
from pathlib import Path

DATA_DIR = Path(".")
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
SAMPLE_SUB_PATH = DATA_DIR / "sample_submission.csv"

ID_COL = "id"          # name of the ID column
TARGET_COL = "target"  # name of the target column in train

TASK_TYPE = "regression"   # "regression" or "binary"

# Later cells branch with `if TASK_TYPE == "regression": ... else: ...`, so a
# misspelled value would silently select the binary path. Fail fast instead.
_VALID_TASK_TYPES = ("regression", "binary")
if TASK_TYPE not in _VALID_TASK_TYPES:
    raise ValueError(
        f"TASK_TYPE must be one of {_VALID_TASK_TYPES}, got {TASK_TYPE!r}"
    )

# All submission files produced by this notebook are written here.
OUTPUT_DIR = DATA_DIR / "submissions_ensemble"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("Config OK")

In [None]:
# ============================
# IMPORTS & INSTALLS
# ============================
# Core dependencies, plus the three gradient-boosting libraries. Each boosting
# library is imported optimistically; on ImportError it is installed with pip
# and the import is retried.
# NOTE: the `!{sys.executable} -m pip ...` lines are IPython shell escapes, so
# this cell only runs inside a Jupyter/IPython kernel. Using `sys.executable`
# runs pip with the same interpreter that runs the kernel.
import sys
import numpy as np
import pandas as pd

# CatBoost: regressor/classifier estimators and the Pool data container
try:
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool
except ImportError:
    !{sys.executable} -m pip install -q catboost
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool

# LightGBM
try:
    import lightgbm as lgb
except ImportError:
    !{sys.executable} -m pip install -q lightgbm
    import lightgbm as lgb

# XGBoost
try:
    import xgboost as xgb
except ImportError:
    !{sys.executable} -m pip install -q xgboost
    import xgboost as xgb

# scikit-learn: label encoder for categorical columns and the linear baselines
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression

print("Imports OK")

In [None]:
# ============================
# LOAD DATA
# ============================
# Read the train and test tables. The sample submission is optional: when the
# file is absent, `sample_sub` is None and later cells build the submission
# frame from scratch.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

if SAMPLE_SUB_PATH.exists():
    sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
else:
    sample_sub = None

print("train shape:", train.shape)
print("test shape:", test.shape)
print("train columns:", train.columns.tolist())

In [None]:
# ============================
# BASIC CHECKS & FEATURE SPLIT
# ============================
# Verify the required columns exist, then derive the feature matrix columns
# and split them into categorical and numeric groups.
assert ID_COL in train.columns, f"{ID_COL} not in train"
assert ID_COL in test.columns, f"{ID_COL} not in test"
assert TARGET_COL in train.columns, f"{TARGET_COL} not in train"

# Shared columns: present in both train and test (train column order kept).
_test_columns = set(test.columns)
common_cols = [c for c in train.columns if c in _test_columns]

# Features are the shared columns minus the ID *and* the target. The target is
# excluded explicitly so that a test file which happens to carry a target
# column cannot leak the label into X_train.
_non_feature_cols = {ID_COL, TARGET_COL}
feature_cols = [c for c in common_cols if c not in _non_feature_cols]

X_train = train[feature_cols].copy()
X_test = test[feature_cols].copy()
y = train[TARGET_COL].copy()


def _is_categorical(series):
    """Return True if *series* has an object, pandas string, or category dtype."""
    dtype = series.dtype
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


# Comparing str(dtype) against ("object", "category") misses pandas string
# dtypes; such columns would then reach the models without label encoding.
cat_cols = [c for c in feature_cols if _is_categorical(train[c])]
num_cols = [c for c in feature_cols if c not in cat_cols]

print("n_features:", len(feature_cols))
print("n_num:", len(num_cols), "| n_cat:", len(cat_cols))

In [None]:
# ============================
# LABEL ENCODING FOR CATEGORICAL FEATURES
# ============================
# Replace every categorical column in X_train / X_test with integer codes.
# Each encoder is fitted on the union of train and test values so that
# categories seen only in test can still be transformed. The fitted encoders
# are kept in `encoders`, keyed by column name.
encoders = {}


def _to_label_strings(values):
    """Return *values* as strings, with missing entries mapped to "missing"."""
    # Fill BEFORE stringifying: astype(str) on an object column turns NaN into
    # the literal "nan", after which fillna("missing") has nothing to fill.
    # Casting to object first also avoids the error raised when a `category`
    # column is filled with a value outside its declared categories.
    as_object = values.astype(object)
    return as_object.where(as_object.notna(), "missing").astype(str)


for c in cat_cols:
    le = LabelEncoder()
    train_labels = _to_label_strings(X_train[c])
    test_labels = _to_label_strings(X_test[c])
    le.fit(pd.concat([train_labels, test_labels], axis=0))
    X_train[c] = le.transform(train_labels)
    X_test[c] = le.transform(test_labels)
    encoders[c] = le

print("Label encoding done")

In [None]:
# ============================
# CATBOOST POOLS
# ============================
# Wrap the train/test matrices in CatBoost Pool objects, marking the
# categorical columns by their positional index in X_train.
cat_idx = list(map(X_train.columns.get_loc, cat_cols))

# Pass None instead of an empty list when there are no categorical columns.
_cat_features = cat_idx or None

train_pool_cb = Pool(X_train, y, cat_features=_cat_features)
test_pool_cb = Pool(X_test, cat_features=_cat_features)

print("CatBoost pools ready")

In [None]:
# ============================
# 4 CATBOOST MODELS
# ============================
# Train four CatBoost models that differ in hyperparameters and seed, and keep
# both the fitted models and their test predictions keyed by "catboost_<n>".
cat_models, cat_preds_test = {}, {}

# Estimator class, loss function, and evaluation metric for the chosen task.
CatModel, loss, eval_metric = (
    (CatBoostRegressor, "RMSE", "RMSE")
    if TASK_TYPE == "regression"
    else (CatBoostClassifier, "Logloss", "AUC")
)

cat_params_list = [
    dict(
        iterations=1000,
        depth=6,
        learning_rate=0.05,
        l2_leaf_reg=3.0,
        bagging_temperature=0.5,
        random_strength=1.0,
    ),
    dict(
        iterations=2000,
        depth=8,
        learning_rate=0.03,
        l2_leaf_reg=3.0,
        bagging_temperature=0.8,
        random_strength=0.5,
    ),
    dict(
        iterations=3000,
        depth=10,
        learning_rate=0.02,
        l2_leaf_reg=4.0,
        bagging_temperature=1.0,
        random_strength=0.2,
    ),
    dict(
        iterations=1500,
        depth=7,
        learning_rate=0.04,
        l2_leaf_reg=2.0,
        bagging_temperature=0.3,
        random_strength=1.5,
    ),
]

for idx, cb_params in enumerate(cat_params_list, start=1):
    print(f"=== CatBoost model {idx}/4 ===")
    cb_name = f"catboost_{idx}"
    booster = CatModel(
        loss_function=loss,
        eval_metric=eval_metric,
        random_seed=42 + idx,  # a different seed per model
        verbose=200,
        task_type="CPU",  # can be switched to "GPU"
        **cb_params,
    )
    booster.fit(train_pool_cb)
    cat_models[cb_name] = booster
    # Regression: raw predictions. Binary: probability of the positive class.
    cat_preds_test[cb_name] = (
        booster.predict(test_pool_cb)
        if TASK_TYPE == "regression"
        else booster.predict_proba(test_pool_cb)[:, 1]
    )

print("All 4 CatBoost models trained")

In [None]:
# ============================
# ENSEMBLE CATBOOST SUBMISSION
# ============================
# Average the CatBoost test predictions with equal weights and write them
# out as a submission file.
cat_pred_matrix = np.column_stack([cat_preds_test[k] for k in cat_preds_test])
cat_ens = np.mean(cat_pred_matrix, axis=1)

# Start from the sample submission when it was loaded, otherwise from an
# empty frame.
if sample_sub is None:
    sub_cat = pd.DataFrame()
else:
    sub_cat = sample_sub.copy()

if ID_COL in test.columns:
    sub_cat[ID_COL] = test[ID_COL].values
sub_cat[TARGET_COL] = cat_ens

path_cat = OUTPUT_DIR / "submission_catboost_ensemble.csv"
sub_cat.to_csv(path_cat, index=False)
print("Saved CatBoost ensemble submission to:", path_cat)

In [None]:
# ============================
# LINEAR MODEL (REGRESSION / LOGISTIC)
# ============================
# Fit one linear baseline (LinearRegression or LogisticRegression, depending
# on TASK_TYPE), predict on test, and write a standalone submission.

# scikit-learn's linear estimators reject NaN input, and no earlier cell
# imputes the numeric columns. Fill gaps with the TRAIN median (0.0 when a
# column is entirely NaN in train) on private copies, so the tree models keep
# seeing the raw NaNs. When there are no NaNs the data is used unchanged.
_nan_cols = [
    c for c in X_train.columns
    if X_train[c].isna().any() or X_test[c].isna().any()
]
if _nan_cols:
    _fill_values = X_train[_nan_cols].median(numeric_only=True).fillna(0.0)
    X_train_lin = X_train.fillna(_fill_values)
    X_test_lin = X_test.fillna(_fill_values)
else:
    X_train_lin, X_test_lin = X_train, X_test

if TASK_TYPE == "regression":
    lin_model = LinearRegression()
    lin_model.fit(X_train_lin, y)
    lin_pred_test = lin_model.predict(X_test_lin)
else:
    lin_model = LogisticRegression(max_iter=1000)
    lin_model.fit(X_train_lin, y)
    # Probability of the positive class.
    lin_pred_test = lin_model.predict_proba(X_test_lin)[:, 1]

# Start from the sample submission when it was loaded, otherwise from an
# empty frame.
sub_lin = sample_sub.copy() if sample_sub is not None else pd.DataFrame()
if ID_COL in test.columns:
    sub_lin[ID_COL] = test[ID_COL].values
sub_lin[TARGET_COL] = lin_pred_test

path_lin = OUTPUT_DIR / "submission_linear.csv"
sub_lin.to_csv(path_lin, index=False)
print("Saved linear model submission to:", path_lin)

In [None]:
# ============================
# 2 LIGHTGBM MODELS
# ============================
# Train the LightGBM models listed in `lgb_params_list` and store their test
# predictions keyed by "lgbm_<n>".
lgb_preds_test = {}

if TASK_TYPE == "regression":
    LGBModel = lgb.LGBMRegressor
    obj = "regression"
    metric = "rmse"
else:
    LGBModel = lgb.LGBMClassifier
    obj = "binary"
    metric = "auc"
# NOTE: `metric` is not passed to the models below; no eval_set is used in
# this cell, so there is nothing for it to evaluate.

# `subsample` (bagging_fraction) only takes effect when `subsample_freq`
# (bagging_freq) is non-zero. The estimator default is 0, which silently
# disables row subsampling, so it is set to 1 (re-sample every iteration).
lgb_params_list = [
    dict(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
    ),
    dict(
        n_estimators=3000,
        learning_rate=0.02,
        num_leaves=128,
        subsample=0.9,
        subsample_freq=1,
        colsample_bytree=0.9,
    ),
]

n_lgb_models = len(lgb_params_list)
for i, params in enumerate(lgb_params_list, 1):
    print(f"=== LightGBM model {i}/{n_lgb_models} ===")
    model_name = f"lgbm_{i}"
    model = LGBModel(
        objective=obj,
        **params,
        random_state=42 + i,  # a different seed per model
    )
    model.fit(X_train, y)
    if TASK_TYPE == "regression":
        pred_test = model.predict(X_test)
    else:
        # Probability of the positive class.
        pred_test = model.predict_proba(X_test)[:, 1]
    lgb_preds_test[model_name] = pred_test

print("LightGBM models trained")

In [None]:
# ============================
# XGBOOST MODEL
# ============================
# Train a single XGBoost model and store its test predictions under "xgb_1".
xgb_preds_test = {}

# Estimator class, objective, and evaluation metric for the chosen task.
XGBModel, obj, eval_metric = (
    (xgb.XGBRegressor, "reg:squarederror", "rmse")
    if TASK_TYPE == "regression"
    else (xgb.XGBClassifier, "binary:logistic", "auc")
)

print("=== XGBoost model ===")
_xgb_settings = dict(
    n_estimators=3000,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    objective=obj,
    eval_metric=eval_metric,
    random_state=42,
    tree_method="hist",
)
xgb_model = XGBModel(**_xgb_settings)
xgb_model.fit(X_train, y)

# Regression: raw predictions. Binary: probability of the positive class.
xgb_pred_test = (
    xgb_model.predict(X_test)
    if TASK_TYPE == "regression"
    else xgb_model.predict_proba(X_test)[:, 1]
)
xgb_preds_test["xgb_1"] = xgb_pred_test

print("XGBoost trained")

In [None]:
# ============================
# GLOBAL ENSEMBLE OF ALL MODELS
# ============================
# Gather the test predictions of every model into one mapping, in this order:
# CatBoost models, the linear model, LightGBM models, XGBoost.
all_preds = {
    **cat_preds_test,
    "linear": lin_pred_test,
    **lgb_preds_test,
    **xgb_preds_test,
}

# One column per model; the ensemble is the equal-weight row-wise mean.
pred_matrix = np.column_stack([all_preds[name] for name in all_preds])
ens_all = np.mean(pred_matrix, axis=1)

if TASK_TYPE == "binary":
    # Keep the averaged probabilities inside [0, 1].
    ens_all = np.clip(ens_all, 0.0, 1.0)

# Start from the sample submission when it was loaded, otherwise from an
# empty frame.
if sample_sub is None:
    sub_all = pd.DataFrame()
else:
    sub_all = sample_sub.copy()

if ID_COL in test.columns:
    sub_all[ID_COL] = test[ID_COL].values
sub_all[TARGET_COL] = ens_all

path_all = OUTPUT_DIR / "submission_ensemble_all_models.csv"
sub_all.to_csv(path_all, index=False)
print("Saved global ensemble submission to:", path_all)