In [None]:
# ============================
# CONFIG
# ============================
# Every user-tunable setting of the notebook lives in this cell.
from pathlib import Path

# Input files are resolved relative to DATA_DIR.
DATA_DIR = Path(".")
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
SAMPLE_SUB_PATH = DATA_DIR / "sample_submission.csv"

ID_COL = "id"          # name of the ID column (must be present in both train and test)
TARGET_COL = "target"  # name of the target column (train only)

TASK_TYPE = "auto"     # "auto", "regression", "binary" or "multiclass"

# Cross-validation settings used by the training cell.
N_FOLDS = 5
RANDOM_STATE = 42

USE_AUTOML_FEATURES = True   # if True, use AutoMLPipelineFeatureGenerator (autogluon)
MAX_INTERACTION_NUM_COLS = 50  # max number of numeric columns used for pairwise interactions

# Output files: generated feature tables and the final submission.
OUTPUT_FEATURE_TRAIN = DATA_DIR / "train_features.csv"
OUTPUT_FEATURE_TEST = DATA_DIR / "test_features.csv"
OUTPUT_SUBMISSION = DATA_DIR / "submission_catboost_auto_features.csv"

print("Config OK")

In [None]:
# ============================
# IMPORTS & INSTALLS
# ============================
import sys
import numpy as np
import pandas as pd

try:
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool
except ImportError:
    !{sys.executable} -m pip install -q catboost
    from catboost import CatBoostRegressor, CatBoostClassifier, Pool

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score

if USE_AUTOML_FEATURES:
    try:
        from autogluon.features.generators import AutoMLPipelineFeatureGenerator
    except ImportError:
        !{sys.executable} -m pip install -q autogluon.features
        from autogluon.features.generators import AutoMLPipelineFeatureGenerator

print("Imports OK")

In [None]:
# ============================
# LOAD DATA
# ============================
# Read the two mandatory tables first, in the same order as before.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# The sample submission is optional: fall back to None when the file is absent.
sample_sub = None
if SAMPLE_SUB_PATH.exists():
    sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Quick sanity report on what was loaded.
print(f"train shape: {train.shape}")
print(f"test shape: {test.shape}")
print(f"columns: {train.columns.tolist()}")

In [None]:
# ============================
# BASIC CHECKS & TASK TYPE
# ============================
# Required columns must exist before anything else runs.
assert ID_COL in train.columns, f"{ID_COL} not in train"
assert ID_COL in test.columns, f"{ID_COL} not in test"
assert TARGET_COL in train.columns, f"{TARGET_COL} not in train"

# Reject unknown modes early: the training cell treats every value other than
# "regression"/"binary" as multiclass, so a typo would silently train the
# wrong kind of model.
_VALID_TASK_TYPES = ("auto", "regression", "binary", "multiclass")
if TASK_TYPE not in _VALID_TASK_TYPES:
    raise ValueError(
        f"Unknown TASK_TYPE {TASK_TYPE!r}; expected one of {_VALID_TASK_TYPES}"
    )

# Decide which kind of task we are solving.
y = train[TARGET_COL]
if TASK_TYPE == "auto":
    n_unique = y.nunique()
    if n_unique == 0:
        raise ValueError(f"Target column {TARGET_COL!r} has no non-missing values")

    if y.dtype.kind in "ifu":
        # Numeric target: a small number of distinct, integer-valued levels is
        # treated as classification, anything else as regression.
        # The integrality test replaces building set(range(min, max + 1)),
        # which allocated one element per integer in the target's value range.
        observed = y.dropna().to_numpy()
        is_integral = bool(
            np.isfinite(observed).all() and np.all(observed == np.round(observed))
        )
        if n_unique <= 20 and is_integral:
            TASK_TYPE_DETECTED = "binary" if n_unique == 2 else "multiclass"
        else:
            TASK_TYPE_DETECTED = "regression"
    else:
        # Non-numeric target (strings, booleans, ...): always classification.
        TASK_TYPE_DETECTED = "binary" if n_unique == 2 else "multiclass"
else:
    # An explicit setting overrides the heuristic.
    TASK_TYPE_DETECTED = TASK_TYPE

print("Detected TASK_TYPE:", TASK_TYPE_DETECTED)

In [None]:
# ============================
# BASE FEATURES & PREPROCESS
# ============================
# Features = every column shared by train and test, except the ID and the target.
# The target is excluded explicitly so that a test file which happens to carry
# a target column cannot leak it into the feature set.
common_cols = [c for c in train.columns if c in test.columns]
feature_cols = [c for c in common_cols if c not in (ID_COL, TARGET_COL)]

X_train_base = train[feature_cols].copy()
X_test_base = test[feature_cols].copy()
y = train[TARGET_COL].copy()


def _is_categorical_dtype(dtype):
    """Return True for object, pandas string and pandas category dtypes."""
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


# Column roles are decided from the train dtypes; everything that is not
# categorical-like is handled as numeric.
cat_cols = [c for c in feature_cols if _is_categorical_dtype(train[c].dtype)]
num_cols = [c for c in feature_cols if c not in cat_cols]

for frame in (X_train_base, X_test_base):
    for c in cat_cols:
        # Fill missing values BEFORE the string cast: astype(str) turns NaN into
        # the literal "nan", after which fillna has nothing left to fill.
        # Going through object dtype keeps this safe for category columns,
        # which refuse fill values that are not existing categories.
        as_object = frame[c].astype(object)
        frame[c] = as_object.where(as_object.notna(), "missing").astype(str)

    for c in num_cols:
        # Unparseable values become NaN instead of raising.
        frame[c] = pd.to_numeric(frame[c], errors="coerce")

print("n_features base:", len(feature_cols))
print("n_num:", len(num_cols), "| n_cat:", len(cat_cols))

In [None]:
# ============================
# MANUAL FEATURE GENERATION
# ============================
X_train_man = X_train_base.copy()
X_test_man = X_test_base.copy()


def _with_new_columns(frame, new_cols):
    """Return `frame` with all columns from `new_cols` attached.

    `new_cols` maps column name -> Series aligned with `frame.index`.
    Names that already exist in `frame` are overwritten in place (same result
    as a plain `frame[name] = values`); all remaining columns are appended
    with a single concat, which avoids the DataFrame fragmentation caused by
    inserting hundreds of columns one at a time.
    """
    fresh = {}
    for name, values in new_cols.items():
        if name in frame.columns:
            frame[name] = values
        else:
            fresh[name] = values
    if not fresh:
        return frame
    return pd.concat([frame, pd.DataFrame(fresh, index=frame.index)], axis=1)


# 1) Datetime decomposition.
# The dtype is checked on the raw train frame. is_datetime64_any_dtype is used
# because np.issubdtype raises TypeError for pandas extension dtypes.
for c in feature_cols:
    if pd.api.types.is_datetime64_any_dtype(train[c]):
        dt_tr = pd.to_datetime(X_train_man[c], errors="coerce")
        dt_te = pd.to_datetime(X_test_man[c], errors="coerce")
        for attr in ["year", "month", "day", "dayofweek", "hour"]:
            X_train_man[f"{c}_{attr}"] = getattr(dt_tr.dt, attr)
            X_test_man[f"{c}_{attr}"] = getattr(dt_te.dt, attr)
        # The raw datetime column is replaced by its parts.
        X_train_man = X_train_man.drop(columns=[c])
        X_test_man = X_test_man.drop(columns=[c])

# 2) Numeric transforms: log1p, sqrt, square.
train_new, test_new = {}, {}
for c in num_cols:
    if c not in X_train_man.columns:
        continue  # e.g. a datetime column removed in step 1
    tr = X_train_man[c].astype(float)
    te = X_test_man[c].astype(float)
    # log1p is only generated when train has at least one positive value;
    # the decision is made on train so both frames get the same columns.
    if (tr > 0).any():
        train_new[f"{c}_log1p"] = np.log1p(tr.clip(lower=0))
        test_new[f"{c}_log1p"] = np.log1p(te.clip(lower=0))
    # Negative inputs are clipped to 0 before sqrt to avoid NaN results.
    train_new[f"{c}_sqrt"] = np.sqrt(tr.clip(lower=0))
    test_new[f"{c}_sqrt"] = np.sqrt(te.clip(lower=0))
    train_new[f"{c}_sq"] = tr ** 2
    test_new[f"{c}_sq"] = te ** 2
X_train_man = _with_new_columns(X_train_man, train_new)
X_test_man = _with_new_columns(X_test_man, test_new)

# 3) Pairwise products of numeric features, limited to the first
#    MAX_INTERACTION_NUM_COLS columns to keep the feature count bounded.
num_cols_for_int = [c for c in num_cols if c in X_train_man.columns]
num_cols_for_int = num_cols_for_int[:MAX_INTERACTION_NUM_COLS]

# Cast each operand to float once instead of once per pair.
tr_float = {c: X_train_man[c].astype(float) for c in num_cols_for_int}
te_float = {c: X_test_man[c].astype(float) for c in num_cols_for_int}

train_new, test_new = {}, {}
for i in range(len(num_cols_for_int)):
    for j in range(i + 1, len(num_cols_for_int)):
        c1 = num_cols_for_int[i]
        c2 = num_cols_for_int[j]
        name_mul = f"{c1}*{c2}"
        train_new[name_mul] = tr_float[c1] * tr_float[c2]
        test_new[name_mul] = te_float[c1] * te_float[c2]
X_train_man = _with_new_columns(X_train_man, train_new)
X_test_man = _with_new_columns(X_test_man, test_new)

# 4) Frequency encoding of categorical columns.
#    Frequencies are estimated on train only; categories unseen in train
#    get frequency 0 in test.
train_new, test_new = {}, {}
for c in cat_cols:
    if c in X_train_man.columns:
        freq = X_train_man[c].value_counts(normalize=True)
        train_new[f"{c}_freq"] = X_train_man[c].map(freq)
        test_new[f"{c}_freq"] = X_test_man[c].map(freq).fillna(0)
X_train_man = _with_new_columns(X_train_man, train_new)
X_test_man = _with_new_columns(X_test_man, test_new)

print("Manual features shape:", X_train_man.shape, X_test_man.shape)

In [None]:
# ============================
# AUTOML PIPELINE FEATURE GENERATOR (OPTIONAL)
# ============================
# Optional second feature source: autogluon's automatic pipeline generator.
if not USE_AUTOML_FEATURES:
    # Disabled in CONFIG: downstream cells check for None and skip the merge.
    X_train_auto = X_test_auto = None
else:
    fg = AutoMLPipelineFeatureGenerator(
        # raw text columns are not passed through; only derived text features
        enable_raw_text_features=False,
        enable_text_ngram_features=True,
        enable_text_special_features=True,
        enable_datetime_features=True,
        enable_categorical_features=True,
        enable_numeric_features=True,
    )
    # Fit on train only, then apply the fitted generator to test.
    X_train_auto = fg.fit_transform(X=X_train_base, y=y)
    X_test_auto = fg.transform(X_test_base)
    print("AutoML features shape:", X_train_auto.shape, X_test_auto.shape)

In [None]:
# ============================
# MERGE MANUAL + AUTOML FEATURES
# ============================
# Combine the manual features with the AutoML ones (when they were generated).
if X_train_auto is None:
    X_train_all, X_test_all = X_train_man, X_test_man
else:
    # Indices are reset so the two blocks are glued together purely by position.
    X_train_all = pd.concat(
        [frame.reset_index(drop=True) for frame in (X_train_man, X_train_auto)],
        axis="columns",
    )
    X_test_all = pd.concat(
        [frame.reset_index(drop=True) for frame in (X_test_man, X_test_auto)],
        axis="columns",
    )

# Drop repeated column names, keeping the first occurrence of each.
train_is_dup = X_train_all.columns.duplicated()
X_train_all = X_train_all.loc[:, ~train_is_dup]
test_is_dup = X_test_all.columns.duplicated()
X_test_all = X_test_all.loc[:, ~test_is_dup]

print("Final feature shapes:", X_train_all.shape, X_test_all.shape)

# Persist the feature tables (optional).
X_train_all.to_csv(OUTPUT_FEATURE_TRAIN, index=False)
X_test_all.to_csv(OUTPUT_FEATURE_TEST, index=False)
print("Saved features to:", OUTPUT_FEATURE_TRAIN, OUTPUT_FEATURE_TEST, sep="\n")

In [None]:
# ============================
# PREPARE CATBOOST POOLS
# ============================
# Identify categorical features in X_train_all by dtype.
# Besides plain object columns this also picks up pandas `category` and string
# dtypes: any such column that reaches this point must be declared in
# cat_features as well, otherwise it is handed to CatBoost as if it were numeric.
cat_cols_all = [
    c
    for c, dtype in X_train_all.dtypes.items()
    if pd.api.types.is_object_dtype(dtype)
    or pd.api.types.is_string_dtype(dtype)
    or isinstance(dtype, pd.CategoricalDtype)
]

# CatBoost expects categorical values as strings (or integers); cast both
# frames the same way so train and test categories are comparable.
for c in cat_cols_all:
    X_train_all[c] = X_train_all[c].astype(str)
    X_test_all[c] = X_test_all[c].astype(str)

# Positional indices of the categorical columns, as passed to Pool(cat_features=...).
cat_idx_all = [X_train_all.columns.get_loc(c) for c in cat_cols_all]

print("Total features:", X_train_all.shape[1])
print("Cat features:", len(cat_cols_all))

In [None]:
# ============================
# K-FOLD CATBOOST TRAINING
# ============================
# Inputs of the cross-validated training loop.
X = X_train_all
X_test_final = X_test_all
y_arr = y.values

is_regression = TASK_TYPE_DETECTED == "regression"
is_binary = TASK_TYPE_DETECTED == "binary"
is_multiclass = not (is_regression or is_binary)

# Pool() takes None (not an empty list) when there are no categorical features.
cat_features_arg = cat_idx_all if cat_idx_all else None

# Classification folds are stratified by the target; regression uses plain K-fold.
if is_regression:
    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
else:
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold and test predictions:
#   regression -> predicted values, binary -> probability of the second class.
oof = np.zeros(len(X))
test_pred = np.zeros(len(X_test_final))

if is_multiclass:
    # Multiclass predictions are accumulated as per-class probabilities and
    # turned into labels only after the last fold; averaging predicted labels
    # across folds would be meaningless.
    classes = np.unique(y_arr)
    class_pos = {label: i for i, label in enumerate(classes)}
    oof_proba = np.zeros((len(X), len(classes)))
    test_proba = np.zeros((len(X_test_final), len(classes)))

# Hyper-parameters shared by all three model types.
common_params = dict(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    verbose=200,
    task_type="CPU",
)

# The test pool does not depend on the fold, so it is built once.
test_pool = Pool(X_test_final, cat_features=cat_features_arg)

models = []
scores = []

for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y_arr), 1):
    print(f"=== Fold {fold}/{N_FOLDS} ===")
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y_arr[tr_idx], y_arr[va_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_features_arg)
    valid_pool = Pool(X_va, y_va, cat_features=cat_features_arg)

    # A different seed per fold decorrelates the fold models.
    fold_seed = RANDOM_STATE + fold
    if is_regression:
        model = CatBoostRegressor(
            loss_function="RMSE",
            eval_metric="RMSE",
            random_seed=fold_seed,
            **common_params,
        )
    elif is_binary:
        model = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            random_seed=fold_seed,
            **common_params,
        )
    else:  # multiclass
        model = CatBoostClassifier(
            loss_function="MultiClass",
            eval_metric="MultiClass",
            random_seed=fold_seed,
            **common_params,
        )

    # use_best_model keeps the iteration that scored best on the validation fold.
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    models.append(model)

    if is_regression:
        va_pred = model.predict(valid_pool)
        oof[va_idx] = va_pred
        # sqrt(MSE) instead of mean_squared_error(squared=False): the `squared`
        # argument is not accepted by recent scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(y_va, va_pred)))
        scores.append(rmse)
        print(f"Fold {fold} RMSE: {rmse:.5f}")
        test_pred += model.predict(test_pool) / N_FOLDS
    elif is_binary:
        va_pred_proba = model.predict_proba(valid_pool)[:, 1]
        oof[va_idx] = va_pred_proba
        auc = roc_auc_score(y_va, va_pred_proba)
        scores.append(auc)
        print(f"Fold {fold} AUC: {auc:.5f}")
        test_pred += model.predict_proba(test_pool)[:, 1] / N_FOLDS
    else:
        # Map this model's class order onto the global class order, so a fold
        # whose training part lacks a rare class still lands in the right columns.
        cols = [class_pos[label] for label in model.classes_]
        oof_proba[np.ix_(va_idx, cols)] = model.predict_proba(valid_pool)
        test_proba[:, cols] += model.predict_proba(test_pool) / N_FOLDS
        mlogloss = log_loss(y_va, oof_proba[va_idx], labels=classes)
        scores.append(mlogloss)
        print(f"Fold {fold} MultiClass logloss: {mlogloss:.5f}")

print("CV scores:", scores)
if is_regression:
    print("OOF RMSE:", float(np.sqrt(mean_squared_error(y_arr, oof))))
elif is_binary:
    print("OOF AUC:", roc_auc_score(y_arr, oof))
else:
    print("OOF MultiClass logloss:", log_loss(y_arr, oof_proba, labels=classes))
    # Final multiclass outputs are class labels: argmax of the averaged probabilities.
    oof = classes[oof_proba.argmax(axis=1)]
    test_pred = classes[test_proba.argmax(axis=1)]

In [None]:
# ============================
# BUILD SUBMISSION
# ============================
# Assemble the submission table.
if sample_sub is None or TARGET_COL not in sample_sub.columns:
    # No usable template: build the frame from scratch.
    if ID_COL not in test.columns:
        sub = pd.DataFrame({TARGET_COL: test_pred})
    else:
        sub = pd.DataFrame({ID_COL: test[ID_COL].values, TARGET_COL: test_pred})
else:
    # Reuse the sample submission layout and overwrite its contents.
    sub = sample_sub.copy()
    if ID_COL in sub.columns and ID_COL in test.columns:
        sub[ID_COL] = test[ID_COL].values
    sub[TARGET_COL] = test_pred

sub.to_csv(OUTPUT_SUBMISSION, index=False)
print(f"Saved submission to: {OUTPUT_SUBMISSION}")
sub.head(5)