In [None]:
# ============================
# Config
# ============================
import pandas as pd
import numpy as np
from pathlib import Path

# --- Input files (all resolved relative to DATA_DIR) ---
DATA_DIR = Path(".")
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")
SAMPLE_SUB_PATH = DATA_DIR.joinpath("sample_submission.csv")

# --- Column names ---
ID_COL = "id"          # ID column in train/test/sample_submission
TARGET_COL = "target"  # target column in train

# --- Task kind: "regression" or "binary" ---
TASK_TYPE = "regression"

# --- Split sizes ---
AV_VALID_SIZE = 0.2         # fraction of rows held out to validate the adversarial model
MAIN_VALID_FRAC_HIGH = 0.2  # fraction of the most "test-like" train rows used to validate the main model

# --- Seed shared by the splitter and both CatBoost models ---
RANDOM_STATE = 42

print("Config OK")


In [None]:
# ============================
# Imports
# ============================
# sys is needed only for sys.executable in the pip fallback below.
import sys

# Import CatBoost; if it is missing, install it and retry the import once.
try:
    from catboost import CatBoostClassifier, CatBoostRegressor, Pool
except ImportError:
    # IPython shell escape: run pip through the interpreter that executes this
    # kernel, so the package is installed into the same environment.
    # NOTE(review): the install is unpinned, so the CatBoost version may differ between runs.
    !{sys.executable} -m pip install -q catboost
    from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
# roc_auc_score is bound twice on purpose: under its own name (used for the
# adversarial AUC) and as auc_score (used for the main binary-model AUC).
from sklearn.metrics import roc_auc_score, mean_squared_error, log_loss, roc_auc_score as auc_score

print("Libs imported")


In [None]:
# ============================
# Load data
# ============================
# Read the three CSV files from the paths defined in the config cell.
train_df, test_df, sample_sub = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)

# Quick sanity output: sizes of both frames and the train schema.
print("train shape:", train_df.shape)
print("test shape:", test_df.shape)
print("train columns:", list(train_df.columns))


In [None]:
# ============================
# Feature setup
# ============================
# Features = every column present in both train and test, except the ID and the target.
# The target is excluded explicitly: if the test file also carries a target column
# (e.g. a placeholder), it would otherwise end up among the features and leak.
common_cols = [c for c in train_df.columns if c in test_df.columns]
feature_cols = [c for c in common_cols if c not in (ID_COL, TARGET_COL)]

print("n_features:", len(feature_cols))
print("feature_cols (first 20):", feature_cols[:20])

X_train_full = train_df[feature_cols].copy()
y_train_full = train_df[TARGET_COL].copy()
X_test = test_df[feature_cols].copy()

# Categorical features are detected from the train dtype: object, category,
# and the pandas string dtypes (reported as "string" or "str").
cat_cols = [
    c for c in feature_cols
    if str(train_df[c].dtype) in ("object", "category", "string", "str")
]
print("n_cat_features:", len(cat_cols))


def _as_clean_str(col: pd.Series) -> pd.Series:
    """Return `col` as strings with missing values replaced by "missing".

    The replacement must happen BEFORE the cast to str: `astype(str)` turns NaN
    into the literal text "nan", after which `fillna` has nothing left to fill.
    Going through object dtype first also lets the placeholder be written into
    categorical columns whose categories do not contain "missing".
    """
    as_obj = col.astype(object)
    return as_obj.where(as_obj.notna(), "missing").astype(str)


# Light preprocessing of categorical columns: explicit missing marker + string values.
for c in cat_cols:
    X_train_full[c] = _as_clean_str(X_train_full[c])
    X_test[c] = _as_clean_str(X_test[c])


In [None]:
# ============================
# Adversarial validation: prepare combined dataset
# ============================
# Stack train and test features into a single frame and label each row by its
# origin (0 = train, 1 = test); this label is the adversarial model's target.
train_av = X_train_full.copy()
test_av = X_test.copy()

train_av["is_test"] = 0
test_av["is_test"] = 1

# Train rows come first, then test rows; ignore_index gives a clean 0..N-1 index.
full_av = pd.concat([train_av, test_av], axis=0, ignore_index=True)
y_av = full_av["is_test"].values
X_av = full_av.drop(columns=["is_test"])

# Categorical columns for the adversarial model: everything already declared
# categorical in cat_cols, plus any column that is object-typed after the concat.
# Relying on `dtype == "object"` alone can miss columns from cat_cols when the
# string cast produces a pandas string dtype instead of object, which would
# leave raw strings in columns CatBoost is told are numeric.
known_cat = set(cat_cols)
cat_cols_av = [
    c for c in X_av.columns
    if c in known_cat or str(X_av[c].dtype) == "object"
]

print("full_av shape:", full_av.shape)
print("cat_cols_av:", cat_cols_av)


In [None]:
# ============================
# Train adversarial model (train vs test classifier)
# ============================
# Stratified hold-out split of the combined frame, stratified on the train/test label.
X_tr_av, X_va_av, y_tr_av, y_va_av = train_test_split(
    X_av,
    y_av,
    stratify=y_av,
    test_size=AV_VALID_SIZE,
    random_state=RANDOM_STATE,
)

# Categorical features are passed to the pools as positional column indices;
# an empty list is replaced by None.
cat_idx_av = [X_tr_av.columns.get_loc(name) for name in cat_cols_av]

train_pool_av = Pool(X_tr_av, y_tr_av, cat_features=cat_idx_av or None)
valid_pool_av = Pool(X_va_av, y_va_av, cat_features=cat_idx_av or None)

# Binary classifier that tries to tell test rows from train rows.
av_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=RANDOM_STATE,
    task_type="CPU",  # can be switched to "GPU", devices="0"
    verbose=200,
)

av_model.fit(train_pool_av, eval_set=valid_pool_av, use_best_model=True)

# AUC on the hold-out part measures how distinguishable train and test rows are.
probs_va = av_model.predict_proba(valid_pool_av)[:, 1]
av_auc = roc_auc_score(y_va_av, probs_va)
print(f"Adversarial AUC (train vs test distinguishability): {av_auc:.4f}")


In [None]:
# ============================
# Feature importances (drifting features)
# ============================
# Importance of each feature for the train-vs-test classifier: features at the
# top are the ones the adversarial model relied on most.
feat_importances = av_model.get_feature_importance(train_pool_av, type="FeatureImportance")

feat_imp_df = (
    pd.DataFrame({"feature": X_tr_av.columns, "importance": feat_importances})
    .sort_values(by="importance", ascending=False)
)

print("Top train-vs-test drifting features:")
display(feat_imp_df.head(20))


In [None]:
# ============================
# Compute "probability of being test" for each row
# ============================
# Score every row of the combined frame with the adversarial model;
# column 1 of predict_proba is the probability of the "is_test" class.
full_pool_av = Pool(X_av, cat_features=cat_idx_av or None)
probs_test_like = av_model.predict_proba(full_pool_av)[:, 1]

# The combined frame was built as [train rows, test rows], so the first
# len(train_df) scores belong to the train rows. Store them back on train_df.
# NOTE(review): rows that were part of the adversarial model's training pool
# get in-sample scores here — confirm this is acceptable for the split.
train_df["av_score"] = probs_test_like[:train_df.shape[0]]

print("av_score stats on train:")
print(train_df["av_score"].describe())


In [None]:
# ============================
# Build main train/val split based on av_score
# ============================
# The higher the av_score, the more test-like the row, so the top
# MAIN_VALID_FRAC_HIGH share of train rows is used as the validation set.
# The quantile is still reported for reference.
threshold = train_df["av_score"].quantile(1.0 - MAIN_VALID_FRAC_HIGH)
print(f"Adversarial threshold for validation: {threshold:.4f}")

# Validation rows are chosen by rank, not by `av_score >= threshold`.
# Tree-model scores are often heavily tied; with ties at the threshold the
# `>=` comparison can push far more than the requested share (in the extreme,
# every row) into validation and leave the training part empty.
n_rows = len(train_df)
if n_rows < 2:
    raise ValueError("Need at least 2 train rows to build a train/validation split")

# Clamp so that both parts always contain at least one row.
n_valid = min(max(int(round(n_rows * MAIN_VALID_FRAC_HIGH)), 1), n_rows - 1)

# Stable descending order: among tied scores, earlier rows are taken first.
rank_desc = np.argsort(-train_df["av_score"].to_numpy(), kind="stable")
is_valid = np.zeros(n_rows, dtype=bool)
is_valid[rank_desc[:n_valid]] = True

# Boolean masks aligned with the train index (shared by train_df and X_train_full).
val_mask = pd.Series(is_valid, index=train_df.index)
train_mask = ~val_mask

X_tr_main = X_train_full.loc[train_mask].reset_index(drop=True)
y_tr_main = y_train_full.loc[train_mask].reset_index(drop=True)

X_va_main = X_train_full.loc[val_mask].reset_index(drop=True)
y_va_main = y_train_full.loc[val_mask].reset_index(drop=True)

print("Main train shape:", X_tr_main.shape)
print("Main valid shape:", X_va_main.shape)

# Positional indices of the categorical columns for the main model's pools.
cat_idx_main = [X_tr_main.columns.get_loc(c) for c in cat_cols]


In [None]:
# ============================
# Train main CatBoost model with AV-based split
# ============================
# Hyper-parameters shared by the regression and the binary variants of the main model.
main_params = dict(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    random_seed=RANDOM_STATE,
    verbose=200,
    task_type="CPU",  # can be switched to "GPU"
)

# Any TASK_TYPE other than "regression" is treated as binary classification.
if TASK_TYPE == "regression":
    main_model = CatBoostRegressor(loss_function="RMSE", eval_metric="RMSE", **main_params)
else:
    main_model = CatBoostClassifier(loss_function="Logloss", eval_metric="AUC", **main_params)

# Train on the less test-like rows, validate on the most test-like ones.
train_pool_main = Pool(X_tr_main, y_tr_main, cat_features=cat_idx_main if cat_idx_main else None)
valid_pool_main = Pool(X_va_main, y_va_main, cat_features=cat_idx_main if cat_idx_main else None)

main_model.fit(train_pool_main, eval_set=valid_pool_main, use_best_model=True)

if TASK_TYPE == "regression":
    va_pred = main_model.predict(valid_pool_main)
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn
    # releases, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_va_main, va_pred)))
    print(f"Main model RMSE on AV-based valid: {rmse:.5f}")
else:
    # Both metrics are computed on the predicted probability of the positive class.
    va_pred_proba = main_model.predict_proba(valid_pool_main)[:, 1]
    logloss = log_loss(y_va_main, va_pred_proba)
    auc = auc_score(y_va_main, va_pred_proba)
    print(f"Main model Logloss: {logloss:.5f}, AUC: {auc:.5f}")


In [None]:
# ============================
# Retrain on full train and predict test
# ============================
# Pools over the full train set and the test set, with the same categorical
# column indices as the main model's train/valid pools.
full_pool_main = Pool(X_train_full, y_train_full, cat_features=cat_idx_main if cat_idx_main else None)
test_pool = Pool(X_test, cat_features=cat_idx_main if cat_idx_main else None)

# Refit the same model object on all train rows; no eval set is passed here.
# NOTE(review): without an eval set the refit presumably runs the full
# configured number of iterations rather than the best iteration found on the
# AV-based validation — confirm this is intended.
main_model.fit(full_pool_main, verbose=200)

# For the binary task, predict the positive-class probability so the
# submission matches what was validated (Logloss / AUC on probabilities);
# `predict` on a classifier returns class labels instead.
if TASK_TYPE == "regression":
    test_pred = main_model.predict(test_pool)
else:
    test_pred = main_model.predict_proba(test_pool)[:, 1]


In [None]:
# ============================
# Build submission
# ============================
# Start from a copy of the sample submission so its column layout is reused.
sub = sample_sub.copy()

# Overwrite the IDs with the test IDs only when both frames have the ID column.
if all(ID_COL in frame.columns for frame in (sub, test_df)):
    sub[ID_COL] = test_df[ID_COL].values

# Write the predictions into the target column and save the file without the index.
sub[TARGET_COL] = test_pred
sub.to_csv("submission_catboost_av.csv", index=False)
print("Saved submission to: submission_catboost_av.csv")

# Preview of the saved submission.
sub.head()
