In [15]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, f1_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

In [16]:
# Read the three competition CSVs from the working directory in one pass:
# labelled training rows, unlabelled test rows, and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('contest_train.csv', 'contest_test.csv', 'sample_submission.csv')
)

In [17]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

((210000, 32), (90000, 31))

In [18]:
# Label vector first, then the feature matrix: everything except the
# row identifier and the label itself.
y = train['target']
X = train.drop(columns=['id', 'target'])

In [19]:
# Object-dtype columns are the ones handed to CatBoost as categorical features.
object_columns = X.select_dtypes(include='object').columns
cat_features = list(object_columns)
len(cat_features)

19

In [20]:
# 80/20 hold-out split, stratified on the label so both parts keep the same
# class ratio; the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [21]:
# Seed-stability experiment: fit the same CatBoost configuration under several
# random seeds, score each model on the hold-out split, and keep each model's
# test-set probabilities so they can be averaged into an ensemble afterwards.
results = []

seeds = [42, 2024, 777, 13, 99]
test_probas = []  # one array of positive-class probabilities per seed

# Required before the loop: every seed's model predicts on these test features.
X_test = test.drop(columns=['id'])

for seed in seeds:
    model = CatBoostClassifier(
        iterations=1200,
        depth=8,
        learning_rate=0.05,
        l2_leaf_reg=3,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=seed,
        verbose=0
    )

    # NOTE(review): X_val is also the eval_set that use_best_model selects the
    # kept iteration on, so the AUC reported below is measured on the same data
    # that chose the model size and is likely optimistic -- confirm on an
    # untouched split before trusting small differences between seeds.
    model.fit(
        X_train, y_train,
        cat_features=cat_features,
        eval_set=(X_val, y_val),
        use_best_model=True
    )

    # Validation AUC from the positive-class probability (column 1).
    val_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_proba)

    results.append({
        'seed': seed,
        'auc': auc
    })

    # Collect this seed's test predictions; previously `test_probas` (and
    # `X_test`) were prepared but never used, so the list always stayed empty.
    test_probas.append(model.predict_proba(X_test)[:, 1])

# After the loop `model` and `val_proba` belong to the LAST seed in `seeds`,
# not to the best-scoring one; later cells that read `val_proba` inherit that.

# Seed comparison table, best AUC first.
results_df = pd.DataFrame(results).sort_values('auc', ascending=False)
results_df

KeyboardInterrupt: 

In [None]:
# Precision and recall of class 1 at every distinct score cut-off.
precision, recall, pr_thresholds = precision_recall_curve(y_val, val_proba)

# Scan an evenly spaced grid of 201 cut-offs on [0, 1] and score each by F1.
thresholds = np.linspace(0.0, 1.0, 201)
f1_per_threshold = np.array([
    f1_score(y_val, (val_proba >= cutoff).astype(int))
    for cutoff in thresholds
])

# argmax returns the first maximum, i.e. the lowest cut-off among ties.
best_idx = int(np.argmax(f1_per_threshold))
best_threshold = thresholds[best_idx]
best_f1 = f1_per_threshold[best_idx]

# ===== Plot =====
fig, ax = plt.subplots(figsize=(7, 5))

# precision/recall carry one more point than pr_thresholds; drop the last
# point so the curves line up with the threshold axis.
ax.plot(pr_thresholds, recall[:-1], label='Recall (class 1)')
ax.plot(pr_thresholds, precision[:-1], label='Precision (class 1)')

# Reference lines: the default 0.5 cut-off and the F1-optimal one.
ax.axvline(0.5, linestyle='--', label='Threshold = 0.5')
ax.axvline(best_threshold, linestyle=':', label=f'Best F1 threshold = {best_threshold:.2f}')

ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision / Recall vs Threshold')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# ===== Result =====
print(f"Best threshold (by F1): {best_threshold:.3f}")
print(f"Best F1-score: {best_f1:.3f}")

In [None]:
# Turn validation probabilities into hard 0/1 labels at the F1-optimal cut-off.
threshold = best_threshold
is_positive = val_proba >= threshold
y_pred = is_positive.astype(int)

# Compute the three summaries first, then print them in a fixed order.
val_accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("Confusion matrix:\n", val_confusion)
print("\nClassification report:\n", val_report)

In [None]:
# Final fit: use every labelled row (no hold-out, no eval_set).
y_full = train['target']
X_full = train.drop(columns=['id', 'target'])

# Object-dtype columns are passed to CatBoost as categorical features.
cat_features = X_full.select_dtypes(include='object').columns.tolist()

# Hyperparameters gathered in one place so they are easy to compare and tweak.
final_params = dict(
    iterations=1200,          # 1000-1200 are both reasonable choices
    depth=8,
    learning_rate=0.05,
    l2_leaf_reg=3,            # default is fine; 5-10 could be tried later
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=100,
)
final_model = CatBoostClassifier(**final_params)

final_model.fit(X_full, y_full, cat_features=cat_features)

In [None]:
# Test features: every column of the test frame except the row identifier.
X_test = test.drop(columns='id')

# predict_proba yields one column per class; column 1 is the positive class.
class_probabilities = final_model.predict_proba(X_test)
test_proba = class_probabilities[:, 1]

In [None]:
# Pair each test row id with its predicted positive-class probability.
submission_columns = {'id': test['id'], 'target': test_proba}
submission = pd.DataFrame(submission_columns)

# Write without the pandas index so the file contains only id and target.
submission.to_csv('submission.csv', index=False)
submission

In [None]:
# Fail fast instead of relying on a visual check: the submission must hold
# exactly one row per test row and two columns (id, target).
assert submission.shape == (len(test), 2), (
    f"unexpected submission shape {submission.shape}, expected {(len(test), 2)}"
)
submission.shape

In [None]:
# Range check on the predicted probabilities: enforce [0, 1] with an assertion
# instead of relying on eyeballing the displayed pair.
target_min, target_max = submission['target'].min(), submission['target'].max()
assert 0.0 <= target_min <= target_max <= 1.0, (
    f"target outside [0, 1]: min={target_min}, max={target_max}"
)
target_min, target_max

In [None]:
# Per-column missing-value counts; a submission with any NaN is invalid, so
# assert that the total is zero before displaying the counts.
na_counts = submission.isna().sum()
assert na_counts.sum() == 0, f"submission contains missing values:\n{na_counts}"
na_counts