In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, f1_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

In [2]:
train = pd.read_csv('contest_train.csv')
test = pd.read_csv('contest_test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
train.shape, test.shape

((210000, 32), (90000, 31))

In [4]:
X = train.drop(columns=['id', 'target'])
y = train['target']

In [5]:
cat_features = X.select_dtypes(include='object').columns.tolist()
len(cat_features)

19

In [6]:
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [7]:
# Data
X_full = train.drop(columns=['id', 'target'])
y_full = train['target']
X_test = test.drop(columns=['id'])

cat_features = X_full.select_dtypes(include='object').columns.tolist()


seeds = [42, 2024, 777, 13, 99]
test_probas = []

for seed in seeds:
    model = CatBoostClassifier(
        iterations=1200,
        depth=8,
        learning_rate=0.05,
        l2_leaf_reg=3,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=seed,
        verbose=0
    )

    model.fit(
        X_full,
        y_full,
        cat_features=cat_features
    )

    test_probas.append(
        model.predict_proba(X_test)[:, 1]
    )

In [12]:
test_proba_ensemble = np.mean(test_probas, axis=0)

In [13]:
test_proba_ensemble.shape   # должно быть (len(test),)

(90000,)

In [14]:
submission = pd.DataFrame({
    'id': test['id'],
    'target': np.round(test_proba_ensemble, 1)
})

submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,0,0.0
1,1,0.1
2,2,0.7
3,3,0.0
4,4,0.3


In [15]:
submission.shape

(90000, 2)

In [16]:
submission['target'].min(), submission['target'].max()

(np.float64(0.0), np.float64(1.0))

In [17]:
submission.isna().sum()

id        0
target    0
dtype: int64