In [1]:
import pandas as pd

# Загрузка обучающей выборки
train_data = pd.read_csv("train_df.csv")
test_data = pd.read_csv('test_df.csv')

In [2]:
# Подсчет количества уникальных значений в целевом признаке
target_counts = train_data["target"].value_counts()

# Вывод уникальных значений и их количества
print("Уникальные значения в целевом признаке 'target':")
print(target_counts)


Уникальные значения в целевом признаке 'target':
target
0    14759
1      322
Name: count, dtype: int64


In [14]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ndcg_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Разделение данных на признаки и целевую переменную
X_train = train_data.drop(columns=["search_id", "target"])
y_train = train_data["target"]
X_test = test_data.drop(columns=["search_id", "target"])
y_test = test_data["target"]

scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)
#
# Создание объекта модели CatBoostClassifier
model = CatBoostClassifier()

# Задание сетки параметров для поиска по сетке
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'iterations': [100, 200, 300, 500, 1000]
}

# Initialize GridSearchCV with your model, parameter grid, and scoring metric
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', cv=3)

# Fit the model
grid_search.fit(X_train, y_train, eval_set=(X_test, y_test))

# Get the best parameters
best_params = grid_search.best_params_

# Get the best model
best_model = grid_search.best_estimator_

# Predict probabilities for the test set
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate ROC-AUC score on the test set
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate NDCG score on the test set
ndcg = ndcg_score([y_test], [y_pred_proba])

print("Best parameters:", best_params)
print("ROC-AUC score on test set:", roc_auc)
print("NDCG score on test set:", ndcg)

0:	learn: 0.6747575	test: 0.6749028	best: 0.6749028 (0)	total: 7.92ms	remaining: 785ms
1:	learn: 0.6571008	test: 0.6572539	best: 0.6572539 (1)	total: 15.2ms	remaining: 744ms
2:	learn: 0.6396914	test: 0.6398836	best: 0.6398836 (2)	total: 21.7ms	remaining: 703ms
3:	learn: 0.6226588	test: 0.6228437	best: 0.6228437 (3)	total: 27.6ms	remaining: 663ms
4:	learn: 0.6064217	test: 0.6067943	best: 0.6067943 (4)	total: 33.1ms	remaining: 628ms
5:	learn: 0.5907654	test: 0.5911227	best: 0.5911227 (5)	total: 38.9ms	remaining: 609ms
6:	learn: 0.5755060	test: 0.5761653	best: 0.5761653 (6)	total: 44.3ms	remaining: 589ms
7:	learn: 0.5607278	test: 0.5614272	best: 0.5614272 (7)	total: 49.7ms	remaining: 572ms
8:	learn: 0.5462827	test: 0.5470387	best: 0.5470387 (8)	total: 55.1ms	remaining: 557ms
9:	learn: 0.5324447	test: 0.5334088	best: 0.5334088 (9)	total: 60.6ms	remaining: 546ms
10:	learn: 0.5191331	test: 0.5200554	best: 0.5200554 (10)	total: 65.9ms	remaining: 533ms
11:	learn: 0.5060920	test: 0.5070877	best