In [92]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import TruncatedSVD
from catboost import CatBoostClassifier, Pool, cv
import optuna

In [99]:
# Задаем список возможных значений количества компонент
n_components = np.arange(81)
data = pd.read_csv('C:/Users/savel/Downloads/train_df.csv')

X = data.drop('target', axis=1)  # Признаки
y = data['target']  # Целевая переменная

explained_variance = []
for n in n_components:
    svd = TruncatedSVD(n_components=n)
    svd.fit(data)
    explained_variance.append(svd.explained_variance_ratio_.sum())

best_n_components = n_components[np.argmax(explained_variance)]
print("Best number of components:", best_n_components)

Best number of components: 77


In [100]:
data = pd.read_csv('C:/Users/savel/Downloads/train_df.csv')

X = data.drop('target', axis=1)  # Признаки
y = data['target']  # Целевая переменная

# Факторизация данных при помощи SVD
svd = TruncatedSVD(n_components = best_n_components)  # Выбираем количество компонентов
X_svd = svd.fit_transform(X)

# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X_svd, y, test_size=0.2, random_state=42, stratify = y)

In [101]:
train_dataset = Pool(data = X_train,
                    label = y_train,
                    cat_features = None)

test_dataset = Pool(data = X_test,
                    label = y_test,
                    cat_features = None)

In [102]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int( 'iterations', 300, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 2, 8),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature' : trial.suggest_float('bagging_temperature', 0.5, 1),
    }

    model = CatBoostClassifier(**params, verbose= False)
    model.fit(train_dataset)

    y_pred = model.predict(test_dataset)
    nd = ndcg_score([y_test], [y_pred], k=len(y_test))

    return nd

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
print("Best parameters:", best_params)

[I 2024-03-11 19:50:49,729] A new study created in memory with name: no-name-2816546d-077d-4c59-be26-4a4598413e08
[I 2024-03-11 19:51:44,745] Trial 0 finished with value: 0.4266745713217855 and parameters: {'iterations': 912, 'learning_rate': 0.2743126925234466, 'depth': 8, 'l2_leaf_reg': 2.9471873620861673, 'bagging_temperature': 0.5108258482955478}. Best is trial 0 with value: 0.4266745713217855.
[I 2024-03-11 19:51:48,590] Trial 1 finished with value: 0.42792101613363076 and parameters: {'iterations': 501, 'learning_rate': 0.038899535559388514, 'depth': 2, 'l2_leaf_reg': 7.417097340058828, 'bagging_temperature': 0.7041097913795791}. Best is trial 1 with value: 0.42792101613363076.
[I 2024-03-11 19:52:15,225] Trial 2 finished with value: 0.42792101613363076 and parameters: {'iterations': 966, 'learning_rate': 0.123424232413724, 'depth': 7, 'l2_leaf_reg': 2.8908426134065155, 'bagging_temperature': 0.6921981640983674}. Best is trial 1 with value: 0.42792101613363076.
[I 2024-03-11 19:5

Best parameters: {'iterations': 306, 'learning_rate': 0.26616116683189806, 'depth': 2, 'l2_leaf_reg': 7.253982216087021, 'bagging_temperature': 0.9107065628242669}


In [103]:
model = CatBoostClassifier(**best_params, verbose = False)
model.fit(train_dataset)


preds = model.predict(test_dataset)

test_data = pd.read_csv("C:/Users/savel/Downloads/test_df.csv")
y_td = test_data[test_data.columns[-1]].values
X = test_data[test_data.columns[:-1]].values

Xp = Pool(data = X,
         label = y_td,
         cat_features=None)

ptd = model.predict(Xp)
nd = ndcg_score([y_td], [ptd], k=len(y_td))
print(f"NDCG metric = {nd}")


NDCG metric = 0.38792243068552423
