In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import ndcg_score
from xgboost import XGBRanker

from tqdm import tqdm
import optuna

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('intern_task.csv')
data = data.drop(['feature_100', 'feature_72', 'feature_65', 'feature_64'], axis=1)
seed = 42
data = data.sort_values('query_id').reset_index(drop=True)
X = data.drop(['rank', 'query_id'], axis=1)
y = data['rank']
qid = data['query_id']

X_train, X_test = X.loc[:198159], X.loc[198160:]
y_train, y_test = y.loc[:198159], y.loc[198160:]
qid_train, qid_test = qid.loc[:198159], qid.loc[198160:]

X_train.shape[0] + X_test.shape[0] == X.shape[0]

True

In [3]:
def objective(trial):
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 0.1),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 0.1),
        "lambdarank_num_pair_per_sample": trial.suggest_int("lambdarank_num_pair_per_sample", 4, 10),
    }

    ranker = XGBRanker(**param, tree_method="hist", lambdarank_pair_method="topk", objective="rank:ndcg")
    ranker.fit(X_train, y_train, qid=qid_train)
    ndcg5 = ndcg_score(np.array([y_test]), np.array([ranker.predict(X_test)]), k=5)
    
    return ndcg5

In [4]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, timeout=1000)

[I 2024-04-29 23:53:25,561] A new study created in memory with name: no-name-69bf0bb9-19a9-400b-8063-e3942deb069c
[I 2024-04-29 23:53:47,252] Trial 0 finished with value: 0.8304198973631918 and parameters: {'n_estimators': 947, 'max_depth': 3, 'learning_rate': 0.026876505670976738, 'reg_alpha': 0.027153904223972264, 'reg_lambda': 0.015522210859609588, 'lambdarank_num_pair_per_sample': 8}. Best is trial 0 with value: 0.8304198973631918.
[I 2024-04-29 23:54:08,449] Trial 1 finished with value: 0.5595495524699013 and parameters: {'n_estimators': 561, 'max_depth': 6, 'learning_rate': 0.0017247730380876049, 'reg_alpha': 0.09456922839495915, 'reg_lambda': 0.06092077019334283, 'lambdarank_num_pair_per_sample': 8}. Best is trial 0 with value: 0.8304198973631918.
[I 2024-04-29 23:54:40,432] Trial 2 finished with value: 0.9576049743407978 and parameters: {'n_estimators': 703, 'max_depth': 7, 'learning_rate': 0.004407096817063037, 'reg_alpha': 0.06644545852369518, 'reg_lambda': 0.0278092720208028

In [5]:
print('Best hyperparameters:', study.best_params)
print('Best ndcg5:', study.best_value)

Best hyperparameters: {'n_estimators': 703, 'max_depth': 7, 'learning_rate': 0.004407096817063037, 'reg_alpha': 0.06644545852369518, 'reg_lambda': 0.02780927202080285, 'lambdarank_num_pair_per_sample': 4}
Best ndcg5: 0.9576049743407978
