In [1]:
from sklearn.datasets import make_classification
import numpy as np

import xgboost as xgb

# Build a small synthetic dataset to demonstrate ranking.
seed = 1994
rng = np.random.default_rng(seed)
X, y = make_classification(random_state=seed)

# Randomly assign every sample to one of a handful of query groups.
n_query_groups = 3
qid = rng.integers(0, n_query_groups, size=X.shape[0])

# Reorder features, labels and query ids together so that rows belonging to
# the same query end up next to each other.
sorted_idx = qid.argsort()
X, y, qid = X[sorted_idx], y[sorted_idx], qid[sorted_idx]

In [2]:
# Show the feature matrix after its rows were reordered by query id.
print(X)

[[-0.43894112 -0.51836695 -0.41465193 ...  0.18095025  1.12992152
  -1.04023331]
 [ 0.19813072 -3.0744766   0.93392721 ...  0.25081308 -0.22527108
  -0.12086963]
 [-0.62152966  0.73251596  0.26901922 ... -0.33976766 -0.495313
   0.18567782]
 ...
 [ 0.81527591 -0.66816723  1.70907199 ...  0.33829174 -0.4691835
  -1.51036055]
 [ 0.21670866 -0.46333069 -0.56772849 ...  0.99016322 -0.39389604
   0.34042155]
 [ 0.13390775  1.42493036 -0.0037121  ...  1.25748579 -0.22658676
   0.14831688]]


In [3]:
# Show the labels first, then the query ids (sorted, so each group is contiguous).
print(y, qid, sep="\n")

[1 0 1 1 0 1 0 1 1 0 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 0 0 0 1 0 1 0 0 1 1 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0
 0 1 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0 1 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [4]:
# Ranker trained with the rank:ndcg objective, using the "topk" pair method
# and 8 pairs per sample.
# NOTE(review): the recorded output warns that the two lambdarank_* parameters
# "might not be used" -- presumably the installed XGBoost predates them;
# confirm the library version before relying on these settings.
ranker = xgb.XGBRanker(
    tree_method="hist",
    lambdarank_num_pair_per_sample=8,
    objective="rank:ndcg",
    lambdarank_pair_method="topk",
)
ranker.fit(X, y, qid=qid)

Parameters: { "lambdarank_num_pair_per_sample", "lambdarank_pair_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', lambdarank_num_pair_per_sample=8,
          lambdarank_pair_method='topk', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=1, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1,
          objective='rank:ndcg', predictor='auto', ...)

In [5]:
import numpy as np

def ndcg_score(y_true, y_pred, k=None):
    """Compute the normalized discounted cumulative gain (NDCG) of a ranking.

    Items are ranked by descending ``y_pred``. The item at 0-based position
    ``i`` contributes a gain of ``2 ** relevance - 1`` divided by
    ``log2(i + 2)``. The resulting DCG is divided by the ideal DCG, i.e. the
    DCG obtained when the items are ranked by descending ``y_true``.

    Parameters
    ----------
    y_true : array-like
        True relevance label of each item.
    y_pred : array-like
        Predicted score of each item; must have as many entries as ``y_true``.
    k : int or None, default=None
        Only the top ``k`` ranked items are scored. ``None`` scores all items.

    Returns
    -------
    float
        ``DCG / IDCG``, or ``0.0`` when the ideal DCG is zero (for example
        when no item is relevant or the input is empty).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of items.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # A shorter y_pred would silently score only part of y_true against an
    # ideal DCG built from all of it, so reject mismatched inputs up front.
    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of items, got "
            f"{y_true.size} and {y_pred.size}"
        )

    def dcg_at_k(relevance, ranking_scores):
        # Rank items by descending score and keep the top k (all when k is None).
        order = np.argsort(ranking_scores)[::-1]
        top = np.take(relevance, order[:k])
        gains = 2 ** top - 1
        # Position i (0-based) is discounted by log2(i + 2).
        discounts = np.log2(np.arange(len(top)) + 2)
        return np.sum(gains / discounts)

    dcg = dcg_at_k(y_true, y_pred)
    # The ideal DCG is the DCG of the labels ranked by themselves.
    idcg = dcg_at_k(y_true, y_true)
    if idcg == 0:
        return 0.0
    return float(dcg / idcg)

In [6]:
import pandas as pd
from sklearn.metrics import make_scorer
# Feature frame with string column names, plus the query id of every row.
df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
# qid was already reordered together with X and y, so it lines up with the
# rows as-is; indexing it with sorted_idx a second time would permute the ids
# away from their rows (and would depend on whatever sorted_idx holds now).
df["qid"] = qid
# Refit on the raw arrays; the query ids are still passed explicitly via qid=.
ranker.fit(X, y, qid=qid)

# Wrap the NDCG metric with make_scorer so it can be handed to scikit-learn
# utilities that expect a scorer object.
ndcg_scorer = make_scorer(ndcg_score)


Parameters: { "lambdarank_num_pair_per_sample", "lambdarank_pair_method" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [7]:
# Predict one relevance score per row, then arrange the scores from the
# highest (most relevant) to the lowest.
# NOTE(review): this rebinds sorted_idx, which an earlier cell used for the
# query-id ordering -- re-running earlier cells afterwards will see this value.
scores = ranker.predict(X)
sorted_idx = np.flip(np.argsort(scores))
scores = np.take(scores, sorted_idx)

print(scores)

[ 1.3469949   1.3469949   1.3469949   1.3469949   1.3469949   1.3469949
  1.3469949   1.3469949   1.3469949   1.3469949   1.3469949   1.3469949
  1.3469949   1.3469949   1.3469949   1.3469949   1.3469949   1.3469949
  1.3469949   1.3469949   1.3469949   1.3469949   1.3469949   1.3469949
  1.3469949   1.3469949   1.3469949   1.3469949   1.3469949   1.3469949
  1.3469949   1.3469949   1.3469949   1.3469949   1.3469949   1.3469949
  1.3469949   1.3469949   1.1674438   1.0147219   0.585591    0.585591
  0.585591    0.585591    0.585591    0.585591   -0.13086998 -0.13086998
 -0.13086998 -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264
 -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264
 -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264
 -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264
 -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264
 -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.3511264  -0.