## Implement LambdaMART using `lightgbm`

In [1]:
!ls ../data/MSLR-WEB10K/Fold1/

test.txt  train.txt vali.txt


In [2]:
!tail -n1 ../data/MSLR-WEB10K/Fold1/train.txt

1 qid:29992 1:2 2:1 3:1 4:0 5:2 6:1 7:0.50000 8:0.50000 9:0 10:1 11:1066 12:3 13:11 14:6 15:1086 16:19.036284 17:28.261942 18:27.090498 19:29.47176 20:19.017721 21:43 22:1 23:2 24:0 25:46 26:1 27:0 28:0 29:0 30:1 31:42 32:1 33:2 34:0 35:45 36:21.50000 37:0.50000 38:1 39:0 40:23 41:420.25000 42:0.25000 43:1 44:0 45:484 46:0.040338 47:0.333333 48:0.181818 49:0 50:0.042357 51:0.000938 52:0 53:0 54:0 55:0.000921 56:0.03940 57:0.333333 58:0.181818 59:0 60:0.041436 61:0.020169 62:0.166667 63:0.090909 64:0 65:0.021179 66:0.00037 67:0.027778 68:0.008264 69:0 70:0.00041 71:390.814899 72:12.674474 73:26.489059 74:0 75:417.257951 76:9.968513 77:0 78:0 79:0 80:9.966807 81:380.846387 82:12.674474 83:26.489059 84:0 85:407.291145 86:195.40745 87:6.337237 88:13.244529 89:0 90:208.628976 91:34387.599318 92:40.160576 93:175.417555 94:0 95:39466.657481 96:1 97:0 98:0 99:0 100:1 101:0.692028 102:0.632181 103:0.691352 104:0 105:0.69019 106:38.177038 107:13.909696 108:17.007937 109:0 110:38.287432 111:-11.4

In [3]:
from glob import glob

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score

In [4]:
def format_data(type_):
    files = glob("../data/MSLR-WEB10K/Fold*/{}.txt".format(type_))
    labels = []
    features = []
    groups = []
    grp_cnt = 0
    for file in files:
        with open(file, "r") as fin:
            for i, line in enumerate(fin):
                label, qid, feats = line.split(" ", 2)
                labels.append(int(label))
                qid = int(qid.split(":")[-1])
                if i == 0:
                    last_qid = qid
                feats = [float(f.split(":")[-1]) for f in feats.strip().split(" ")]
                features.append(feats)
                if qid != last_qid:
                    last_qid = qid
                    groups.append(grp_cnt)
                    grp_cnt = 1
                else:
                    grp_cnt += 1
    groups.append(grp_cnt)
    return (np.array(features), np.array(labels), np.array(groups))

In [5]:
train_features, train_labels, train_groups = format_data("train")

In [6]:
len(train_features), len(train_labels), sum(train_groups)

(3600576, 3600576, 3600576)

In [7]:
train_groups

array([103,  76, 102, ...,  79, 180,  40])

In [8]:
test_features, test_labels, test_groups = format_data("test")

In [9]:
len(test_features), len(test_labels), sum(test_groups)

(1200192, 1200192, 1200192)

In [10]:
train_data = lgb.Dataset(data=train_features, label=train_labels, group=train_groups)
test_data = lgb.Dataset(data=test_features, label=test_labels, group=test_groups)

In [11]:
import optuna
from optuna.integration import LightGBMPruningCallback

In [12]:
def objective(trial, data):
    params = {
        "task": "train",
        "objective": "lambdarank",
        "metric": "ndcg",
        "ndcg_eval_at": [10, 5, 1],
        "num_threads": 4,
        "feature_pre_filter": False,
        "reg_alpha": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.1, 1),
        #'subsample': trial.suggest_uniform('subsample', 1e-8, 1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "min_sum_hessian_in_leaf": trial.suggest_loguniform(
            "min_sum_hessian_in_leaf", 1e-8, 10.0
        ),
        "learning_rate": trial.suggest_float("learning_rate", 1e-1, 3e-1),
    }

    cv_results = lgb.cv(
        params,
        data,
        num_boost_round=100,
        nfold=3,
        callbacks=[
            lgb.log_evaluation(10),
            lgb.early_stopping(
                stopping_rounds=5, first_metric_only=True, verbose=False
            ),
        ],
    )

    return cv_results["ndcg@10-mean"][-1]


study = optuna.create_study(direction="maximize", study_name="LGBM Ranking")
func = lambda trial: objective(trial, train_data)
study.optimize(func, n_trials=5)

[32m[I 2022-07-27 17:40:22,133][0m A new study created in memory with name: LGBM Ranking[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
[10]	cv_agg's ndcg@1: 0.435929 + 0.0049865	cv_agg's ndcg@5: 0.420043 + 0.00255925	cv_agg's ndcg@10: 0.436062 + 0.00305421
[20]	cv_agg's ndcg@1: 0.456227 + 0.00200612	cv_agg's ndcg@5: 0.437426 + 0.00235658	cv_agg's ndcg@10: 0.456776 + 0.00195

[32m[I 2022-07-27 17:41:58,961][0m Trial 0 finished with value: 0.4918665254521892 and parameters: {'lambda_l1': 9.242279921881652e-05, 'lambda_l2': 8.990030609011882e-07, 'max_depth': 3, 'num_leaves': 116, 'colsample_bytree': 0.9933168261312348, 'min_data_in_leaf': 50, 'min_sum_hessian_in_leaf': 1.1432209445503182e-08, 'learning_rate': 0.12727093934945977}. Best is trial 0 with value: 0.4918665254521892.[0m


[100]	cv_agg's ndcg@1: 0.487506 + 0.00156281	cv_agg's ndcg@5: 0.47514 + 0.00132978	cv_agg's ndcg@10: 0.492069 + 0.000813218
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
[10]	cv_agg's ndcg@1: 0.488324 + 0.00059368	cv_agg's ndcg@5: 0.472717 + 0.00166551	cv_agg's ndcg@10: 0.488881 + 0.001

[32m[I 2022-07-27 17:44:01,992][0m Trial 1 finished with value: 0.5559764500341148 and parameters: {'lambda_l1': 3.3086090474282157e-06, 'lambda_l2': 0.0012514954463073416, 'max_depth': 8, 'num_leaves': 60, 'colsample_bytree': 0.9081227135632017, 'min_data_in_leaf': 23, 'min_sum_hessian_in_leaf': 1.9175765067602226e-08, 'learning_rate': 0.19184245195083788}. Best is trial 1 with value: 0.5559764500341148.[0m


[100]	cv_agg's ndcg@1: 0.590232 + 0.00366504	cv_agg's ndcg@5: 0.547745 + 0.00157108	cv_agg's ndcg@10: 0.555976 + 0.00146979
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
[10]	cv_agg's ndcg@1: 0.472873 + 0.00115681	cv_agg's ndcg@5: 0.459403 + 0.00130689	cv_agg's ndcg@10: 0.47567 + 0.0009

[32m[I 2022-07-27 17:46:06,665][0m Trial 2 finished with value: 0.5298875255820453 and parameters: {'lambda_l1': 3.4017903262411844e-07, 'lambda_l2': 1.5910388785227304e-05, 'max_depth': 5, 'num_leaves': 92, 'colsample_bytree': 0.31115856510796, 'min_data_in_leaf': 48, 'min_sum_hessian_in_leaf': 8.209419003059314e-08, 'learning_rate': 0.2812414978041621}. Best is trial 1 with value: 0.5559764500341148.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
[10]	cv_agg's ndcg@1: 0.461497 + 0.000865682	cv_agg's ndcg@5: 0.446857 + 0.00201199	cv_agg's ndcg@10: 0.46558 + 0.00191351
[20]	cv_agg's ndcg@1: 0.480714 + 0.00137837	cv_agg's ndcg@5: 0.468175 + 0.00274396	cv_agg's ndcg@10: 0.485373 + 0.0024

[32m[I 2022-07-27 17:47:40,934][0m Trial 3 finished with value: 0.5164140423288628 and parameters: {'lambda_l1': 0.07328027219500806, 'lambda_l2': 5.9021869473089795e-05, 'max_depth': 4, 'num_leaves': 23, 'colsample_bytree': 0.9941364862172039, 'min_data_in_leaf': 73, 'min_sum_hessian_in_leaf': 0.0006470352505149061, 'learning_rate': 0.24497698537726156}. Best is trial 1 with value: 0.5559764500341148.[0m


[100]	cv_agg's ndcg@1: 0.521445 + 0.000671233	cv_agg's ndcg@5: 0.502367 + 0.001405	cv_agg's ndcg@10: 0.516414 + 0.00132609
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 2400384, number of used features: 136
[10]	cv_agg's ndcg@1: 0.504964 + 0.00457768	cv_agg's ndcg@5: 0.487492 + 0.00270949	cv_agg's ndcg@10: 0.501149 + 0.0032

[32m[I 2022-07-27 17:50:11,789][0m Trial 4 finished with value: 0.5827377295524029 and parameters: {'lambda_l1': 0.00010123623725183876, 'lambda_l2': 0.0011440172917691488, 'max_depth': 8, 'num_leaves': 91, 'colsample_bytree': 0.5381971793910936, 'min_data_in_leaf': 13, 'min_sum_hessian_in_leaf': 2.831397169129265, 'learning_rate': 0.2856041327762513}. Best is trial 4 with value: 0.5827377295524029.[0m


[100]	cv_agg's ndcg@1: 0.634191 + 0.00252669	cv_agg's ndcg@5: 0.57961 + 0.00277169	cv_agg's ndcg@10: 0.582738 + 0.00336254


In [14]:
print(f"\tBest value: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value: 0.58274
	Best params:
		lambda_l1: 0.00010123623725183876
		lambda_l2: 0.0011440172917691488
		max_depth: 8
		num_leaves: 91
		colsample_bytree: 0.5381971793910936
		min_data_in_leaf: 13
		min_sum_hessian_in_leaf: 2.831397169129265
		learning_rate: 0.2856041327762513


In [28]:
train_data = lgb.Dataset(data=train_features, label=train_labels, group=train_groups)
test_data = lgb.Dataset(data=test_features, label=test_labels, group=test_groups)

In [30]:
param

In [31]:
param = {
    "task": "train",
    "num_leaves": 255,
    "min_data_in_leaf": 1,
    "min_sum_hessian_in_leaf": 100,
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [1, 3, 5, 10],
    "learning_rate": 0.1,
    "num_threads": 2,
}
param.update(study.best_params)

res = {}
bst = lgb.train(
    param,
    train_data,
    valid_sets=[test_data],
    valid_names=["valid"],
    num_boost_round=1000,
    callbacks=[
        lgb.record_evaluation(res),
        lgb.log_evaluation(100),
        lgb.early_stopping(stopping_rounds=5),
    ],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25566
[LightGBM] [Info] Number of data points in the train set: 3600576, number of used features: 136
Training until validation scores don't improve for 5 rounds
[100]	valid's ndcg@1: 0.683979	valid's ndcg@3: 0.629224	valid's ndcg@5: 0.616542	valid's ndcg@10: 0.614282
[200]	valid's ndcg@1: 0.764118	valid's ndcg@3: 0.703417	valid's ndcg@5: 0.685055	valid's ndcg@10: 0.674181
[300]	valid's ndcg@1: 0.810266	valid's ndcg@3: 0.755541	valid's ndcg@5: 0.735415	valid's ndcg@10: 0.719593
[400]	valid's ndcg@1: 0.842741	valid's ndcg@3: 0.795866	valid's ndcg@5: 0.775361	valid's ndcg@10: 0.756746
[500]	valid's ndcg@1: 0.863954	valid's ndcg@3: 0.821581	valid's ndcg@5: 0.802156	valid's ndcg@10: 0.783146
[600]	valid's ndcg@1: 0.878893	valid's ndcg@3: 0.84144	valid's ndcg@5: 0.824517	valid's ndcg@10: 0.805024
[700]	valid's ndcg@1: 0.892397	valid's ndcg@3:

In [32]:
pd.DataFrame(res["valid"]).tail()

Unnamed: 0,ndcg@1,ndcg@3,ndcg@5,ndcg@10
946,0.9179,0.893143,0.881308,0.864731
947,0.917756,0.893326,0.881469,0.864885
948,0.917885,0.893529,0.881725,0.865137
949,0.917929,0.893526,0.881778,0.865252
950,0.917929,0.893552,0.881845,0.865298


In [33]:
!tail -n3 ../data/MSLR-WEB10K/Fold1/vali.txt

1 qid:29995 1:1 2:0 3:0 4:0 5:1 6:0.50000 7:0 8:0 9:0 10:0.50000 11:439 12:0 13:19 14:12 15:470 16:27.896604 17:32.206631 18:31.723366 19:31.997173 20:27.894259 21:1 22:0 23:0 24:0 25:1 26:0 27:0 28:0 29:0 30:0 31:1 32:0 33:0 34:0 35:1 36:0.50000 37:0 38:0 39:0 40:0.50000 41:0.25000 42:0 43:0 44:0 45:0.25000 46:0.002278 47:0 48:0 49:0 50:0.002128 51:0 52:0 53:0 54:0 55:0 56:0.002278 57:0 58:0 59:0 60:0.002128 61:0.001139 62:0 63:0 64:0 65:0.001064 66:0.000001 67:0 68:0 69:0 70:0.000001 71:9.475923 72:0 73:0 74:0 75:9.473578 76:0 77:0 78:0 79:0 80:0 81:9.475923 82:0 83:0 84:0 85:9.473578 86:4.737962 87:0 88:0 89:0 90:4.736789 91:22.448281 92:0 93:0 94:0 95:22.437171 96:0 97:0 98:0 99:0 100:0 101:0.471409 102:0 103:0 104:0 105:0.471318 106:12.985653 107:0 108:0 109:0 110:12.740497 111:-7.28838 112:-2.813407 113:-3.768915 114:-3.976556 115:-7.35661 116:-7.799037 117:-7.60278 118:-7.606904 119:-7.608572 120:-7.811665 121:-6.189852 122:-1.714798 123:-2.67031 124:-2.877949 125:-6.258085 126:

In [34]:
vali_features, vali_labels, vali_groups = format_data("vali")

In [35]:
vali_groups = [[i] * g for i, g in enumerate(vali_groups)]
vali_groups = [i for g in vali_groups for i in g]

In [36]:
predictions = bst.predict(vali_features)

In [37]:
predict_df = pd.DataFrame(
    {
        "query_id": vali_groups,
        "true_label": vali_labels,
        "predicted": predictions,
    }
)

In [38]:
predict_df.head()

Unnamed: 0,query_id,true_label,predicted
0,0,2,-1.224149
1,0,1,-1.374357
2,0,3,-2.932686
3,0,1,-1.322733
4,0,0,-1.653431


In [41]:
ndcg_score(
    predict_df[["true_label", "query_id"]], predict_df[["predicted", "query_id"]], k=10
)

0.999940961949751