# Learning to rank

In [1]:
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import skops.io as sio
import xgboost as xgb
from sklearn.metrics import (
    precision_score,
)

In [2]:
# Directory holding the parquet splits. Set the L2G_DATA_DIR environment
# variable to point somewhere else; the fallback is the original location.
DATA_DIR = Path(os.environ.get("L2G_DATA_DIR", "/Users/limdi/Downloads"))

trainval = pd.read_parquet(DATA_DIR / "train.parquet")
test = pd.read_parquet(DATA_DIR / "test.parquet")

GROUPCOL = "studyLocusId"
TARGET = "goldStandardSet"

# Columns that identify a row or carry the label; never used as model inputs.
NON_FEATURE_COLS = {GROUPCOL, "geneId", TARGET}

# Every remaining column whose name does not contain "Neighbourhood",
# sorted so the feature order is deterministic.
FEATURES = sorted(
    {col for col in trainval.columns if "Neighbourhood" not in col} - NON_FEATURE_COLS
)

# process_for_l2r indexes these columns on whichever frame it is given, so
# surface a schema mismatch between the two splits here rather than mid-run.
missing_in_test = (set(FEATURES) | {GROUPCOL, TARGET}) - set(test.columns)
assert not missing_in_test, f"test split is missing columns: {sorted(missing_in_test)}"

In [3]:
def process_for_l2r(data):
    """
    Turn a dataframe into the arrays used for XGBoost learning-to-rank.

    The rows are sorted by GROUPCOL so that each group is contiguous, then
    the FEATURES, TARGET and GROUPCOL columns are extracted together with
    the number of rows in each group.

    Returns a tuple of numpy arrays: (features, labels, group ids, group sizes).
    """
    # sort by group id and renumber the rows 0..n-1
    ordered = data.sort_values(by=GROUPCOL).reset_index(drop=True)
    ids = ordered[GROUPCOL]

    # np.unique reports counts in sorted order of the ids, i.e. the same
    # order in which the groups now appear in `ordered`
    _, counts = np.unique(ids, return_counts=True)

    return (
        np.array(ordered[FEATURES]),
        np.array(ordered[TARGET]),
        np.array(ids),
        np.array(counts.tolist()),
    )

In [4]:
def run_model(parameters, train_df, val_df=None, save=False, save_path=None, num_boost_round=150):
    """
    Train an XGBoost ranking model and optionally score it on a second dataframe.

    Parameters
    ----------
    parameters : dict
        Passed unchanged to ``xgb.train``.
    train_df : pandas.DataFrame
        Training rows; converted to arrays with ``process_for_l2r``.
    val_df : pandas.DataFrame, optional
        Evaluation rows. When given, the trained model scores them and the
        top-scoring row of each group is compared against the labels.
    save : bool, default False
        When true, the trained model is written to ``save_path`` with ``sio.dump``.
    save_path : str, optional
        Destination for the saved model. Required when ``save`` is true.
    num_boost_round : int, default 150
        Number of boosting rounds passed to ``xgb.train``. No evaluation set
        or early stopping is used, so exactly this many rounds are requested.

    Returns
    -------
    dict or None
        ``{"val_precision_top1": precision}`` when ``val_df`` is given,
        otherwise ``None``.

    Raises
    ------
    ValueError
        If ``save`` is true and ``save_path`` is ``None``. This is checked
        before any work is done so a training run is never thrown away.
    """
    # Fail fast: validating after training would waste the whole fit.
    if save and save_path is None:
        raise ValueError("Please provide a save path")

    print("Processing input dataframes.....")
    X_train, y_train, _, gsize_train = process_for_l2r(train_df)
    # creating the XGBoost DMatrix; set_group tells it how many consecutive
    # rows belong to each group
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtrain.set_group(gsize_train)

    dval = None
    if val_df is not None:
        X_val, y_val, _, gsize_val = process_for_l2r(val_df)
        dval = xgb.DMatrix(X_val, label=y_val)
        dval.set_group(gsize_val)

    print("Training the model.....")
    model = xgb.train(
        parameters,
        dtrain,
        num_boost_round=num_boost_round,
        verbose_eval=False,
    )

    if save:
        print("Saving the model.....")
        sio.dump(model, save_path)

    if dval is None:
        # nothing to evaluate against
        return None

    print("Predicting using trained model.....")
    y_pred = model.predict(dval)

    print("Converting predictions to top1 per credible set.....")
    # process_for_l2r returns rows sorted by group with sizes in that same
    # order (the layout set_group relies on), so each group is the contiguous
    # slice [start, end). Flag only the highest-scoring row of every group;
    # np.argmax keeps the first row on ties.
    sizes = np.asarray(gsize_val, dtype=int)
    ends = np.cumsum(sizes)
    starts = ends - sizes
    y_pred_top1 = np.zeros_like(y_pred)
    for start, end in zip(starts, ends):
        y_pred_top1[start + np.argmax(y_pred[start:end])] = 1

    # Exactly one row per group is flagged, so (assuming binary labels) this
    # is the fraction of groups whose top-ranked row is labelled positive.
    precision = precision_score(y_val, y_pred_top1)

    print("Complete!")
    return {"val_precision_top1": precision}

In [5]:
# Objective, evaluation metric, tree method and verbosity.
base_params = dict(
    objective="rank:map",
    eval_metric="map",
    tree_method="hist",
    verbosity=0,
)

# Start from a copy of the base settings and layer the chosen
# hyper-parameter values on top (presumably the result of an earlier
# tuning run -- not shown in this notebook).
best_params = dict(base_params)
best_params.update(
    eta=0.125,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    min_child_weight=5,
)

In [6]:
# Fit on the train/validation frame, score on the test frame, and write the
# fitted model to disk.
model_results = run_model(
    parameters=best_params,
    train_df=trainval,
    val_df=test,
    save=True,
    save_path="L2G-L2R_best_model.skops",
)
print(model_results)

Processing input dataframes.....
Training the model.....
Saving the model.....
Predicting using trained model.....
Converting predictions to top1 per credible set.....
Complete!
{'val_precision_top1': 0.7904929577464789}
