# AdaRank Implementation

In [23]:
import math
import numpy as np
import sklearn
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.utils import check_X_y
from sklearn.datasets import load_svmlight_file

## Scoring

In [11]:
def group_offsets(arr):
    """Return a sequence of start/end offsets for the value subgroups in the input"""
    d = np.ones(arr.size, dtype=int)
    d[1:] = (arr[:-1] != arr[1:]).astype(int)
    idx = np.where(np.append(d, 1))[0]
    return zip(idx, idx[1:])


class Scorer(object):
    def __init__(self, score_func, **kwargs):
        self.score_func = score_func
        self.kwargs = kwargs

    def __call__(self, *args):
        return self.score_func(*args, **self.kwargs)


# DCG/nDCG (Normalized Discounted Cumulative Gain)
# https://en.wikipedia.org/wiki/Discounted_cumulative_gain

def _burges_dcg(y_true, y_pred, k=None):
    # order = np.argsort(y_pred)[::-1]
    order = np.argsort(-y_pred)
    y_true = np.take(y_true, order[:k])
    gain = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(gain)) + 2)
    return np.sum(gain / discounts)

def _dcg_score(y_true, y_pred, qid, k=None, dcg_func=None):
    assert dcg_func is not None
    y_true = np.maximum(y_true, 0)
    return np.array([dcg_func(y_true[a:b], y_pred[a:b], k=k) for a, b in group_offsets(qid)])

def _ndcg_score(y_true, y_pred, qid, k=None, dcg_func=None):
    assert dcg_func is not None
    y_true = np.maximum(y_true, 0)
    dcg = _dcg_score(y_true, y_pred, qid, k=k, dcg_func=dcg_func)
    idcg = np.array([dcg_func(np.sort(y_true[a:b]), np.arange(0, b - a), k=k)
                     for a, b in group_offsets(qid)])
    assert (dcg <= idcg).all()
    idcg[idcg == 0] = 1
    return dcg / idcg

def ndcg_score(y_true, y_pred, qid, k=None):
    dcg_func = _burges_dcg 
    return _ndcg_score(y_true, y_pred, qid, k=k, dcg_func=dcg_func)

class NDCGScorer(Scorer):
    def __init__(self, **kwargs):
        super(NDCGScorer, self).__init__(ndcg_score, **kwargs)


## AdaRank Class

In [98]:
class AdaRank(sklearn.base.BaseEstimator):
    """AdaRank algorithm"""

    def __init__(self, max_iter=500, tol=0.0001, estop=1, verbose=False, scorer=None):
        self.max_iter = max_iter
        self.tol = tol
        self.estop = estop
        self.verbose = verbose
        self.scorer = scorer

    def fit(self, X, y, qid, X_valid=None, y_valid=None, qid_valid=None):
        """Fit a model to the data"""
        X, y = check_X_y(X, y, 'csr')
        
        # if is already array dont convert
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        if X_valid is None:
            X_valid, y_valid, qid_valid = X, y, qid
        else:
            X_valid, y_valid = check_X_y(X_valid, y_valid, 'csr')
            X_valid = X_valid.toarray()

        n_queries = np.unique(qid).shape[0]
        weights = np.ones(n_queries, dtype=np.float64) / n_queries
        weak_rankers = []
        coef = np.zeros(X.shape[1])

        # use nDCG@10 as the default scorer
        if self.scorer is None:
            self.scorer = NDCGScorer(k=10)

        # precompute performance measurements for all weak rankers
        weak_ranker_score = []
        for j in range(X.shape[1]):
            pred = X[:, j].ravel()
            weak_ranker_score.append(self.scorer(y, pred, qid))
            
        best_perf_train = -np.inf
        best_perf_valid = -np.inf
        used_fids = []
        estop = None

        self.n_iter = 0
        while self.n_iter < self.max_iter:
            self.n_iter += 1

            best_weighted_average = -np.inf
            best_weak_ranker = None
            for fid, score in enumerate(weak_ranker_score):
                if fid in used_fids:
                    continue
                weighted_average = np.dot(weights, score)
                if weighted_average > best_weighted_average:
                    best_weak_ranker = {'fid': fid, 'score': score}
                    best_weighted_average = weighted_average

            # stop when all the weaker rankers are out
            if best_weak_ranker is None:
                break

            h = best_weak_ranker
            h['alpha'] = 0.5 * (math.log(np.dot(weights, 1 + h['score']) /
                                         np.dot(weights, 1 - h['score'])))
            weak_rankers.append(h)

            # update the ranker
            coef[h['fid']] += h['alpha']

            # if len(used_fids) > 5:
            #     used_fids.pop(0)
            # used_fids.append(h['fid'])

            # score both training and validation data
            score_train = self.scorer(y, np.dot(X, coef), qid)
            perf_train = score_train.mean()

            perf_valid = perf_train
            if X_valid is not X:
                perf_valid = self.scorer(y_valid, np.dot(X_valid, coef), qid_valid).mean()

            if self.verbose:
                print('{n_iter}\t{alpha}\t{fid}\t{score}\ttrain {train:.4f}\tvalid {valid:.4f}'.
                      format(n_iter=self.n_iter, alpha=h['alpha'], fid=h['fid'],
                             score=h['score'][:5], train=perf_train, valid=perf_valid),
                      file=sys.stderr)

            # update the best validation scores
            if perf_valid > best_perf_valid + self.tol:
                estop = 0
                best_perf_valid = perf_valid
                self.coef_ = coef.copy()
            else:
                estop += 1

            # update the best training score
            if perf_train > best_perf_train + self.tol:
                best_perf_train = perf_train
            else:
                # stop if scores on both sets fail to improve
                if estop >= self.estop:
                    break

            # update weights
            new_weights = np.exp(-score_train)
            weights = new_weights / new_weights.sum()

        return self

    def predict(self, X, qid):
        """Make predictions"""
        if not isinstance(X, np.ndarray):
            X = X.toarray()
        return np.dot(X, self.coef_)

## Test on MQ2007 dataset

In [89]:
X, y, qid = load_svmlight_file("datasets/mq2007/train.txt", query_id=True)
    
X_test, y_test, qid_test = load_svmlight_file("datasets/mq2007/test.txt", query_id=True)

In [30]:
# hyper params
k = 10
max_iter = 100
patience = 20

model = AdaRank(max_iter=max_iter,
                    estop=patience,
                    verbose=False,
                    scorer=NDCGScorer(k=k))

model.fit(X, y, qid)

predictions = model.predict(X_test, qid_test)
for k in (1, 2, 3, 4, 5, 10, 20):
        score = NDCGScorer(k=k)(y_test, predictions, qid_test).mean()
        print('nDCG@{}\t{}'.format(k, score))

nDCG@1	0.40773809523809523
nDCG@2	0.39950776144698336
nDCG@3	0.4079126410276921
nDCG@4	0.4069481096852681
nDCG@5	0.41673937966570745
nDCG@10	0.4502928537269599
nDCG@20	0.5065914762962426


## Test on LOINC original

In [107]:
def read_loinc(path ='datasets/loinc_features.xlsx', test_split=0.10):
    # read excel file
    df_loinc_q0 = pd.read_excel(path, sheet_name=0)
    df_loinc_q1 = pd.read_excel(path, sheet_name=1)
    df_loinc_q2 = pd.read_excel(path, sheet_name=2)

    y0 = df_loinc_q0["RANK"].values
    y1 = df_loinc_q0["RANK"].values
    y2 = df_loinc_q0["RANK"].values
    #concat y   
    y = np.concatenate((y0, y1, y2), axis=0)

    features_names = ["IDF", "BM25", "COISINE",	"JACCARD"] 
    df_loinc_q0 = df_loinc_q0[features_names].values
    df_loinc_q1 = df_loinc_q1[features_names].values
    df_loinc_q2 = df_loinc_q2[features_names].values
    # concat features
    X = np.concatenate((df_loinc_q0, df_loinc_q1, df_loinc_q2), axis=0)

    # prepare qid
    qid0 = np.full(len(df_loinc_q0), 0)
    qid1 = np.full(len(df_loinc_q1), 1)
    qid2 = np.full(len(df_loinc_q2), 2)
    # concat qids
    qid = np.concatenate((qid0, qid1, qid2), axis=0)
    
    # Get the unique 'qid' values
    unique_qid = np.unique(qid)

    # Initialize empty arrays for the training and testing sets
    X_train, X_test, y_train, y_test, qid_train, qid_test = [], [], [], [], [], []

    # Split the data based on 'qid'
    test_size = test_split # Adjust as needed
    for q in unique_qid:
        mask = qid == q  # Create a mask for the current 'qid'
        X_q = X[mask]
        y_q = y[mask]
        qid_q = qid[mask]

        X_train_q, X_test_q, y_train_q, y_test_q, qid_train_q, qid_test_q = train_test_split(X_q, y_q, qid_q, test_size=test_size, shuffle=True)

        X_train.append(X_train_q)
        X_test.append(X_test_q)
        y_train.append(y_train_q)
        y_test.append(y_test_q)
        qid_train.append(qid_train_q)
        qid_test.append(qid_test_q)

    # Concatenate the results to get the final splits
    X_train = np.concatenate(X_train)
    X_test = np.concatenate(X_test)
    y_train = np.concatenate(y_train)
    y_test = np.concatenate(y_test)
    qid_train = np.concatenate(qid_train)
    qid_test = np.concatenate(qid_test)
    
    return X_train, X_test, y_train, y_test, qid_train, qid_test


In [108]:
X_train, X_test, y_train, y_test, qid_train, qid_test  = read_loinc()

# hyper params
k = 10
max_iter = 100
patience = 20

model = AdaRank(max_iter=max_iter,
                    estop=patience,
                    verbose=True,
                    scorer=NDCGScorer(k=k))

model.fit(X, y, qid)

1	1.0704006194194722	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
2	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
3	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
4	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
5	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
6	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
7	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
8	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
9	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
10	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
11	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	valid 0.7896
12	0.8560085641270356	2	[1.         1.         0.36883632]	train 0.7896	va

In [109]:
predictions = model.predict(X_test, qid_test)
for k in (1, 2, 3, 4, 5, 10, 20):
        score = NDCGScorer(k=k)(y_test, predictions, qid_test).mean()
        print('nDCG@{}\t{}'.format(k, score))

nDCG@1	0.0
nDCG@2	0.0
nDCG@3	0.0
nDCG@4	0.1186132994310261
nDCG@5	0.15412787682487636
nDCG@10	0.29505782162709315
nDCG@20	0.29505782162709315


## Test LOINC extended

In [113]:
X_train, X_test, y_train, y_test, qid_train, qid_test = read_loinc('datasets/loinc_extended_features.xlsx')

# hyper params
k = 10
max_iter = 100
patience = 20

model = AdaRank(max_iter=max_iter,
                    estop=patience,
                    verbose=True,
                    scorer=NDCGScorer(k=k))

model.fit(X_train, y_train, qid_train)

1	1.09386531354197	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
2	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
3	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
4	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
5	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
6	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
7	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
8	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
9	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
10	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
11	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	valid 0.7983
12	0.8886608216911748	2	[1.         1.         0.39485376]	train 0.7983	vali

In [114]:
predictions = model.predict(X_test, qid_test)
for k in (1, 2, 3, 4, 5, 10, 20):
        score = NDCGScorer(k=k)(y_test, predictions, qid_test).mean()
        print('nDCG@{}\t{}'.format(k, score))

nDCG@1	0.6666666666666666
nDCG@2	0.6666666666666666
nDCG@3	0.6145245859974715
nDCG@4	0.5672485934660606
nDCG@5	0.5748744139302352
nDCG@10	0.711748502626464
nDCG@20	0.8048251338886652
