In [1]:
from pandas import DataFrame
import numpy as np
from math import log2
import random
from sklearn.tree import DecisionTreeRegressor as DT

In [2]:
def DCGScore(y_true, y_score, k=10, gains="exponential"):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)

def NDCGScore(y_true, y_score, k=10, gains="exponential"):
    best = DCGScore(y_true, y_true, k, gains)
    actual = DCGScore(y_true, y_score, k, gains)
    return actual / best

def deltaNDCGScore(y_true, y_score, k=10, gains="exponential"):
    max_DCG = DCGScore(y_true, y_true, k, gains)
    order[np.argsort(y_score)[::-1]] = range(1, y_score.shape[0] + 1)
    discounts = np.log2(order + 1)
    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")
    elem_DCG = gains / discounts
    _n = y_true.shape[0]
    matr_elem_DCG = np.tile(elem_DCG, (_n, 1)).T
    matr_swap_elem_DCG = gains.reshape((_n, 1)) / discounts.reshape((1, _n))

    lambda_mtr = - matr_elem_DCG - matr_elem_DCG.T + matr_swap_elem_DCG + matr_swap_elem_DCG.T
    no_null_swap = ((order <= k).reshape((_n, 1)) + (order <= k).reshape((1, _n))) > 0
    lambda_mtr = np.abs(lambda_mtr * no_null_swap)
    if max_DCG != 0:
        return lambda_mtr / max_DCG
    else:
        return lambda_mtr
    
def AverageNDCG(data, y_score, k):
    ndcg = 0
    for (indexs, y_true) in data:
        ndcg += NDCGScore(y_true, y_score[indexs], k)
    return ndcg / len(data)


def CountErrorPair(data, y_score):
    count_pair = 0
    for (indexs, y_true) in data:
        y_pred = y_score[indexs]
        _n = y_pred.shape[0]

        pairs = (y_pred.reshape((_n, 1)) - y_pred.reshape((1, _n))) <= 0
        true_pairs = (y_true.reshape((_n, 1)) - y_true.reshape((1, _n))) > 0
        count = sum(sum(pairs == true_pairs))
        count_pair += count
    return count_pair

In [3]:
def relNormalize2(rel):
    uniq_rel = np.unique(rel)
    uniq_rel = sorted(uniq_rel)
    norm_rel = np.empty(rel.shape)
    for i, val in enumerate(uniq_rel):
        norm_rel[rel==val] = i + 1
    return norm_rel * 5

In [4]:
class LambdaRank:
    def __init__(self, learning_rate, n_estimators, sigma, start_depth):
        self._learning_rate = learning_rate
        self._n_estimators = n_estimators
        self._sigma = sigma
        self._start_depth = start_depth
        self._trees = []

    def _createSet(self, DATA, queries, normalize):
        all_queries = DATA[:, -1]
        seq_x = []
        data = []
        last_index = 0
        for q in queries:
            X = DATA[all_queries == q, 1:-1]
            seq_x.append(X)
            data.append((range(last_index, last_index + X.shape[0]), normalize(DATA[all_queries == q, 0])))
            last_index += X.shape[0]
        return data, np.vstack(seq_x)

    def fit(self, DATA, persent_valid, normalize, T_NDCG):
        all_queries = DATA[:, -1]
        uniq_queries = np.unique(all_queries)
        #random.shuffle(uniq_queries)
        uniq_queries = uniq_queries

        count_valid = int(persent_valid * uniq_queries.shape[0])
        valid_queries = uniq_queries[:count_valid]
        train_queries = uniq_queries[count_valid:]

        data_train, X_train = self._createSet(DATA, train_queries, normalize)
        data_valid, X_valid = self._createSet(DATA, valid_queries, normalize)

        self._trees = []
        h_train = np.zeros(X_train.shape[0])
        h_valid = np.zeros(X_valid.shape[0])

        iteration = 0
        while True:
            grad = np.zeros(h_train.shape)
            for (indexs, y) in data_train:
                h = h_train[indexs]
                _n = h.shape[0]

                delta_h = h.reshape((_n, 1)) - h.reshape((1, _n)) 
                sign_h = np.sign(y.reshape((_n, 1)) - y.reshape((1, _n)))
                lambda_matr = self._sigma / (1 + np.exp(self._sigma * delta_h * sign_h))
                # * deltaNDCGScore(y, h, T_NDCG)
                lambda_vect = np.sum(sign_h * lambda_matr, axis=1)
                grad[indexs] = lambda_vect

            new_tree = DT(max_depth=self._start_depth)
            new_tree.fit(X_train, grad)
            self._trees.append(new_tree)

            h_train += self._learning_rate * new_tree.predict(X_train)
            h_valid += self._learning_rate * new_tree.predict(X_valid)

            print(iteration,
                  CountErrorPair(data_train, h_train),
                  CountErrorPair(data_valid, h_valid),
                  AverageNDCG(data_train, h_train, 5),
                  AverageNDCG(data_valid, h_valid, 5),
                  np.linalg.norm(grad))
            iteration += 1
            if (iteration == self._n_estimators):
                break

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for tree in self._trees:
            y_pred += self._learning_rate * tree.predict(X)
        return y_pred

In [24]:
class LambdaRank:
    def __init__(self, learning_rate, n_estimators, sigma, start_depth):
        self._learning_rate = learning_rate
        self._n_estimators = n_estimators
        self._sigma = sigma
        self._start_depth = start_depth
        self._trees = []

    def _createSet(self, DATA, queries, normalize):
        all_queries = DATA[:, -1]
        seq_x = []
        data = []
        last_index = 0
        for q in queries:
            X = DATA[all_queries == q, 1:-1]
            seq_x.append(X)
            data.append((range(last_index, last_index + X.shape[0]), normalize(DATA[all_queries == q, 0])))
            last_index += X.shape[0]
        return data, np.vstack(seq_x)

    def fit(self, DATA, persent_valid, normalize, T_NDCG, _trees):
        all_queries = DATA[:, -1]
        uniq_queries = np.unique(all_queries)
        #random.shuffle(uniq_queries)
        uniq_queries = uniq_queries

        count_valid = int(persent_valid * uniq_queries.shape[0])
        valid_queries = uniq_queries[:count_valid]
        train_queries = uniq_queries[count_valid:]

        data_train, X_train = self._createSet(DATA, train_queries, normalize)
        data_valid, X_valid = self._createSet(DATA, valid_queries, normalize)
        self._learning_rate = 0.01
        self._trees = _trees
        h_train = self.predict(X_train)
        h_valid = self.predict(X_valid)
        self._learning_rate = 0.003

        iteration = 0
        while True:
            grad = np.zeros(h_train.shape)
            for (indexs, y) in data_train:
                h = h_train[indexs]
                _n = h.shape[0]

                delta_h = h.reshape((_n, 1)) - h.reshape((1, _n)) 
                sign_h = np.sign(y.reshape((_n, 1)) - y.reshape((1, _n)))
                lambda_matr = self._sigma / (1 + np.exp(self._sigma * delta_h * sign_h))
                # * deltaNDCGScore(y, h, T_NDCG)
                lambda_vect = np.sum(sign_h * lambda_matr, axis=1)
                grad[indexs] = lambda_vect

            new_tree = DT(max_depth=self._start_depth)
            new_tree.fit(X_train, grad)
            self._trees.append(new_tree)

            h_train += self._learning_rate * new_tree.predict(X_train)
            h_valid += self._learning_rate * new_tree.predict(X_valid)

            print(iteration,
                  CountErrorPair(data_train, h_train),
                  CountErrorPair(data_valid, h_valid),
                  AverageNDCG(data_train, h_train, 5),
                  AverageNDCG(data_valid, h_valid, 5),
                  np.linalg.norm(grad))
            iteration += 1
            if (iteration == self._n_estimators):
                break

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for tree in self._trees:
            y_pred += self._learning_rate * tree.predict(X)
        return y_pred

In [5]:
def loadData():
    trainPath = "../data/train.data.cvs"
    return DataFrame.from_csv(trainPath, index_col=False).as_matrix()

def saveResults(queries, rels, namefile):
    uniq_queries = np.unique(queries)
    ans = np.empty((rels.shape[0], 2), dtype=np.int)
    count_last = 1
    for q in uniq_queries:
        rel = rels[queries == q]
        order = np.argsort(rel)[::-1] + count_last
        ans[queries == q, 0] = order
        ans[queries == q, 1] = q
        count_last += rel.shape[0]
    df = DataFrame(ans, columns=["DocumentId","QueryId"])
    df.to_csv(open(namefile, "w"), index=False)

In [6]:
rowData = loadData()

In [7]:
estimator = LambdaRank(learning_rate=0.01, n_estimators=300, sigma=1, start_depth=3)
estimator.fit(DATA=rowData, persent_valid=0.2, normalize=relNormalize2, T_NDCG=30)

0 34748276 6131362 0.246551634974 0.253776974292 36767.1527454
1 36988318 6605704 0.258614363416 0.265747155867 36402.6765257
2 38512066 6910127 0.262650877625 0.270965992202 36156.8932731
3 39233591 7051186 0.264880856598 0.277435292354 35954.97982
4 40293056 7359024 0.26587125548 0.278547334087 35800.3248507
5 40402499 7377574 0.266321941484 0.281148975062 35670.5634698
6 40548975 7407477 0.27135391871 0.28443150753 35544.6610747
7 40675535 7440957 0.271112547247 0.287488251254 35439.2053245
8 40764496 7473136 0.272581850684 0.289164138312 35349.5298186
9 40906059 7501762 0.274217600111 0.289880564394 35248.5832698
10 41215171 7598295 0.276236094404 0.290373558514 35176.1100441
11 41282093 7618288 0.277261869943 0.289637144334 35118.3851241
12 41270282 7616448 0.278340201675 0.291352060268 35044.9527311
13 41222404 7608474 0.279721724444 0.292691581547 34983.7732704
14 41338483 7632174 0.280439846979 0.294906387541 34921.0497576
15 41352620 7640352 0.283069862575 0.297708338905 34880

In [8]:
testRow  = DataFrame.from_csv("../data/testset.cvs", index_col=False).as_matrix()

In [9]:
ans = estimator.predict(testRow[:, 1:-1])
saveResults(testRow[:, -1], ans, "result13")