In [1]:
from pandas import DataFrame
import numpy as np
from math import log2
import random
from sklearn.tree import DecisionTreeRegressor as DT

In [2]:
def DCGScore(y_true, y_score, k=10, gains="exponential"):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)

def NDCGScore(y_true, y_score, k=10, gains="exponential"):
    best = DCGScore(y_true, y_true, k, gains)
    actual = DCGScore(y_true, y_score, k, gains)
    return actual / best

def deltaNDCGScore(y_true, y_score, k=10, gains="exponential"):
    max_DCG = DCGScore(y_true, y_true, k, gains)
    order[np.argsort(y_score)[::-1]] = range(1, y_score.shape[0] + 1)
    discounts = np.log2(order + 1)
    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")
    elem_DCG = gains / discounts
    _n = y_true.shape[0]
    matr_elem_DCG = np.tile(elem_DCG, (_n, 1)).T
    matr_swap_elem_DCG = gains.reshape((_n, 1)) / discounts.reshape((1, _n))

    lambda_mtr = - matr_elem_DCG - matr_elem_DCG.T + matr_swap_elem_DCG + matr_swap_elem_DCG.T
    no_null_swap = ((order <= k).reshape((_n, 1)) + (order <= k).reshape((1, _n))) > 0
    lambda_mtr = np.abs(lambda_mtr * no_null_swap)
    if max_DCG != 0:
        return lambda_mtr / max_DCG
    else:
        return lambda_mtr
    
def AverageNDCG(data, y_score, k):
    ndcg = 0
    for (indexs, y_true) in data:
        ndcg += NDCGScore(y_true, y_score[indexs], k)
    return ndcg / len(data)


def CountErrorPair(data, y_score):
    count_pair = 0
    for (indexs, y_true) in data:
        y_pred = y_score[indexs]
        _n = y_pred.shape[0]

        pairs = (y_pred.reshape((_n, 1)) - y_pred.reshape((1, _n))) <= 0
        true_pairs = (y_true.reshape((_n, 1)) - y_true.reshape((1, _n))) > 0
        count = sum(sum(pairs == true_pairs))
        count_pair += count
    return count_pair

In [3]:
def relNormalize2(rel):
    uniq_rel = np.unique(rel)
    uniq_rel = sorted(uniq_rel)
    norm_rel = np.empty(rel.shape)
    for i, val in enumerate(uniq_rel):
        norm_rel[rel==val] = i + 1
    return norm_rel * 5

In [11]:
class LambdaRank:
    def __init__(self, learning_rate, n_estimators, sigma, start_depth):
        self._learning_rate = learning_rate
        self._n_estimators = n_estimators
        self._sigma = sigma
        self._start_depth = start_depth
        self._trees = []

    def _createSet(self, DATA, queries, normalize):
        all_queries = DATA[:, -1]
        seq_x = []
        data = []
        last_index = 0
        for q in queries:
            X = DATA[all_queries == q, 1:-1]
            seq_x.append(X)
            data.append((range(last_index, last_index + X.shape[0]), normalize(DATA[all_queries == q, 0])))
            last_index += X.shape[0]
        return data, np.vstack(seq_x)

    def fit(self, DATA, persent_valid, normalize, T_NDCG):
        all_queries = DATA[:, -1]
        uniq_queries = np.unique(all_queries)
        #random.shuffle(uniq_queries)
        uniq_queries = uniq_queries[:3600]

        count_valid = int(persent_valid * uniq_queries.shape[0])
        valid_queries = uniq_queries[:count_valid]
        train_queries = uniq_queries[count_valid:]

        data_train, X_train = self._createSet(DATA, train_queries, normalize)
        data_valid, X_valid = self._createSet(DATA, valid_queries, normalize)

        self._trees = []
        h_train = np.zeros(X_train.shape[0])
        h_valid = np.zeros(X_valid.shape[0])

        iteration = 0
        while True:
            grad = np.zeros(h_train.shape)
            for (indexs, y) in data_train:
                h = h_train[indexs]
                _n = h.shape[0]

                delta_h = h.reshape((_n, 1)) - h.reshape((1, _n)) 
                sign_h = np.sign(y.reshape((_n, 1)) - y.reshape((1, _n)))
                lambda_matr = self._sigma / (1 + np.exp(self._sigma * delta_h * sign_h))
                # * deltaNDCGScore(y, h, T_NDCG)
                lambda_vect = np.sum(sign_h * lambda_matr, axis=1)
                grad[indexs] = lambda_vect

            new_tree = DT(max_depth=self._start_depth)
            new_tree.fit(X_train, grad)
            self._trees.append(new_tree)

            h_train += self._learning_rate * new_tree.predict(X_train)
            h_valid += self._learning_rate * new_tree.predict(X_valid)

            print(iteration,
                  CountErrorPair(data_train, h_train),
                  CountErrorPair(data_valid, h_valid),
                  AverageNDCG(data_train, h_train, 5),
                  AverageNDCG(data_valid, h_valid, 5),
                  np.linalg.norm(grad))
            iteration += 1
            if (iteration == self._n_estimators):
                break

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for tree in self._trees:
            y_pred += self._learning_rate * tree.predict(X)
        return y_pred

In [5]:
def loadData():
    trainPath = "../data/train.data.cvs"
    return DataFrame.from_csv(trainPath, index_col=False).as_matrix()

def saveResults(queries, rels, namefile):
    uniq_queries = np.unique(queries)
    ans = np.empty((rels.shape[0], 2), dtype=np.int)
    count_last = 1
    for q in uniq_queries:
        rel = rels[queries == q]
        order = np.argsort(rel)[::-1] + count_last
        ans[queries == q, 0] = order
        ans[queries == q, 1] = q
        count_last += rel.shape[0]
    df = DataFrame(ans, columns=["DocumentId","QueryId"])
    df.to_csv(open(namefile, "w"), index=False)

In [6]:
rowData = loadData()

In [12]:
estimator = LambdaRank(learning_rate=0.05, n_estimators=100, sigma=1, start_depth=3)
estimator.fit(DATA=rowData, persent_valid=0.2, normalize=relNormalize2, T_NDCG=30)

0 10997352 2182105 0.249367039373 0.246395632003 20209.0698821
1 11664915 2312115 0.265432862746 0.26698911086 19053.2349635
2 12394560 2458194 0.268789254718 0.28040597538 18813.390141
3 12585404 2497122 0.280587627993 0.277536595867 18496.4924065
4 12658207 2516386 0.284968207532 0.284517094119 18302.1187058
5 12711083 2529194 0.286785989516 0.281893926111 18145.9900913
6 12693597 2529776 0.293618622286 0.286927268467 18045.4828133
7 12684967 2531140 0.290455866698 0.287102920465 17927.4890443
8 12672461 2532677 0.285057188744 0.285201978363 17850.3533104
9 12657253 2537122 0.286728558377 0.28812375324 17780.7992087
10 12619378 2534493 0.290542474936 0.286968484203 17717.3581174
11 12603836 2534747 0.291962815283 0.286976667234 17598.034835
12 12579357 2536172 0.292457529498 0.289767642673 17557.4507905
13 12557774 2534211 0.293321024296 0.291106643945 17476.3764007
14 12524696 2539642 0.294241384439 0.283990499296 17417.9410062
15 12487399 2538790 0.297469259399 0.285386138659 17327

KeyboardInterrupt: 

In [13]:
testRow  = DataFrame.from_csv("../data/testset.cvs", index_col=False).as_matrix()

In [14]:
ans = np.zeros(testRow.shape[0])
_trees = estimator._trees
for tree in _trees[:17]:
    ans += tree.predict(testRow[:, 1:-1])
saveResults(testRow[:, -1], ans, "result10")