In [42]:
import numpy as np
from sklearn.metrics import ndcg_score 
from sklearn.datasets import load_svmlight_file
from sklearn.utils import check_X_y

In [187]:
class AdaRank():
    def __init__(self, k = 10, E = ndcg_score, n_iterations = 100):
        self.n_iterations = n_iterations
        self.E = E
        self.k = k
        self.weights = None
        self.weak_ranks = []
        self.alphas = []
        self.feature_scores = []
        self.n_queries = None 
    
    def fit(self, X, y, qid):
        X = X.toarray()
        dataset = []
        
        for q in np.unique(qid):
            dataset.append((q, X[qid == q], y[qid == q]))
        
        self.n_queries = np.unique(qid).shape[0]
        self.weights = np.ones(self.n_queries) /self.n_queries
        
        for j in range(X.shape[1]):
            
            feature_score = []
            for data in dataset:
                y_true = np.asarray([data[2]])
                x_k = np.asarray([data[1][:, j]])
                feature_score.append(self.E(y_true, x_k, k = self.k))
            
            self.feature_scores.append(feature_score)
        
        print(np.asarray(self.feature_scores).shape)
        
        for i in range(self.n_iterations):
            
            # weak rank
            best_feature = self.select_best_feature()
            h_predictions = [] 
            x_best_feature = X[:, best_feature]
            for q in np.unique(qid):
                h_predictions.append(x_best_feature[qid == q])
                
            #h_predictions = dataset[:, 1, best_feature] # dataset[1] = X
            print(np.asarray(h_predictions).shape)
            self.weak_ranks.append(h_predictions)
            
            h_scores = []
            for h_pred, data in zip(h_predictions, dataset):
                y_true = np.asarray([data[2]])
                h_scores.append(self.E(y_true, h_pred, k = self.k))
            
            print(f"h_score: {np.asarray(h_scores).shape}")
            
            # h score shape deve (1017,)
            # Choose alpha t
            alpha = 0.5 * np.log((self.weights * (1 + h_scores)) / (1 - h_scores))
            # ALPHA E' un numero
            
            self.alphas.append(alpha)
            
            f_predictions = np.sum(self.alphas * self.weak_ranks, axis = 0)
            
            # update P t+1
            f_score = self.E(y, f_predictions, k = self.k)
            self.weights = np.exp(-f_score) / np.exp(np.sum(-f_score))
    
    def select_best_feature(self):
        """Create weak ranker ht with weighted distribution Pt on training data"""
        
        best_feature = None
        best_weighted_performance = -np.inf
        
        for fid, score in enumerate(self.feature_scores):
            
            weighted_average = np.dot(self.weights, score)
            
            if weighted_average > best_weighted_performance:
                best_feature = fid
                best_weighted_performance = weighted_average

        # Return the best feature
        return best_feature
        
                
    def predict(self, X):
        pass
            

Test

In [11]:
# hyper params
K = 10
max_iter = 100
estop = 10

In [124]:
X, y, qid = load_svmlight_file("datasets/mq2007/train.txt", query_id=True)
    
X_test, y_test, qid_test = load_svmlight_file("datasets/mq2007/test.txt", query_id=True)

In [100]:
# print shapes
print(f"X: {X.shape}, y: {y.shape}, qid: {qid.shape}")

X: (42158, 46), y: (42158,), qid: (42158,)


In [180]:
dataset = []
for q in np.unique(qid):
    dataset.append((q, X[qid == q], y[qid == q]))
    # dataset.append({    
    #     "qid": q,
    #     "X": X[qid == q],
    #     "y": y[qid == q]         
    #                 })

In [188]:
model = AdaRank()

model.fit(X, y, qid)

predictions = model.predict(X_test)

for k in (1, 2, 3, 4, 5, 10, 20):
        score = ndcg_score(y_test, predictions, k = k).mean()
        print('nDCG@{}\t{}'.format(k, score))

(46, 1017)
(1017,)


  print(np.asarray(h_predictions).shape)


ValueError: Found input variables with inconsistent numbers of samples: [1, 40]