# Yahoo Data Processing

Testing the performance of various feature configurations when using DeltaMART with the Yahoo LETOR dataset (train/validation/test split). See README.md, "Data Preparation for the Initial Ranker" section:

https://github.com/QingyaoAi/Unbiased-Learning-to-Rank-with-Unbiased-Propensity-Estimation

Only a small subset of queries is used, since everything runs locally on a laptop.

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from scipy.special import expit  # Logistic function
from rank_metrics import ndcg_at_k, mean_average_precision
from lightgbm import LGBMRanker

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Functions

In [2]:
def load_np_arrays(train_or_val):
    """
    Loads saved numpy arrays with query, doc, and relevance data
    
    Params:
        train_or_val (string): enter 'train' or 'val' to load training or validation arrays
    
    Returns:
        dids (np array): document ids
        qids (np array): query ids
        features (np array): training features
        gold_weights (np array): labels for each document per query

    Raises:
        ValueError: if train_or_val is neither 'train' nor 'val'
    """
    
    folders = {'train': 'Yahoo_Numpy//Train//',
               'val': 'Yahoo_Numpy//Val//'}

    # Fail early with a clear message instead of an unbound 'folder' variable further down
    if train_or_val not in folders:
        raise ValueError("train_or_val must be 'train' or 'val', got %r" % (train_or_val,))

    folder = folders[train_or_val]

    # NOTE: allow_pickle=True unpickles object arrays, which can execute arbitrary code.
    # Only load .npy files that were produced locally / come from a trusted source.
    dids = np.load(folder + 'dids.npy', allow_pickle=True)
    qids = np.load(folder + 'qids.npy', allow_pickle=True)
    features = np.load(folder + 'features.npy', allow_pickle=True)
    gold_weights = np.load(folder + 'gold_weights.npy', allow_pickle=True)

    return dids, qids, features, gold_weights

In [3]:
def prepare_data(dids, qids, features, gold_weights, sample=True, size=10, seed=1):
    """
    Picks the queries to work with and lines up their documents, features, and labels
    
    Params:
        dids, qids, features, gold_weights (np array): see load_np_arrays
        sample (bool): True draws `size` queries at random (without replacement),
                       False keeps every query in qids
        size (int): number of queries to draw when sampling
        seed (int): seed for numpy's global random generator, makes the draw reproducible

    Returns:
        q_choice (np array): the chosen query ids
        q_rel (np array): query id of every kept document (one entry per feature row)
        feat_rel (np array): features of the kept documents, with the query id prepended
                             as column 0
        label_rel (np array): labels of the kept queries, joined into one flat array
    """

    # Seed is applied unconditionally, even when no sampling takes place
    np.random.seed(seed)

    q_choice = np.random.choice(qids, size=size, replace=False) if sample else qids

    # Every document id carries its query id as the second "_"-separated token
    doc_qids = np.array([int(did.split("_")[1]) for did in dids])

    # Row selectors: one over documents, one over queries
    doc_mask = np.isin(doc_qids, q_choice)
    query_mask = np.isin(qids, q_choice)

    q_rel = doc_qids[doc_mask]

    # gold_weights holds one sub-array per query; flatten the selected ones
    label_rel = np.concatenate(gold_weights[query_mask])

    # Prepend the query id so that it travels with the features
    feat_rel = np.hstack((q_rel.reshape(-1, 1), features[doc_mask]))

    return q_choice, q_rel, feat_rel, label_rel

In [4]:
def _pair_indices(labels, repeat_importance):
    """Row indices and label differences for every ordered document pair of one query."""
    m = labels.shape[0]

    # Enumerate pairs as (0,0), (0,1), ..., (0,m-1), (1,0), ... i.e. first index varies slowest
    first = np.repeat(np.arange(m), m)
    second = np.tile(np.arange(m), m)
    label_diff = labels[first] - labels[second]

    if repeat_importance:
        # Each pair appears int(|label_diff|) + 1 times, consecutively
        repeats = np.abs(label_diff).astype(int) + 1
        first = np.repeat(first, repeats)
        second = np.repeat(second, repeats)
        label_diff = np.repeat(label_diff, repeats)

    return first, second, label_diff


def generate_features(q_choice, q_rel, feat_rel, label_rel, 
                      repeat_importance, delta_features):
    """
    Generate pairwise features between documents in all queries
    
    Params:
        q_choice, q_rel, feat_rel, label_rel (np array): see prepare_data
        repeat_importance (bool): whether to duplicate rows based on the magnitude of the
                                  difference in scores; a pair then appears
                                  int(|label_diff|) + 1 times
        delta_features (bool): whether to include the difference between document features
                               in the pairwise generated features
    
    Returns:
        features (np array): pairwise generated features, including query id (index 0) and
                             difference in scores (index -1). Row layout is
                             (qid, feat[i], feat[j], [feat[i] - feat[j]], label_diff);
                             rows are ordered by query (order of q_choice), then i, then j
        q_rel (np array): relevant query ids aligned with label_rel, used with label_rel in 
                          metrics
        label_rel (np array): relevant labels aligned with q_rel, used in metrics
    """

    # Column 0 of feat_rel is the query id, the rest are the document features.
    # Taking the width from the data keeps this working for any number of features.
    n_features = feat_rel.shape[1] - 1

    # Add extra set of columns if delta_features, + 2 for (query_id, label)
    if delta_features:
        n_columns = (n_features * 3) + 2
    else:
        n_columns = (n_features * 2) + 2

    # First pass: work out which pairs each query contributes. Only index arrays are kept,
    # so this is cheap compared to the feature rows themselves.
    plans = []
    for qid in q_choice:
        rows = np.flatnonzero(np.isin(q_rel, qid))
        plans.append((rows, *_pair_indices(label_rel[rows], repeat_importance)))

    # Allocate the output once, at exactly the required size
    n_rows = sum(plan[1].size for plan in plans)
    features = np.empty(shape=(n_rows, n_columns), dtype=np.float64)

    # Second pass: fill one contiguous slice of the output per query
    start = 0
    for progress, (rows, first, second, label_diff) in enumerate(plans):

        temp_feat = feat_rel[rows]
        stop = start + first.size
        out = features[start:stop]

        # (qid, feat[i]) then feat[j]
        out[:, :n_features + 1] = temp_feat[first]
        out[:, n_features + 1:2 * n_features + 1] = temp_feat[second, 1:]

        # Delta features: for feature (a, b), represent as (a, b, a-b)
        if delta_features:
            out[:, 2 * n_features + 1:-1] = temp_feat[first, 1:] - temp_feat[second, 1:]

        out[:, -1] = label_diff
        start = stop

        print(progress + 1)

    # Also return relevant query ids and labels in order to later use metrics (MAP, NDCG)
    return features, q_rel, label_rel

In [5]:
def build_model(train_feat, test_feat, q_rel, label_rel):
    """
    Fits an xgboost regressor on pairwise features and reports ranking metrics per query
    
    Params:
        train_feat (np array): pairwise training rows, last column is the regression target
        test_feat (np array): pairwise validation rows, column 0 is the query id and the
                              last column is dropped before predicting. The row count of
                              each query is reshaped to (m, m), so each ordered pair must
                              appear exactly once
        q_rel, label_rel (np array): see generate_features
        
    Returns:
        r_ls (list of np arrays): one entry per test query (in sorted query id order), 
                                  holding that query's original labels in the order
                                  produced by the model
    """

    # One output line per metric, reused for the per-query and the overall report
    report_lines = ('\tMAP:     %.4f',
                    '\tNDCG@1:  %.4f',
                    '\tNDCG@3:  %.4f',
                    '\tNDCG@5:  %.4f',
                    '\tNDCG@10: %.4f',
                    '\tNDCG@m:  %.4f')

    # ---------------------------------- Training ----------------------------------
    # Fixed hyper-parameters so that runs stay comparable
    model = XGBRegressor(max_depth=6, 
                         learning_rate=0.1,
                         n_estimators=10,
                         objective='reg:squarederror')
    model.fit(train_feat[:, :-1], train_feat[:, -1])

    print('Model fitted')

    # ---------------------------------- Testing -----------------------------------
    # One prediction per document pair, for all queries at once
    X_test = test_feat[:, :-1]
    pair_pred = model.predict(X_test)

    test_qids = X_test[:, 0]
    unique_qids = np.unique(test_qids)
    n_queries = unique_qids.size

    # Running sums of each metric, in the order of report_lines
    totals = [0] * len(report_lines)
    r_ls = []

    for qid in unique_qids:

        in_query = test_qids == qid

        # A query with m documents contributes m * m pair rows
        m = int(np.sqrt(np.sum(in_query)))

        # Row-major reshape: entry (i, j) is the prediction for pair (doc i, doc j),
        # squashed through the logistic function
        pair_matrix = expit(pair_pred[in_query].reshape(m, m, order='C'))

        # Column sums give one score per document; documents are ordered by ascending score
        scores = pair_matrix.sum(axis=0)
        r = label_rel[q_rel == qid][np.argsort(scores)]
        r_ls.append(r)

        # MAP first, then NDCG at the reported cut-offs
        per_query = [mean_average_precision([r])]
        per_query.extend(ndcg_at_k(r=r, k=k) for k in (1, 3, 5, 10, m))

        print('Query %d, m=%d:' % (qid, m))
        for pos, (template, value) in enumerate(zip(report_lines, per_query)):
            totals[pos] += value
            print(template % value)

    # Averages over all test queries
    print('\nOverall:')
    for template, total in zip(report_lines, totals):
        print(template % (total / n_queries))

    # Rankings are returned so they can be inspected by hand
    return r_ls

In [6]:
def lambdaMART_predictions(X_train, y_train, X_test, y_test):
    """
    Trains a lightGBM LambdaMART ranker and reports ranking metrics per test query

    Params:
        X_train (np array): training features, column 0 is the query id; rows of the same
                            query are expected to be contiguous
        y_train (np array): training labels aligned row-wise with X_train
        X_test (np array): test features, column 0 is the query id
        y_test (np array): test labels aligned row-wise with X_test

    Returns:
        r_ls (list of np arrays): one entry per test query (in sorted query id order),
                                  holding that query's labels sorted by descending
                                  predicted score
    """

    # 'group' parameter in fit(): sizes of the consecutive row blocks that share a query id,
    # listed in row order. np.unique(..., return_counts=True) would return the counts in
    # sorted-qid order instead, which mis-assigns rows whenever the data is not sorted by qid.
    train_qids = X_train[:, 0]
    block_starts = np.flatnonzero(train_qids[1:] != train_qids[:-1]) + 1
    counts = np.diff(np.concatenate(([0], block_starts, [train_qids.size])))
    
    # Create and fit lightGBM with same parameters as XGBoost
    lgbm = LGBMRanker(num_leaves=127, 
                      max_depth=6,
                      learning_rate=0.1,
                      n_estimators=10,
                      objective='lambdarank')

    lgbm.fit(X_train, y_train, group=counts)
    
    # Record results over all queries
    MAP = 0
    NDCG1, NDCG3, NDCG5, NDCG10, NDCGM = 0, 0, 0, 0, 0
    r_ls = []
    
    # Number of queries in test set
    test_qids = X_test[:, 0]
    qids = np.unique(test_qids)
    size = qids.size
    
    # Make predictions and give results for each query
    for qid in qids:
        
        # Get relevant features, labels, size
        in_query = test_qids == qid
        X_pq = X_test[in_query]
        y_pq = y_test[in_query]
        m = y_pq.size
        
        # Make predictions and sort docs by predicted scores (highest first)
        y_pred = lgbm.predict(X_pq)
        order = np.argsort(y_pred)[::-1]
        r = y_pq[order]
        r_ls.append(r)
        
        # Compute results
        m_a_p = mean_average_precision([r])
        n1, n3, n5, n10, nm = (ndcg_at_k(r=r, k=1), ndcg_at_k(r=r, k=3), ndcg_at_k(r=r, k=5), 
                               ndcg_at_k(r=r, k=10), ndcg_at_k(r=r, k=m))
        
        # Update overall results
        MAP += m_a_p
        NDCG1 += n1
        NDCG3 += n3
        NDCG5 += n5
        NDCG10 += n10
        NDCGM += nm
        
        # Results for query
        print('Query %d, m=%d:' % (qid, m))
        print('\tMAP:     %.4f' % m_a_p)
        print('\tNDCG@1:  %.4f' % n1)
        print('\tNDCG@3:  %.4f' % n3)
        print('\tNDCG@5:  %.4f' % n5)
        print('\tNDCG@10: %.4f' % n10)
        print('\tNDCG@m:  %.4f' % nm)
        
    # Results over all queries
    print('\nOverall:')
    print('\tMAP:     %.4f' % (MAP / size))
    print('\tNDCG@1:  %.4f' % (NDCG1 / size))
    print('\tNDCG@3:  %.4f' % (NDCG3 / size))
    print('\tNDCG@5:  %.4f' % (NDCG5 / size))
    print('\tNDCG@10: %.4f' % (NDCG10 / size))
    print('\tNDCG@m:  %.4f' % (NDCGM / size))
    
    return r_ls

## Testing DeltaMART

In [7]:
# Load and generate features from queries in training set.
# Training rows use repeat_importance=True, so pairs with a larger label difference
# appear more often.
arrays = load_np_arrays('train')
data = prepare_data(*arrays, sample=True, size=1000, seed=55)
train_feat, _, _ = generate_features(*data, repeat_importance=True, delta_features=False)
print('Generated training features\n')
# Drop the names of the intermediate arrays before loading the next split
del arrays, data

# Load and generate features from queries in validation set.
# repeat_importance must be False here: build_model reshapes each query's predictions
# to (m, m), which needs exactly one row per ordered document pair.
arrays = load_np_arrays('val')
data = prepare_data(*arrays, sample=True, size=500, seed=66)
test_feat, q_rel, label_rel = generate_features(*data, repeat_importance=False, 
                                                delta_features=False)
print('Generated validation features\n')
del arrays, data

# Train/evaluate xgboost model (call currently disabled)
# build_model(train_feat, test_feat, q_rel, label_rel)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


### DeltaMART Results

    10 trees in XGBoost
    100/50 queries in train/test, seed=55/66 for train/validation
    
    delta_features = True
    Overall:
        MAP:     0.8259
        NDCG@1:  0.7567
        NDCG@3:  0.7229
        NDCG@5:  0.7339
        NDCG@10: 0.7629
        NDCG@m:  0.8450
    
    delta_features = False
    Overall:
        MAP:     0.8325
        NDCG@1:  0.6700
        NDCG@3:  0.7192
        NDCG@5:  0.7148
        NDCG@10: 0.7577
        NDCG@m:  0.8427
        
        
### LambdaMART Results

    10 trees in lightGBM
    100/50 queries in train/test, seed=55/66 for train/validation
    
    Overall:
        MAP:     0.8241
        NDCG@1:  0.6800
        NDCG@3:  0.6773
        NDCG@5:  0.7004
        NDCG@10: 0.7420
        NDCG@m:  0.8304

## Testing LambdaMART

In [None]:
# prepare_data returns (q_choice, q_rel, feat_rel, label_rel); only the features
# (query id in column 0) and the flat labels are needed for LambdaMART
arrays = load_np_arrays('train')
_, _, X_train, y_train = prepare_data(*arrays, sample=True, size=100, seed=55)

# Same preparation for the validation split, with its own sample size and seed
arrays = load_np_arrays('val')
_, _, X_test, y_test = prepare_data(*arrays, sample=True, size=50, seed=66)

print('Prepared data')

# Train the ranker and print per-query and overall metrics
lambdaMART_predictions(X_train, y_train, X_test, y_test)

# Old stuff/experimentation 

In [None]:
# Re-run the LambdaMART evaluation on the X_train/y_train/X_test/y_test currently in scope
lambdaMART_predictions(X_train, y_train, X_test, y_test)

In [None]:
# Small training sample (10 queries) for manual experimentation
arrays = load_np_arrays('train')
q_choice, q_rel, feat_rel, label_rel = prepare_data(*arrays, sample=True, size=10, seed=7)

X_train = feat_rel
y_train = label_rel

# Display the shapes to check that features, labels, and query ids line up
feat_rel.shape, label_rel.shape, q_rel.shape

In [None]:
# Number of documents per query id; np.unique returns these in sorted query id order
_, counts = np.unique(q_rel, return_counts=True)
counts

In [None]:
# Only the LGBMRanker class is imported (`from lightgbm import LGBMRanker`), so the
# bare module name `lightgbm` is not in scope; use the class directly.
lgbm = LGBMRanker()
lgbm.fit(feat_rel, label_rel, group=counts)

In [None]:
# Small validation sample (5 queries); note this rebinds q_choice, q_rel, feat_rel, and
# label_rel, which previously held the training sample
arrays = load_np_arrays('val')
q_choice, q_rel, feat_rel, label_rel = prepare_data(*arrays, sample=True, size=5, seed=7)

X_test = feat_rel
y_test = label_rel

# Display the shapes to check that features, labels, and query ids line up
feat_rel.shape, label_rel.shape, q_rel.shape

In [None]:
# Display the labels of the current sample
label_rel

In [None]:
# Display the query id column (column 0) of the current features
feat_rel[:, 0]

In [None]:
# Score the documents of a single query: the smallest query id in the current sample
first_query = feat_rel[:, 0] == np.unique(feat_rel[:, 0])[0]
y_pred = lgbm.predict(feat_rel[first_query])

# Rank highest predicted score first (argsort alone is ascending), and index only this
# query's labels so the positions in `order` refer to the rows that were scored
order = np.argsort(y_pred)[::-1]
r = label_rel[first_query][order]

mean_average_precision([r]), ndcg_at_k(r, k=5), r