# Data Processing

Testing the performance of various feature configurations when using DeltaMART with the MSLR-WEB10K dataset. 

https://www.microsoft.com/en-us/research/project/mslr/

Only using a small subset of queries (10 at the moment) given that the notebook is run locally on a laptop with 8GB RAM. 

In [1]:
import numpy as np
import pandas as pd
import re
from xgboost import XGBRegressor
from scipy.special import expit  # Logistic function
from rank_metrics import ndcg_at_k

## Functions

In [2]:
def generate_dataframe(filepath, n_queries=30, seed=1):
    """"""
    
    # For reproducible results from randomly selecting queries
    np.random.seed(seed)
    
    df = pd.read_csv(filepath,
                     sep=' ',
                     header=None)
    
    # Remove last column of NaN
    df = df.iloc[:, :-1]
    
    # First column: hand-labeled score, second column: query id
    df = df.rename(columns={0: 'label', 1: 'query_id'})
    
    # Get random sample of queries
    qids = df.query_id.unique()
    qids = np.random.choice(qids, size=n_queries)
    
    # Only save dataframe with queries of interest
    df = df[df.query_id.isin(qids)]
    
    # Save hand-labels
    labels = df.label

    # Use regex to get number after colon for every column other than label
    features = df.iloc[:, 1:].applymap(lambda x: float(re.findall(r':(.*)', x)[0]))

    # Put features and labels in same dataframe
    df = features
    df['label'] = labels
    
    return df

In [3]:
def generate_features(df, repeat_importance, two_sided, delta_features):
    """"""
    
    n_rows = 0
    max_diff = 4
    n_features = 136
    
    # Find max possible number of rows: n_queries * (n_urls_per_query ^ 2) * max_repeat_factor
    for qid in df.query_id.unique():
        urls_per_query = df[df.query_id == qid].shape[0]
        
        # If not repeating importance, then every query-URL pair only appears once
        if repeat_importance:
            n_rows += (urls_per_query ** 2) * max_diff
        else:
            n_rows += (urls_per_query ** 2)
    
    # Add extra set of columns if delta_features, + 4 for i, j, query_id, label
    if delta_features:
        n_columns = n_features * 3 + 4
    else:
        n_columns = n_features * 2 + 4
    
    # Create array to fill in later (faster)
    features = np.full(shape=(n_rows, n_columns), fill_value=np.nan)
    idx = 0
    
    # Compare each URL for a given query
    for progress, qid in enumerate(df.query_id.unique()):
        
        # tdf: temporary dataframe, m: number of URLs in tdf
        tdf = df[df.query_id == qid]
        m = tdf.shape[0]
        
        # First URL
        for i in range(m):
            
            # Two sided: feature (a, b) will be repeated later as feature (b, a)
            if two_sided:
                start_j = 0
            else:
                start_j = i
            
            # Second URL
            for j in range(start_j, m):
                
                label_diff = tdf.label.iloc[i] - tdf.label.iloc[j]
                
                # Repeat importance: duplicate row |label_diff| times
                if repeat_importance:
                    end_k = int(abs(label_diff)) + 1
                else:
                    end_k = 1
                    
                for k in range(end_k):
                    
                    # Delta features: for feature (a, b), represent as (a, b, a-b)
                    # Format: (i, j, query_id, URLi, URLj, URLi-URLj (?), label_diff)
                    if delta_features:
                        new_row = np.hstack((i,
                                             j,
                                             qid,
                                             tdf.iloc[i, 1:-1], 
                                             tdf.iloc[j, 1:-1], 
                                             tdf.iloc[i, 1:-1] - tdf.iloc[j, 1:-1],  
                                             label_diff))
                    else:
                            new_row = np.hstack((i,
                                                 j,
                                                 qid,
                                                 tdf.iloc[i, 1:-1], 
                                                 tdf.iloc[j, 1:-1],  
                                                 label_diff))
                        
                    features[idx] = new_row
                    idx += 1

        print(progress)
    
    # Originally allocated array is likely too large, only save relevant rows
    features = features[~np.isnan(features[:, 0])]
    return features

In [4]:
def build_model(features, df):
    """"""

    # Features does not include i, j, does includes query_id
    X = features[:, 2:-1]
    y = features[:, -1]

    # Same parameters for all calls to ensure consistency
    xgbr = XGBRegressor(max_depth=6, 
                        learning_rate=0.1,
                        n_estimators=100, # Change to make faster OR more powerful (?)
                        objective='reg:squarederror')
    xgbr.fit(X, y)

    print('Model fitted')

    # Want to make predictions on every URL pair within a query, for all queries
    # Avoid predicting on rows that were repeated above
    # Combo of i, j query_id ensures that unique will work to prevent repeated rows
    feat_unique = np.unique(features, axis=0)
    X_unique = feat_unique[:, 2:-1]
    y_pred = xgbr.predict(X_unique)

    # For each query, make a prediction array (scores)
    for qid in np.unique(X_unique[:, 0]):

        # m will be the number of URLs per given query ID
        m = int(np.sqrt(np.sum(X_unique[:, 0] == qid)))

        # Save y_pred only for query of interest as y_pq, reshape in order to sum across rows
        # Note that the default order='C' in reshape is fine (row-major)
        # Setting order='F' will result in roughly the same result, just reversed since the 
        # learned labels correspond to (URLi - URLj)
        y_pq = y_pred[X_unique[:, 0] == qid]
        y_pq = y_pq.reshape(m, m, order='C')

        # Apply logistic function
        y_pq = expit(y_pq)

        # Sum across rows to get 'power' of each individual training example
        # Get order using the scores as indices
        scores = np.sum(y_pq, axis=0)
        order = np.argsort(scores)

        # Apply order to original labels
        y_orig = df[df.query_id == qid].label.values
        r = y_orig[order]

        # Results for entire ranking
        print('Query %d, m=%d:' % (qid, m))
        print('\tNDCG@5:  %.4f' % ndcg_at_k(r=r, k=5))
        print('\tNDCG@10: %.4f' % ndcg_at_k(r=r, k=10))
        print('\tNDCG@25: %.4f' % ndcg_at_k(r=r, k=25))
        print('\tNDCG@50: %.4f' % ndcg_at_k(r=r, k=50))
        print('\tNDCG@m:  %.4f' % ndcg_at_k(r=r, k=m))

## Testing

In [5]:
my_df = generate_dataframe('/Users/Ashtekar15/Desktop/Thesis/MGBoost/other/test_data/ranking/MSLR-WEB10K/Fold1/vali.txt', 
                           n_queries=10, 
                           seed=1)

In [6]:
hyp_ls = [[False, False],
          [False, True],
          [True, False],
          [True, True]]

for hyp in hyp_ls:
    
    print('\nrepeat_importance: %r, two_sided: %r, delta_features: %r' %(hyp[0], True, hyp[1]))
    
    my_f = generate_features(my_df, 
                             repeat_importance=hyp[0], 
                             two_sided=True, 
                             delta_features=hyp[1])

    build_model(my_f, my_df)


repeat_importance: False, two_sided: True, delta_features: False
0
1
2
3
4
5
6
7
8
9


  "because it will generate extra copies and increase memory consumption")


Model fitted
Query 3535, m=131:
	NDCG@5:  1.0000
	NDCG@10: 1.0000
	NDCG@25: 0.9884
	NDCG@50: 0.9984
	NDCG@m:  0.9980
Query 10735, m=89:
	NDCG@5:  1.0000
	NDCG@10: 1.0000
	NDCG@25: 0.9847
	NDCG@50: 0.9958
	NDCG@m:  0.9954
Query 12715, m=91:
	NDCG@5:  1.0000
	NDCG@10: 0.9984
	NDCG@25: 0.9730
	NDCG@50: 0.9787
	NDCG@m:  0.9942
Query 13585, m=156:
	NDCG@5:  0.9181
	NDCG@10: 0.9775
	NDCG@25: 0.9574
	NDCG@50: 0.9662
	NDCG@m:  0.9851
Query 14410, m=150:
	NDCG@5:  1.0000
	NDCG@10: 1.0000
	NDCG@25: 0.9932
	NDCG@50: 0.9948
	NDCG@m:  0.9954
Query 15925, m=131:
	NDCG@5:  1.0000
	NDCG@10: 1.0000
	NDCG@25: 0.9969
	NDCG@50: 0.9783
	NDCG@m:  0.9944
Query 16450, m=77:
	NDCG@5:  0.9440
	NDCG@10: 0.8609
	NDCG@25: 0.9217
	NDCG@50: 0.9448
	NDCG@m:  0.9448
Query 25045, m=46:
	NDCG@5:  1.0000
	NDCG@10: 0.9942
	NDCG@25: 0.9681
	NDCG@50: 0.9915
	NDCG@m:  0.9915
Query 26875, m=202:
	NDCG@5:  1.0000
	NDCG@10: 1.0000
	NDCG@25: 1.0000
	NDCG@50: 1.0000
	NDCG@m:  0.9999
Query 28990, m=151:
	NDCG@5:  1.0000
	NDCG@10: 

In [12]:
# Should record means within loop next time

# repeat_importance: False, two_sided: True, delta_features: False
print(np.mean([0.9980, 0.9954, 0.9942, 0.9851, 0.9954, 0.9944, 0.9448, 0.9915, 0.9999, 0.9979]))

# repeat_importance: False, two_sided: True, delta_features: True
print(np.mean([0.9981, 0.9885, 0.9939, 0.9861, 0.9969, 0.9925, 0.9471, 0.9932, 0.9999, 0.9972]))

# repeat_importance: True, two_sided: True, delta_features: False
print(np.mean([0.9989, 0.9930, 0.9921, 0.9930, 0.9957, .9947, 0.9461,  0.9942, 0.9999, 0.9979]))

# repeat_importance: True, two_sided: True, delta_features: True
print(np.mean([0.9978, 0.9923, 0.9908, 0.9858, 0.9968, 0.9935, 0.9460, 0.9939, 0.9999, 0.9976]))

0.98966
0.98934
0.99055
0.9894400000000001


Not much difference in performance across different feature configurations. To better illustrate differences, I should 