# Split Data Processing

Testing the performance of various feature configurations when using DeltaMART with the MSLR-WEB10K dataset. 

https://www.microsoft.com/en-us/research/project/mslr/

Only using a small subset of queries (10 at the moment) given that the notebook is run locally on a laptop with 8GB RAM. 

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from scipy.special import expit  # Logistic function
from rank_metrics import ndcg_at_k, mean_average_precision

## Functions

In [2]:
def generate_dataframe(filepath, n_queries=30, seed=1):
    """"""
    
    # For reproducible results from randomly selecting queries
    np.random.seed(seed)
    
    df = pd.read_csv(filepath,
                     sep=' ',
                     header=None)
    
    # Remove last column of NaN
    df = df.iloc[:, :-1]
    
    # First column: hand-labeled score, second column: query id
    df = df.rename(columns={0: 'label', 1: 'query_id'})
    
    # Get random sample of queries
    qids = df.query_id.unique()
    qids = np.random.choice(qids, size=n_queries)
    
    # Only save dataframe with queries of interest
    df = df[df.query_id.isin(qids)]
    
    # Save hand-labels
    labels = df.label

    # Use regex to get number after colon for every column other than label
    features = df.iloc[:, 1:].applymap(lambda x: float(re.findall(r':(.*)', x)[0]))

    # Put features and labels in same dataframe
    df = features
    df['label'] = labels
    
    return df

In [3]:
def generate_features(df, repeat_importance, two_sided, delta_features):
    """"""
    
    n_rows = 0
    max_diff = 4
    n_features = 136
    
    # Find max possible number of rows: n_queries * (n_urls_per_query ^ 2) * max_repeat_factor
    for qid in df.query_id.unique():
        urls_per_query = df[df.query_id == qid].shape[0]
        
        # If not repeating importance, then every query-URL pair only appears once
        if repeat_importance:
            n_rows += (urls_per_query ** 2) * max_diff
        else:
            n_rows += (urls_per_query ** 2)
    
    # Add extra set of columns if delta_features, + 4 for i, j, query_id, label
    if delta_features:
        n_columns = n_features * 3 + 4
    else:
        n_columns = n_features * 2 + 4
    
    # Create array to fill in later (faster)
    features = np.full(shape=(n_rows, n_columns), fill_value=np.nan)
    idx = 0
    
    # Compare each URL for a given query
    for progress, qid in enumerate(df.query_id.unique()):
        
        # tdf: temporary dataframe, m: number of URLs in tdf
        tdf = df[df.query_id == qid]
        m = tdf.shape[0]
        
        # First URL
        for i in range(m):
            
            # Two sided: feature (a, b) will be repeated later as feature (b, a)
            if two_sided:
                start_j = 0
            else:
                start_j = i
            
            # Second URL
            for j in range(start_j, m):
                
                label_diff = tdf.label.iloc[i] - tdf.label.iloc[j]
                
                # Repeat importance: duplicate row |label_diff| times
                if repeat_importance:
                    end_k = int(abs(label_diff)) + 1
                else:
                    end_k = 1
                    
                for k in range(end_k):
                    
                    # Delta features: for feature (a, b), represent as (a, b, a-b)
                    # Format: (i, j, query_id, URLi, URLj, URLi-URLj (?), label_diff)
                    if delta_features:
                        new_row = np.hstack((i,
                                             j,
                                             qid,
                                             tdf.iloc[i, 1:-1], 
                                             tdf.iloc[j, 1:-1], 
                                             tdf.iloc[i, 1:-1] - tdf.iloc[j, 1:-1],  
                                             label_diff))
                    else:
                            new_row = np.hstack((i,
                                                 j,
                                                 qid,
                                                 tdf.iloc[i, 1:-1], 
                                                 tdf.iloc[j, 1:-1],  
                                                 label_diff))
                        
                    features[idx] = new_row
                    idx += 1

#         print(progress)
    
    # Originally allocated array is likely too large, only save relevant rows
    features = features[~np.isnan(features[:, 0])]
    return features

In [4]:
def build_model(train_feat, test_feat, labels):
    """"""

    """ TRAIN """
    # Features does not include i, j, does includes query_id
    X_train = train_feat[:, 2:-1]
    y_train = train_feat[:, -1]

    # Same parameters for all calls to ensure consistency
    xgbr = XGBRegressor(max_depth=6, 
                        learning_rate=0.1,
                        n_estimators=100, # Change to make faster OR more powerful (?)
                        objective='reg:squarederror')
    
    xgbr.fit(X_train, y_train)

    print('Model fitted')

    """ TEST """
    # Want to make predictions on every URL pair within a query, for all queries
    # Avoid predicting on rows that were repeated above
    # Combo of i, j query_id ensures that unique will work to prevent repeated rows
    feat_unique = np.unique(test_feat, axis=0)
    X_unique = feat_unique[:, 2:-1]
    y_pred = xgbr.predict(X_unique)

    # Record results over all queries
    MAP = 0
    NDCG1, NDCG3, NDCG5, NDCG10, NDCGM = 0, 0, 0, 0, 0
    
    # For each query, make a prediction array (scores)
    for qid in np.unique(X_unique[:, 0]):

        # m will be the number of URLs per given query ID
        m = int(np.sqrt(np.sum(X_unique[:, 0] == qid)))

        # Save y_pred only for query of interest as y_pq, reshape in order to sum across rows
        # Note that the default order='C' in reshape is fine (row-major)
        # Setting order='F' will result in roughly the same result, just reversed since the 
        # learned labels correspond to (URLi - URLj)
        y_pq = y_pred[X_unique[:, 0] == qid]
        y_pq = y_pq.reshape(m, m, order='C')

        # Apply logistic function
        y_pq = expit(y_pq)

        # Sum across rows to get 'power' of each individual training example
        # Get order using the scores as indices
        scores = np.sum(y_pq, axis=0)
        order = np.argsort(scores)

        # Apply order to original labels
        y_orig = labels[labels.query_id == qid].label.values
        r = y_orig[order]

        # Get results
        m_a_p = mean_average_precision([r])
        n1, n3, n5, n10, nm = ndcg_at_k(r=r, k=1),ndcg_at_k(r=r, k=3), ndcg_at_k(r=r, k=5), ndcg_at_k(r=r, k=10), ndcg_at_k(r=r, k=m)

        # Update overall results
        MAP += m_a_p
        NDCG1 += n1
        NDCG3 += n3
        NDCG5 += n5
        NDCG10 += n10
        NDCGM += nm
        
        # Results for query
        print('Query %d, m=%d:' % (qid, m))
        print('\tNDCG@1:  %.4f' % n1)
        print('\tNDCG@3: %.4f' % n3)
        print('\tNDCG@5: %.4f' % n5)
        print('\tNDCG@10: %.4f' % n10)
        print('\tNDCG@m:  %.4f' % nm)

    # Results over all queries
    print('\nOverall:')
    print('\tMAP:     %.4f' % (MAP / 10))
    print('\tNDCG@1:  %.4f' % (NDCG1 / 10))
    print('\tNDCG@3:  %.4f' % (NDCG3 / 10))
    print('\tNDCG@5:  %.4f' % (NDCG5 / 10))
    print('\tNDCG@10: %.4f' % (NDCG10 / 10))
    print('\tNDCG@m:  %.4f' % (NDCGM / 10))

## Testing

In [5]:
""" SETUP BETWEEN TRIALS HERE """
my_seed = 5
delta_features = True

In [6]:
my_df = generate_dataframe('/Users/Ashtekar15/Desktop/Thesis/MGBoost/other/test_data/ranking/MSLR-WEB10K/Fold1/vali.txt', 
                           n_queries=10, 
                           seed=my_seed)

In [7]:
# Sets shape of train/test features during initilization
if delta_features:
    n = 412
else:
    n = 276

# Initlize to then fill in
train = np.full(shape=(1, n), fill_value=np.nan)
test = np.full(shape=(1, n), fill_value=np.nan)
labels = pd.DataFrame()

for progress, qid in enumerate(np.unique(my_df.query_id)):
    
    # Split train/test -> 80/20
    train_df, test_df = train_test_split(my_df[my_df.query_id == qid], test_size=0.2)
    
    # Save test labels (in order to evaluate final rank)
    labels = labels.append(test_df[['query_id', 'label']], ignore_index=True)
    
    # Make train/test features for a given query, no need to repeat_importance for testing
    train_f = generate_features(train_df, repeat_importance=True, two_sided=True, delta_features=delta_features)
    test_f = generate_features(test_df, repeat_importance=False, two_sided=True, delta_features=delta_features)
    
    # Save train/test features
    train = np.vstack((train, train_f))
    test = np.vstack((test, test_f))
    
    print(progress)

# Ignore first NaN row
train = train[1:, :]
test = test[1:, :]

0
1
2
3
4
5
6
7
8
9


In [8]:
# build_model(train, test, labels)

train_feat, test_feat, labels = train, test, labels

""" TRAIN """
# Features does not include i, j, does includes query_id
X_train = train_feat[:, 2:-1]
y_train = train_feat[:, -1]

# Same parameters for all calls to ensure consistency
xgbr = XGBRegressor(max_depth=6, 
                    learning_rate=0.1,
                    n_estimators=100, # Change to make faster OR more powerful (?)
                    objective='reg:squarederror')

xgbr.fit(X_train, y_train)

print('Model fitted')

""" TEST """
# Want to make predictions on every URL pair within a query, for all queries
# Avoid predicting on rows that were repeated above
# Combo of i, j query_id ensures that unique will work to prevent repeated rows
feat_unique = np.unique(test_feat, axis=0)
X_unique = feat_unique[:, 2:-1]
y_pred = xgbr.predict(X_unique)

# Record results over all queries
MAP = 0
NDCG1, NDCG3, NDCG5, NDCG10, NDCGM = 0, 0, 0, 0, 0

# Save rankings (to visually compare)
r_ls = []

# For each query, make a prediction array (scores)
for qid in np.unique(X_unique[:, 0]):

    # m will be the number of URLs per given query ID
    m = int(np.sqrt(np.sum(X_unique[:, 0] == qid)))

    # Save y_pred only for query of interest as y_pq, reshape in order to sum across rows
    # Note that the default order='C' in reshape is fine (row-major)
    # Setting order='F' will result in roughly the same result, just reversed since the 
    # learned labels correspond to (URLi - URLj)
    y_pq = y_pred[X_unique[:, 0] == qid]
    y_pq = y_pq.reshape(m, m, order='C')

    # Apply logistic function
    y_pq = expit(y_pq)

    # Sum across rows to get 'power' of each individual training example
    # Get order using the scores as indices
    scores = np.sum(y_pq, axis=0)
    order = np.argsort(scores)

    # Apply order to original labels
    y_orig = labels[labels.query_id == qid].label.values
    r = y_orig[order]
    
    # Save ranking
    r_ls.append(r)

    # Get results
    m_a_p = mean_average_precision([r])
    n1, n3, n5, n10, nm = ndcg_at_k(r=r, k=1),ndcg_at_k(r=r, k=3), ndcg_at_k(r=r, k=5), ndcg_at_k(r=r, k=10), ndcg_at_k(r=r, k=m)

    # Update overall results
    MAP += m_a_p
    NDCG1 += n1
    NDCG3 += n3
    NDCG5 += n5
    NDCG10 += n10
    NDCGM += nm

    # Results for query
    print('Query %d, m=%d:' % (qid, m))
    print('\tNDCG@1:  %.4f' % n1)
    print('\tNDCG@3: %.4f' % n3)
    print('\tNDCG@5: %.4f' % n5)
    print('\tNDCG@10: %.4f' % n10)
    print('\tNDCG@m:  %.4f' % nm)

# Results over all queries
print('\nOverall:')
print('\tMAP:     %.4f' % (MAP / 10))
print('\tNDCG@1:  %.4f' % (NDCG1 / 10))
print('\tNDCG@3:  %.4f' % (NDCG3 / 10))
print('\tNDCG@5:  %.4f' % (NDCG5 / 10))
print('\tNDCG@10: %.4f' % (NDCG10 / 10))
print('\tNDCG@m:  %.4f' % (NDCGM / 10))

  "because it will generate extra copies and increase memory consumption")


Model fitted
Query 1105, m=20:
	NDCG@1:  0.5000
	NDCG@3: 0.8100
	NDCG@5: 0.7759
	NDCG@10: 0.8260
	NDCG@m:  0.8957
Query 3100, m=20:
	NDCG@1:  0.5000
	NDCG@3: 0.7246
	NDCG@5: 0.7808
	NDCG@10: 0.7920
	NDCG@m:  0.8821
Query 11110, m=24:
	NDCG@1:  0.0000
	NDCG@3: 0.0000
	NDCG@5: 0.0000
	NDCG@10: 0.0000
	NDCG@m:  0.0000
Query 13015, m=28:
	NDCG@1:  0.0000
	NDCG@3: 0.4319
	NDCG@5: 0.5269
	NDCG@10: 0.4475
	NDCG@m:  0.7235
Query 14980, m=39:
	NDCG@1:  0.6667
	NDCG@3: 0.8403
	NDCG@5: 0.8153
	NDCG@10: 0.8198
	NDCG@m:  0.9034
Query 15490, m=11:
	NDCG@1:  1.0000
	NDCG@3: 1.0000
	NDCG@5: 1.0000
	NDCG@10: 1.0000
	NDCG@m:  1.0000
Query 17140, m=24:
	NDCG@1:  1.0000
	NDCG@3: 0.6806
	NDCG@5: 0.7538
	NDCG@10: 0.7696
	NDCG@m:  0.8505
Query 21370, m=33:
	NDCG@1:  1.0000
	NDCG@3: 0.5508
	NDCG@5: 0.5329
	NDCG@10: 0.5254
	NDCG@m:  0.7259
Query 25885, m=1:
	NDCG@1:  0.0000
	NDCG@3: 0.0000
	NDCG@5: 0.0000
	NDCG@10: 0.0000
	NDCG@m:  0.0000
Query 26515, m=23:
	NDCG@1:  0.5000
	NDCG@3: 0.3100
	NDCG@5: 0.3828
	NDC

## Results

With delta_features terms False/True listed below (copy/pasted from output):

Random seed = 1

False
Overall:
	MAP:     0.7628
	NDCG@1:  0.7000
	NDCG@3:  0.7175
	NDCG@5:  0.7215
	NDCG@10: 0.7233
	NDCG@m:  0.8196
    
True
Overall:
	MAP:     0.7307
	NDCG@1:  0.5500
	NDCG@3:  0.5895
	NDCG@5:  0.6202
	NDCG@10: 0.6733
	NDCG@m:  0.7592
    
------------------------------------

Random seed = 2  

False
Overall:
	MAP:     0.6444
	NDCG@1:  0.3167
	NDCG@3:  0.5901
	NDCG@5:  0.6254
	NDCG@10: 0.6445
	NDCG@m:  0.7481
    
True
Overall:
	MAP:     0.5685
	NDCG@1:  0.4500
	NDCG@3:  0.4447
	NDCG@5:  0.4560
	NDCG@10: 0.5452
	NDCG@m:  0.6598
    
------------------------------------

Random seed = 3  

False
Overall:
	MAP:     0.5472
	NDCG@1:  0.6167
	NDCG@3:  0.5459
	NDCG@5:  0.5630
	NDCG@10: 0.5696
	NDCG@m:  0.6740

True
Overall:
	MAP:     0.4850
	NDCG@1:  0.4333
	NDCG@3:  0.4269
	NDCG@5:  0.4888
	NDCG@10: 0.5237
	NDCG@m:  0.6299
    
------------------------------------

Random seed = 4  

False
Overall:
	MAP:     0.5962
	NDCG@1:  0.5167
	NDCG@3:  0.4563
	NDCG@5:  0.4947
	NDCG@10: 0.5267
	NDCG@m:  0.6297

True:
Overall:
	MAP:     0.6074
	NDCG@1:  0.5500
	NDCG@3:  0.5209
	NDCG@5:  0.5116
	NDCG@10: 0.5391
	NDCG@m:  0.6439
        
------------------------------------

Random seed = 5 

False
Overall:
	MAP:     0.6079
	NDCG@1:  0.5333
	NDCG@3:  0.5449
	NDCG@5:  0.5460
	NDCG@10: 0.5638
	NDCG@m:  0.6654
    
Overall:
	MAP:     0.6275
	NDCG@1:  0.5167
	NDCG@3:  0.5348
	NDCG@5:  0.5568
	NDCG@10: 0.5648
	NDCG@m:  0.6672