In [53]:
import numpy as np
import pandas as pd
import re
from xgboost import XGBRegressor
from scipy.special import expit  # Logistic function
from rank_metrics import ndcg_at_k

In [2]:
def generate_dataframe(filepath, n_queries=30, seed=1):
    """"""
    
    # For reproducible results from randomly selecting queries
    np.random.seed(seed)
    
    df = pd.read_csv(filepath,
                       sep=' ',
                       header=None)
    
    # Remove last column of NaN
    df = df.iloc[:, :-1]
    
    # First column: hand-labeled score, second column: query id
    df = df.rename(columns={0: 'label', 1: 'query_id'})
    
    # Get random sample of queries
    qids = df.query_id.unique()
    qids = np.random.choice(qids, size=n_queries)
    
    # Only save dataframe with queries of interest
    df = df[df.query_id.isin(qids)]
    
    # Save hand-labels
    labels = df.label

    # Use regex to get number after colon for every column other than label
    features = df.iloc[:, 1:].applymap(lambda x: float(re.findall(r':(.*)', x)[0]))

    # Put features and labels in same dataframe
    df = features
    df['label'] = labels
    
    return df

In [34]:
def generate_features(df, repeat_importance, two_sided, delta_features):
    """"""
    
    n_rows = 0
    max_diff = 4
    n_features = 136
    
    # Find max number of rows: n_queries * n_urls_per_query ^ 2 * max_repeat_factor
    for qid in df.query_id.unique():
        urls_per_query = df[df.query_id == qid].shape[0]
        n_rows += (urls_per_query ** 2) * max_diff
    
    # Add extra set of columns if delta_features, + 4 for i, j, query_id, label
    if delta_features:
        n_columns = n_features * 3 + 4
    else:
        n_columns = n_features * 2 + 4
    
    # Create array to fill in later (faster)
    features = np.full(shape=(n_rows, n_columns), fill_value=np.nan)
    idx = 0
    
    # Compare each URL for a given query
    for progress, qid in enumerate(df.query_id.unique()):
        
        # tdf: temporary dataframe, m: number of URLs in tdf
        tdf = df[df.query_id == qid]
        m = tdf.shape[0]
        
        # First URL
        for i in range(m):
            
            # Two sided: feature (a, b) will be repeated later as feature (b, a)
            if two_sided:
                start_j = 0
            else:
                start_j = i
            
            # Second URL
            for j in range(start_j, m):
                
                label_diff = tdf.label.iloc[i] - tdf.label.iloc[j]
                
                # Repeat importance: duplicate row |label_diff| times
                if repeat_importance:
                    end_k = 1
                else:
                    end_k = int(abs(label_diff)) + 1
                    
                for k in range(end_k):
                    
                    # Delta features: for feature (a, b), represent as (a, b, a-b)
                    # Format: (i, j, query_id, URLi, URLj, URLi-URLj (?), label_diff)
                    if delta_features:
                        new_row = np.hstack((i,
                                             j,
                                             qid,
                                             tdf.iloc[i, 1:-1], 
                                             tdf.iloc[j, 1:-1], 
                                             tdf.iloc[i, 1:-1] - tdf.iloc[j, 1:-1],  
                                             label_diff))
                    else:
                            new_row = np.hstack((i,
                                                 j,
                                                 qid,
                                                 tdf.iloc[i, 1:-1], 
                                                 tdf.iloc[j, 1:-1],  
                                                 label_diff))
                        
                    features[idx] = new_row
                    idx += 1

        print(progress)
        
    features = features[~np.isnan(features[:, 0])]
    return features

In [54]:
def build_model(features, ):
    # Includes query_id in features
    X = features[:, 2:-1]
    y = features[:, -1]
    
    # Same parameters for all calls to ensure consistency
    xgbr = XGBRegressor(max_depth=6, 
                        learning_rate=0.1,
                        n_estimators=10, # CHANGE ?
                        objective='reg:squarederror')
    xgbr.fit(X, y)
    
    # Want to make predictions on every URL pair within a query, for all queries
    # Avoid predicting on rows that were repeated above
    # Combo of i, j query_id ensures that unique will work
    feat_unique = np.unique(features[:, :-1], axis=0)
    X_unique = feat_unique[:, 2:]
    y_unique = feat_unique[:, -1]
    y_pred = xgbr.predict(X_unique)
    
    # idx stores progress thru y_pred
    idx = 0
    
    # For each query, make a prediction matrix
    for qid in np.unique(X_unique[:, 0]):
        
        m = np.sum(X_unique[:, 0] == qid)
        mat = np.full(shape=(m, m), fill_value=np.nan)
        
        # Fill in prediction matrix
        for i in range(m * m):
            
            (a, b) = feat_unique[feat_unique[:, 2] == qid][i, :2] 
            mat[a, b] = y_pred[idx]
            idx += 1
        
        # Apply logistic function
        mat = expit(mat)
        
        # Sum across rows to get 'power' of each individual training example
        # Get ranking using the scores as indices
        scores = np.sum(mat, axis=0)
        ranking = np.argsort(scores)
        
        # Apply ranking to original labels
        r = y_unique[ranking]
        print('Query %d, m=%d, NDCG=%.4f' % (qid, m, ndcg_at_k(r=r, k=m)))
        

## Testing below

In [83]:
my_df = generate_dataframe('/Users/Ashtekar15/Desktop/Thesis/MGBoost/other/test_data/ranking/MSLR-WEB10K/Fold1/vali.txt', 
                           n_queries=3, 
                           seed=1)

In [84]:
my_f = generate_features(my_df, 
                         repeat_importance=False, 
                         two_sided=True, 
                         delta_features=True)

0
1
2


In [85]:
my_f.shape

(107391, 412)

In [41]:
features = my_f

In [42]:
# # Includes query_id in features
# X = features[:, 2:-1]
# y = features[:, -1]

# # # Same parameters for all calls to ensure consistency
# # xgbr = XGBRegressor(max_depth=6, 
# #                     learning_rate=0.1,
# #                     n_estimators=30,
# #                     objective='reg:squarederror')
# # xgbr.fit(X, y)

# # Want to make predictions on every URL pair within a query, for all queries
# # Avoid predicting on rows that were repeated above
# # Combo of i, j query_id ensures that unique will work
# X_unique = np.unique(features[:, :-1], axis=0)
# X_unique = X_unique[:, 2:]
# # y_pred = xgbr.predict(X_unique)

In [46]:
# np.unique(X_unique[:, 0])

array([ 3535., 15925., 28990.])

In [52]:
# (np.sum(my_df.query_id == 3535) ** 2 + np.sum(my_df.query_id == 15925) ** 2 + np.sum(my_df.query_id == 28990) ** 2) / 2 

28561.5

In [50]:
# count = 0
# for qid in np.unique(X_unique[:, 0]):
#     print(np.sum(X_unique[:, 0] == qid))
#     count += np.sum(X_unique[:, 0] == qid)
# count

8646
8646
11476


28768

In [105]:
np.sum(my_df.query_id == qid)

131

In [94]:
# Includes query_id in features
X = features[:, 2:-1]
y = features[:, -1]

# Same parameters for all calls to ensure consistency
xgbr = XGBRegressor(max_depth=6, 
                    learning_rate=0.1,
                    n_estimators=100, # CHANGE ?
                    objective='reg:squarederror')
xgbr.fit(X, y)

print('Model fitted')

# Want to make predictions on every URL pair within a query, for all queries
# Avoid predicting on rows that were repeated above
# Combo of i, j query_id ensures that unique will work
feat_unique = np.unique(features, axis=0)
X_unique = feat_unique[:, 2:-1]
y_unique = feat_unique[:, -1]
y_pred = xgbr.predict(X_unique)

# idx stores progress thru y_pred
idx = 0

# For each query, make a prediction matrix
for qid in np.unique(X_unique[:, 0]):

    m = np.sum(X_unique[:, 0] == qid)
    mat = np.full(shape=(m, m), fill_value=np.nan)

    # Fill in prediction matrix
    for i in range(m):

        (a, b) = feat_unique[feat_unique[:, 2] == qid][i, :2] 
        mat[int(a), int(b)] = y_pred[idx]
        idx += 1

    # Apply logistic function
    mat = expit(mat)

    # Sum across rows to get 'power' of each individual training example
    # Get ranking using the scores as indices
    scores = np.sum(mat, axis=0)
    ranking = np.argsort(scores)

    # Apply ranking to original labels
    r = y_unique[X_unique[:, 0] == qid][ranking]
    print('Query %d, m=%d, NDCG=%.4f' % (qid, m, ndcg_at_k(r=r, k=m)))


Query 3535, m=8646, NDCG=0.0427


KeyboardInterrupt: 