In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [None]:
def generate_dataframe(filepath, n_queries=30, seed=1):
    """Load a space-separated ranking file and keep a random sample of queries.

    Each input line is read as ``<label> <key>:<query id> <key>:<value> ...``;
    for every column except the label, the number after the first colon is
    parsed as a float.

    Parameters
    ----------
    filepath : str or path-like
        File handed to ``pandas.read_csv`` (``sep=' '``, no header row).
    n_queries : int, default 30
        Number of distinct query ids to keep. Capped at the number of
        distinct ids present in the file.
    seed : int, default 1
        Seed applied to NumPy's global random state so that the sampled
        queries are reproducible.

    Returns
    -------
    pandas.DataFrame
        The rows belonging to the sampled queries, in file order. Columns are
        ``query_id`` first, then the parsed feature columns (named by their
        integer position in the file), then ``label`` last.
    """
    # For reproducible results from randomly selecting queries
    np.random.seed(seed)

    raw = pd.read_csv(filepath, sep=' ', header=None)

    # A trailing space on every line makes read_csv emit an all-NaN final
    # column. Drop it only when it really is empty so that files without the
    # trailing space do not silently lose their last feature.
    if raw.shape[1] > 0 and raw.iloc[:, -1].isna().all():
        raw = raw.iloc[:, :-1]

    # First column: hand-labeled score, second column: query id
    raw = raw.rename(columns={0: 'label', 1: 'query_id'})

    # Sample without replacement so exactly n_sample *distinct* queries are kept
    qids = raw.query_id.unique()
    n_sample = min(n_queries, len(qids))
    qids = np.random.choice(qids, size=n_sample, replace=False)

    # Only keep rows for the queries of interest
    sampled = raw[raw.query_id.isin(qids)]
    labels = sampled.label

    # Number after the first colon, for every column other than label
    pattern = re.compile(r':(.*)')

    def parse_token(token):
        return float(pattern.findall(token)[0])

    # Column-wise Series.map instead of the deprecated DataFrame.applymap
    features = sampled.iloc[:, 1:].apply(lambda column: column.map(parse_token))

    # Put features and labels in the same dataframe (label goes last)
    features['label'] = labels

    return features

In [None]:
def generate_features(df, repeat_importance, two_sided, delta_features):
    """Build pairwise comparison rows between the URLs of each query.

    ``df`` is read in the layout produced by ``generate_dataframe``: the
    ``query_id`` column first, the ``label`` column last, and the feature
    columns in between. For every query, each pair of rows (URL i, URL j) is
    turned into one output row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``query_id`` as first column and ``label`` as last column.
    repeat_importance : bool
        If True, a pair's row is emitted ``int(abs(label_i - label_j)) + 1``
        times in a row, so pairs with a larger label gap carry more weight.
        If False, every pair is emitted exactly once.
    two_sided : bool
        If True, use every ordered pair (i, j). If False, only pairs with
        ``j >= i`` (self-pairs included).
    delta_features : bool
        If True, append the element-wise difference of the two feature
        vectors as a third feature block.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, n_features * b + 2)`` with ``b = 3``
        when ``delta_features`` else ``2``. Each row holds: features of URL i,
        features of URL j, optionally (features i - features j), the query id,
        and ``label_i - label_j``. Rows are ordered by query (first
        appearance in ``df``), then i, then j.

    Notes
    -----
    Prints the zero-based index of each query once it has been processed.
    """
    # Feature columns are everything between query_id (first) and label (last)
    n_features = df.shape[1] - 2
    n_blocks = 3 if delta_features else 2
    n_columns = n_features * n_blocks + 2

    chunks = []

    # Compare each URL for a given query
    for progress, qid in enumerate(df.query_id.unique()):

        # tdf: rows of this query, m: number of URLs in tdf
        tdf = df[df.query_id == qid]
        m = tdf.shape[0]

        # Pull the data out of pandas once per query instead of once per pair
        values = tdf.iloc[:, 1:-1].to_numpy(dtype=float)
        labels = tdf.label.to_numpy()

        # Index arrays for (first URL, second URL), row-major so the order
        # matches a nested "for i ... for j ..." loop
        if two_sided:
            first = np.repeat(np.arange(m), m)
            second = np.tile(np.arange(m), m)
        else:
            first, second = np.triu_indices(m)

        label_diff = labels[first] - labels[second]

        if repeat_importance:
            # Emit each pair |label_diff| + 1 times, consecutively
            repeats = np.abs(label_diff).astype(int) + 1
            first = np.repeat(first, repeats)
            second = np.repeat(second, repeats)
            label_diff = np.repeat(label_diff, repeats)

        left = values[first]
        right = values[second]

        blocks = [left, right]
        if delta_features:
            blocks.append(left - right)
        blocks.append(np.full((len(first), 1), qid, dtype=float))
        blocks.append(np.asarray(label_diff, dtype=float).reshape(-1, 1))

        chunks.append(np.hstack(blocks))

        print(progress)

    if not chunks:
        return np.empty((0, n_columns))

    return np.vstack(chunks)

In [None]:
# Load vali.txt and keep a random sample of five queries.
# NOTE(review): `path` is not defined in the cells visible here — confirm it is
# set earlier in the notebook, otherwise this fails on a fresh kernel.
my_df = generate_dataframe(
    filepath=path + 'vali.txt',
    n_queries=5,
)

In [None]:
# Build the pairwise feature matrix for the sampled queries:
# one-sided pairs, with the feature-difference block included.
my_f = generate_features(
    my_df,
    repeat_importance=False,
    two_sided=False,
    delta_features=True,
)

In [None]:
# The last two columns of my_f are the query id and the label difference:
# drop both from the model inputs and use the label difference as the target.
X = my_f[:, :-2]
y = my_f[:, -1]

xgbr = XGBRegressor(max_depth=6,
                    learning_rate=0.1,
                    n_estimators=100,
                    objective='reg:squarederror')
xgbr.fit(X, y)

# In-sample error: the model is scored on the same rows it was fitted on
mean_absolute_error(y, xgbr.predict(X))