In [3]:
import sys

def save_data(group_data,output_feature,output_group):
    """Write one group of rows to the feature file and its size to the group file.

    Args:
        group_data: list of token lists. In each row, token 0 is written as
            the leading field, token 1 is skipped, and the remaining tokens
            are ``index:value`` pairs.
        output_feature: writable text stream receiving one line per row.
        output_group: writable text stream receiving the row count of the group.

    Nothing is written to either stream when ``group_data`` is empty.
    """
    group_size = len(group_data)
    if group_size == 0:
        return

    output_group.write(str(group_size)+"\n")
    for row in group_data:
        kept_pairs = []
        for pair in row[2:]:
            # Keep only pairs whose value part is non-zero.
            value_text = pair.split(':')[1]
            if float(value_text) != 0.0:
                kept_pairs.append(pair)
        # The separating space is written even when no pairs survive.
        output_feature.write(row[0] + " " + " ".join(kept_pairs) + "\n")


In [4]:
def prepare_data(datafile, featurefile, groupfile):
    """Split a ranking data file into a feature file and a group-size file.

    Each input line is cut at the first ``#`` (anything after it is dropped)
    and split on whitespace. The second token is used as the group key:
    consecutive lines sharing that key form one group, which is flushed via
    ``save_data`` whenever the key changes and once more at end of file.

    Args:
        datafile: path of the input file to read.
        featurefile: path of the feature file to (over)write.
        groupfile: path of the group-size file to (over)write.

    Raises:
        IndexError: if a non-empty line has fewer than two tokens.
    """
    # Context managers close all three files even if a malformed line raises
    # part-way through, so no handle (or unflushed output) is leaked.
    with open(datafile) as fi, \
            open(featurefile, "w") as output_feature, \
            open(groupfile, "w") as output_group:
        group_data = []
        group = ""
        for line in fi:
            if "#" in line:
                line = line[:line.index("#")]
            # split() with no argument tolerates tabs and repeated spaces and
            # never yields empty tokens.
            splits = line.split()
            if not splits:
                # Blank or comment-only line: nothing to record.
                continue
            if splits[1] != group:
                save_data(group_data,output_feature,output_group)
                group_data = []
            group = splits[1]
            group_data.append(splits)

        # Flush the final group, which has no following key change.
        save_data(group_data,output_feature,output_group)

In [11]:
# Convert each MQ2008 Fold1 split into a feature file and a group file.
for source_path, feature_path, group_path in (
    ('MQ2008/Fold1/train.txt', 'mq2008.train', 'mq2008.train.group'),
    ('MQ2008/Fold1/test.txt', 'mq2008.test', 'mq2008.test.group'),
    ('MQ2008/Fold1/vali.txt', 'mq2008.vali', 'mq2008.vali.group'),
):
    prepare_data(source_path, feature_path, group_path)

In [12]:
import sklearn
from sklearn.datasets import load_svmlight_file, load_svmlight_files

# Load the three splits in a single call so that all matrices share the same
# number of columns. The feature files omit zero-valued features, so loading
# each file on its own lets the column count be inferred per file and the
# splits could end up with different widths.
x_train, y_train, x_valid, y_valid, x_test, y_test = load_svmlight_files(
    ["mq2008.train", "mq2008.vali", "mq2008.test"]
)

In [13]:
import numpy as np

# Read the group files (one group size per line) for the train, validation
# and test splits, in that order.
q_train, q_valid, q_test = (
    np.loadtxt(group_file)
    for group_file in (
        'mq2008.train.group',
        'mq2008.vali.group',
        'mq2008.test.group',
    )
)

In [14]:
# Sanity check: the group sizes should add up to the number of training rows.
total_grouped_rows = sum(q_train)
print(total_grouped_rows, x_train.shape)

9630.0 (9630, 46)


In [25]:
import lightgbm as lgb


def _decayed_learning_rate(round_index):
    """Return 0.1 scaled by 0.95 ** round_index.

    ``round_index`` is whatever ``lgb.reset_parameter`` passes in —
    presumably the current boosting round; confirm against the LightGBM docs.
    """
    return 0.95 ** round_index * 0.1


gbm = lgb.LGBMRanker()

# Fit with the per-group sizes loaded from the group files, evaluating
# NDCG@1 and NDCG@3 on the validation split with early stopping.
gbm.fit(
    x_train,
    y_train,
    group=q_train,
    eval_set=[(x_valid, y_valid)],
    eval_group=[q_valid],
    eval_at=[1, 3],
    early_stopping_rounds=20,
    verbose=True,
    callbacks=[lgb.reset_parameter(learning_rate=_decayed_learning_rate)],
)

[1]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0
Training until validation scores don't improve for 20 rounds
[2]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.567973
[3]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@3: 0.156426
[4]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.333333
[5]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.0782131
[6]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.098694
[7]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.176907
[8]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.176907
[9]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.469279
[10]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.469279
[11]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.469279
[12]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.469279
[13]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.547492
[14]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.547492
[15]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@3: 0.391066
[16]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@3: 0.156426
[17]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@3: 0.391066
[18]	valid_0's ndcg@1: 0.333333	valid_0's 

LGBMRanker(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
           importance_type='split', learning_rate=0.1, max_depth=-1,
           min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
           n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
           random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
           subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [26]:
from scipy.stats import spearmanr

# Spearman rank correlation between the training targets and the scores the
# fitted model assigns to the training rows.
preds_train = gbm.predict(x_train)
train_rank_correlation = spearmanr(y_train, preds_train)
train_rank_correlation

SpearmanrResult(correlation=0.3843555632065391, pvalue=0.0)

In [27]:
# Experiment: treat every split as ONE group spanning all of its rows.
# NOTE: this overwrites q_train / q_valid / q_test (previously the per-group
# sizes loaded from the group files) and replaces the earlier `gbm`.
q_train, q_valid, q_test = (
    [matrix.shape[0]] for matrix in (x_train, x_valid, x_test)
)
print(q_train)

gbm = lgb.LGBMRanker()
fit_options = dict(
    group=q_train,
    eval_set=[(x_valid, y_valid)],
    eval_group=[q_valid],
    eval_at=[1, 3],
    early_stopping_rounds=20,
    verbose=True,
    callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)],
)
gbm.fit(x_train, y_train, **fit_options)

[9630]
[1]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0
Training until validation scores don't improve for 20 rounds
[2]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.098694
[3]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.0782131
[4]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.0782131
[5]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.0782131
[6]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.333333
[7]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.333333
[8]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.333333
[9]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.333333
[10]	valid_0's ndcg@1: 0	valid_0's ndcg@3: 0.333333
[11]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@3: 0.687148
[12]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.843574
[13]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.843574
[14]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.843574
[15]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.843574
[16]	valid_0's ndcg@1: 1	valid_0's ndcg@3: 0.843574
[17]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@3: 0.530721
[18]	valid_0's ndcg@1: 0.333333	valid_0's ndcg@

LGBMRanker(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
           importance_type='split', learning_rate=0.1, max_depth=-1,
           min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
           n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
           random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
           subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [21]:
# Score the training rows with the current `gbm` and report the Spearman rank
# correlation against the training targets.
# NOTE(review): this cell's execution count (In [21]) is lower than the
# preceding cell's (In [27]), so it is unclear which fitted `gbm` produced the
# saved output — re-run the notebook top to bottom to confirm.
preds_train = gbm.predict(x_train)
spearmanr(a=y_train, b=preds_train)

SpearmanrResult(correlation=0.4481291965266587, pvalue=0.0)

In [22]:
# Number of rows in the training feature matrix.
n_train_rows, _n_train_cols = x_train.shape
n_train_rows

9630

In [23]:
# Reload the per-group sizes for the training split from disk (q_train was
# replaced by a single-element list in an earlier cell) and display them.
train_group_file = 'mq2008.train.group'
q_train = np.loadtxt(train_group_file)
q_train

array([  8.,   8.,   8.,   8.,   8.,  16.,   8., 118.,  16.,   8.,   8.,
         8.,   7.,   8.,  16.,   8.,  16.,   8.,  32.,   8.,   8.,   8.,
        31.,   8.,   8.,  15.,   8.,  15.,   8.,  28.,   7.,  62.,   8.,
         8.,  16.,  16.,   8.,   8.,  15.,   8., 118.,   8.,  16.,   8.,
         8.,  16.,  16.,   8.,  16.,   8.,   8.,   8.,   8.,   8.,  16.,
         8.,  30.,   8.,   8.,   8.,   8.,   8.,  28.,   8., 113.,   7.,
        15.,  25.,   8.,  16.,   8.,  16.,  59.,   8.,   7.,   8.,  31.,
        30.,  31.,  32.,   8.,   8.,  16.,   8.,  31.,  15.,  15.,  28.,
        32.,   8.,  31.,  59.,   8.,   7.,  55.,   8.,   8.,   8.,  32.,
         7.,   8.,  16.,  58.,   8.,  15.,  29.,   8.,  31.,   8.,   8.,
        16.,   8.,   8.,   6.,   8.,  62.,   8.,   8.,  60.,   8.,   8.,
         8.,   8.,   8.,   8.,   7.,   8.,  16.,  13.,   8.,   7.,   8.,
         8.,   8.,   8., 114.,  15.,  31.,   8.,   7.,   8.,   8.,  29.,
         7.,  25.,   8.,   8.,   8., 118.,   7.,   