<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
# Notebook setup (no libraries are imported here):
# load the autoreload extension and set it to mode 2, so edited modules are
# re-imported automatically before code is run
%load_ext autoreload
%autoreload 2

# plotting backend: figures are rendered inline; the commented-out line is the
# alternative "notebook" backend
#%matplotlib notebook
%matplotlib inline

In [None]:
# Uncomment the two lines below to install lightgbm into this kernel's Python environment
#import sys
#!{sys.executable} -m pip install lightgbm

In [None]:
import os
import os.path
import numpy as np
import lightgbm

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

# Train LightGBM and optimize NDCG

In [None]:
# Location of the training split; fail early with a clear message if it is missing

train_file = "C:/opt/kiis-training/MSLR-WEB10K/Fold1/train.txt"

# isfile() is already False for a path that does not exist, so a single test
# covers both "missing" and "not a regular file"
if not os.path.isfile(train_file):
    raise FileNotFoundError("'{}': no such file".format(train_file))

In [None]:
from sklearn.datasets import load_svmlight_file
# see http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html

# Parse the svmlight-format training file. Because query_id=True is passed, the
# result is a 3-item tuple: [0] feature matrix X, [1] labels Y, [2] one query id per row.
train_data = load_svmlight_file(train_file, query_id=True)

In [None]:
# Report the dimensions of the three parts of the loaded split
for position, caption in enumerate(("X shape:", "Y shape:", "qid shape: ")):
    print(caption, train_data[position].shape)

In [None]:
# Wrap the loaded split in a LightGBM Dataset

import itertools

# `group` is given as a list of sizes: the length of every run of equal,
# consecutive query ids, in file order
query_lens = [len(list(run)) for _, run in itertools.groupby(train_data[2])]
train_lgb = lightgbm.Dataset(train_data[0], label=train_data[1], group=query_lens)



In [None]:
# LambdaRank training configuration
# see http://lightgbm.readthedocs.io/en/latest/Parameters.html

params = {
    # what to optimize during training
    'objective': 'lambdarank',
    # threshold used in optimizing lambdarank (NDCG)
    'max_position': 10,
    'learning_rate': 0.1,
    'num_leaves': 32,
    'min_data_in_leaf': 5,
    # what to use/print for evaluation
    'metric': ['ndcg'],
    'ndcg_eval_at': 10,
}

# The training split is also the only evaluation set here, so the NDCG that is
# printed every round is a training score.
lgbm_model = lightgbm.train(
    params=params,
    train_set=train_lgb,
    num_boost_round=100,
    valid_sets=[train_lgb],
    verbose_eval=True,
)

In [None]:
# A more reasonable setting: train/valid/test

import itertools
from sklearn.datasets import load_svmlight_file
# see http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html


train_file = "C:/opt/kiis-training/MSLR-WEB10K/Fold1/train.txt"
valid_file = "C:/opt/kiis-training/MSLR-WEB10K/Fold1/vali.txt"
test_file  = "C:/opt/kiis-training/MSLR-WEB10K/Fold1/test.txt"


def _load_ranking_dataset(path):
    """Read one svmlight split with query ids and wrap it as a LightGBM Dataset.

    Parameters
    ----------
    path : str
        Path of an svmlight-format file; it is parsed with ``query_id=True``.

    Returns
    -------
    lightgbm.Dataset
        Dataset built from the file's features and labels, with ``group`` set
        to the sizes of the runs of equal, consecutive query ids.

    Raises
    ------
    FileNotFoundError
        If ``path`` is not an existing regular file.
    """
    # same early, readable failure as the check done for the first training cell
    if not os.path.isfile(path):
        raise FileNotFoundError("'" + path + "': no such file")

    features, labels, qids = load_svmlight_file(path, query_id=True)
    # groupby only merges *adjacent* equal ids, which matches the row order of the file
    group_sizes = [sum(1 for _ in run) for _, run in itertools.groupby(qids)]
    return lightgbm.Dataset(data=features, label=labels, group=group_sizes)


# One call per split replaces three copy-pasted stanzas that reused the same
# temporary names for different data.
train_lgb = _load_ranking_dataset(train_file)
valid_lgb = _load_ranking_dataset(valid_file)
test_lgb  = _load_ranking_dataset(test_file)



In [None]:
# LambdaRank with NDCG@10 tracked on the train, validation and test splits
params = {
    # what to optimize during training
    'objective': 'lambdarank',
    # threshold used in optimizing lambdarank (NDCG)
    'max_position': 10,
    'learning_rate': 0.1,
    'num_leaves': 16,
    'min_data_in_leaf': 5,
    # what to use/print for evaluation
    'metric': ['ndcg'],
    'ndcg_eval_at': 10,
}

# handed to train() as evals_result; read back afterwards as lgbm_info[<name>][<metric>]
lgbm_info = {}

# NOTE(review): evals_result / verbose_eval are, as far as I know, no longer
# accepted by lightgbm.train in LightGBM >= 4.0 (callbacks replace them) —
# confirm against the installed version.
lgbm_model = lightgbm.train(
    params=params,
    train_set=train_lgb,
    num_boost_round=200,
    valid_sets=[train_lgb, valid_lgb, test_lgb],
    valid_names=["train", "valid", "test"],
    evals_result=lgbm_info,
    verbose_eval=10,
)


# display the recorded metric history
lgbm_info

In [None]:
import matplotlib.pyplot as plt

# NDCG@10 after each boosting round, one curve per evaluation split
fig, ax = plt.subplots(figsize=(9, 6), tight_layout=True)

# (key in lgbm_info, legend label)
for split, legend_label in (('train', 'training'),
                            ('valid', 'validation'),
                            ('test', 'test')):
    ax.plot(lgbm_info[split]['ndcg@10'], label=legend_label)

ax.grid()
ax.legend()
ax.set_xlabel("# Trees")
ax.set_ylabel("ndcg@10")
# NDCG is a ranking-quality score (higher is better), so it must not be
# titled as an error; the trailing ';' hides the Text repr of the last call
ax.set_title("Model NDCG@10");

# Advanced example with custom optimization

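Let's optimize MSE, which we know well. We first only *monitor* a custom MSE metric while still training with LambdaRank, and then replace the training objective itself with a custom MSE loss.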

- see: https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py

In [None]:
def mse_eval(preds, train_data):
    """Self-defined evaluation metric: half of the mean squared error.

    Written in the form
    f(preds: array, train_data: Dataset) -> name: string, value: float, is_higher_better: bool

    Parameters
    ----------
    preds : array
        Predicted scores, one per row of ``train_data``.
    train_data : Dataset
        Object whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple
        ``('Custom-MSE', 0.5 * mean((labels - preds)**2), False)``; the final
        ``False`` is the is_higher_better flag.
    """
    residuals = train_data.get_label() - preds
    half_mse = 0.5 * np.mean(residuals ** 2)
    return 'Custom-MSE', half_mse, False

In [None]:
# Still train with LambdaRank, but evaluate only with the self-defined metric
params = {
    # what to optimize during training
    'objective': 'lambdarank',
    # threshold used in optimizing lambdarank (NDCG)
    'max_position': 10,
    'learning_rate': 0.1,
    'num_leaves': 16,
    'min_data_in_leaf': 5,
    # built-in evaluation switched off (previously ['ndcg'] with 'ndcg_eval_at': 10);
    # the metric comes from feval below
    'metric': ['None'],
}

# handed to train() as evals_result; read back afterwards as lgbm_info[<name>]['Custom-MSE']
lgbm_info = {}

# NOTE(review): evals_result / verbose_eval are, as far as I know, no longer
# accepted by lightgbm.train in LightGBM >= 4.0 — confirm the installed version.
lgbm_model = lightgbm.train(
    params=params,
    train_set=train_lgb,
    num_boost_round=100,
    feval=mse_eval,
    valid_sets=[train_lgb, valid_lgb, test_lgb],
    valid_names=["train", "valid", "test"],
    evals_result=lgbm_info,
    verbose_eval=10,
)


# display the recorded metric history
lgbm_info

In [None]:
import matplotlib.pyplot as plt

# Custom-MSE after each boosting round, one curve per evaluation split
fig, ax = plt.subplots(figsize=(9,6), tight_layout=True)

# (key in lgbm_info, legend label)
for split, legend_label in (('train', 'training'),
                            ('valid', 'validation'),
                            ('test', 'test')):
    ax.plot(lgbm_info[split]['Custom-MSE'], label=legend_label)

ax.grid()
ax.legend()
ax.set_xlabel("# Trees")
ax.set_ylabel("Custom-MSE")
ax.set_title("Model Error")

In [None]:
def mse_grads(preds, train_data):
    """Self-defined objective: first and second derivatives of a squared-error loss.

    Written in the form
    f(preds: array, train_data: Dataset) -> grad: array, hess: array

    Parameters
    ----------
    preds : array
        Current predictions, one per row of ``train_data``.
    train_data : Dataset
        Object whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple of arrays
        ``(preds - labels, ones)``; both arrays have the same shape.
    """
    # the gradient is (preds - labels); (labels - preds) would be the negative gradient
    residual = preds - train_data.get_label()
    # constant second derivative of 1 for every sample
    return residual, np.ones_like(residual)

In [None]:
# Train with both self-defined pieces: mse_grads as objective, mse_eval as metric
params = {
    # no built-in objective in this run ('objective': 'lambdarank' and
    # 'max_position': 10 are switched off); the loss is supplied through fobj below
    'learning_rate': 0.1,
    'num_leaves': 16,
    'min_data_in_leaf': 5,
    # built-in evaluation switched off (previously ['ndcg'] with 'ndcg_eval_at': 10)
    'metric': ['None'],
    # try printing ndcg and testing
}

# handed to train() as evals_result; read back afterwards as lgbm_info[<name>]['Custom-MSE']
lgbm_info = {}

# NOTE(review): fobj / evals_result / verbose_eval are, as far as I know, no
# longer accepted by lightgbm.train in LightGBM >= 4.0 — confirm the installed version.
lgbm_model = lightgbm.train(
    params=params,
    train_set=train_lgb,
    num_boost_round=100,
    fobj=mse_grads,
    feval=mse_eval,
    valid_sets=[train_lgb, valid_lgb, test_lgb],
    valid_names=["train", "valid", "test"],
    evals_result=lgbm_info,
    verbose_eval=10,
)


# display the recorded metric history
lgbm_info

In [None]:
import matplotlib.pyplot as plt

# Custom-MSE after each boosting round, one curve per evaluation split
fig, ax = plt.subplots(figsize=(9,6), tight_layout=True)

# (key in lgbm_info, legend label)
for split, legend_label in (('train', 'training'),
                            ('valid', 'validation'),
                            ('test', 'test')):
    ax.plot(lgbm_info[split]['Custom-MSE'], label=legend_label)

ax.grid()
ax.legend()
ax.set_xlabel("# Trees")
ax.set_ylabel("Custom-MSE")
ax.set_title("Model Error")

# Suggestion

If you need your custom objective and evaluation functions to be fast, implement them in Cython.