## LightGBM - Ranking with LambdaRank

We will demonstrate how to use the LightGBM ranker.


This sample demonstrates how to use the following APIs:
- `LightGBMRanker`

In [None]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import explode

Read the SVMLight training data file and give each row an id

In [None]:

# Load the LibSVM-formatted training rows and tag each one with an 'iid'
# column, which serves as the join key against the per-row query ids.
df1 = (
    spark.read.format('libsvm')
    .load("https://github.com/microsoft/LightGBM/raw/master/examples/lambdarank/rank.train")
    .withColumn('iid', monotonically_increasing_id())
)

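Create a helper function that expands one row of the query file (a document count) into a list containing that row's query id repeated once per document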

In [None]:
import numpy as np
from pyspark.sql.types import ArrayType, LongType
def create_rows(value, index):
    """Return a list holding ``index`` repeated ``value`` times.

    Args:
        value: Number of entries to generate (a non-negative integer).
        index: Integer id to repeat.

    Returns:
        A list of ``value`` Python ints, each equal to ``index``.
    """
    # Build the array as int64 from the start instead of filling a float64
    # array and casting afterwards: float64 cannot represent every integer
    # above 2**53, and the plain 'int' dtype can be 32-bit on some platforms,
    # either of which would silently corrupt large ids.
    return np.full(value, index, dtype=np.int64).tolist()
# `udf` is not imported by any earlier cell, so bring it into scope here to
# keep the notebook runnable on a fresh kernel.
from pyspark.sql.functions import udf

# Wrap create_rows as a Spark UDF whose declared return type is array<long>.
create_rows_udf = udf(create_rows, ArrayType(LongType()))

In [None]:
# Quick check of the helper outside Spark: repeat the id 1 thirteen times.
create_rows(value=13, index=1)

Read the CSV query file and join with the SVMLight training data

In [None]:
# Column names reused by the data pipelines and the ranker configuration.
query_col, label_col = 'query', 'labels'

# Read the query file as CSV. Each row's `_c0` value is used as a repeat count:
# the row's own id is repeated `_c0` times and exploded into one row per copy,
# producing a query id per document. A fresh 'iid' is then assigned so the
# result can be joined with df1 on 'iid'; the LibSVM 'label' column is renamed.
# NOTE(review): the join assumes both frames receive identical
# monotonically_increasing_id values, which depends on partitioning — confirm.
df2 = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .load("https://github.com/microsoft/LightGBM/raw/master/examples/lambdarank/rank.train.query")
    .withColumn('index', monotonically_increasing_id())
    .withColumn(query_col, explode(create_rows_udf('_c0', 'index')))
    .withColumn('iid', monotonically_increasing_id())
    .drop('_c0', 'index')
    .join(df1, 'iid')
    .drop('iid')
    .withColumnRenamed('label', label_col)
)

In [None]:
# Report the row count of the raw training frame (df1), then show the schema
# and a ten-row preview of the joined frame (df2).
print("records read:", df1.count())
print("Schema: ")
df2.printSchema()
df2.limit(10).toPandas()

In [None]:
from mmlspark.lightgbm import LightGBMRanker

Create the LightGBMRanker

In [None]:
features_col = 'features'

# Configure the ranker: which columns hold labels, features and query groups,
# the names of the output columns, and the training settings.
lgbm_ranker = LightGBMRanker(
    labelCol=label_col,
    featuresCol=features_col,
    groupCol=query_col,
    predictionCol='preds',
    leafPredictionCol='leafPreds',
    featuresShapCol='importances',
    repartitionByGroupingColumn=True,
    numLeaves=32,
    numIterations=200,
    evalAt=[1, 3, 5],
    metric='ndcg',
)

In [None]:
# Fit the configured ranker on the joined training frame (df2) and keep the
# resulting model for scoring.
lgbm_ranker_model = lgbm_ranker.fit(df2)

Read in the SVMLight test file and its query file

In [None]:

# Load the LibSVM-formatted *test* rows (rank.test, not the training file) and
# tag each one with an 'iid' column used as the join key below.
dt1 = (
    spark.read.format('libsvm')
    .load("https://github.com/microsoft/LightGBM/raw/master/examples/lambdarank/rank.test")
    .withColumn('iid', monotonically_increasing_id())
)

# Expand the test query file into one query id per document (each row's id
# repeated `_c0` times), assign a fresh 'iid', and join with the test rows.
# The join must use dt1: joining df1 here would attach training features to
# the test queries.
# NOTE(review): the join assumes both frames receive identical
# monotonically_increasing_id values, which depends on partitioning — confirm.
dt2 = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .load("https://github.com/microsoft/LightGBM/raw/master/examples/lambdarank/rank.test.query")
    .withColumn('index', monotonically_increasing_id())
    .withColumn(query_col, explode(create_rows_udf('_c0', 'index')))
    .withColumn('iid', monotonically_increasing_id())
    .drop('_c0', 'index')
    .join(dt1, 'iid')
    .drop('iid')
    .withColumnRenamed('label', label_col)
)

Get the predictions from LambdaRank

In [None]:
# Apply the fitted ranker model to the test frame (dt2).
predictions = lgbm_ranker_model.transform(dt2)

In [None]:
# Preview the first ten prediction rows as a pandas DataFrame.
predictions.limit(10).toPandas()