In [2]:
import pyspark

# com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc1 Cannot Find Package
spark = pyspark.sql.SparkSession.builder.appName("MyApp") \
            .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:0.18.0") \
            .getOrCreate()

import mmlspark

https://github.com/Azure/mmlspark/blob/master/notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb
https://mmlspark.blob.core.windows.net/docs/1.0.0-rc1/pyspark/mmlspark.lightgbm.html

In [17]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from mmlspark.train import ComputeModelStatistics
from mmlspark.lightgbm import LightGBMRegressor
import numpy as np
import sys

In [22]:
print("System version: {}".format(sys.version))
print("PySpark version: {}".format(pyspark.version.__version__))

System version: 3.7.6 | packaged by conda-forge | (default, Mar 23 2020, 23:03:20) 
[GCC 7.3.0]
PySpark version: 2.4.5


In [73]:
input_path = "/home/jovyan/work/ScalaAndSpark/pima-indians-diabetes.csv" # https://uzshare.com/view/811490

pima_indian_dataset = spark.read.format("csv")\
    .option("sep", ",")\
    .option("inferSchema", "true")\
    .option("header", "false")\
    .load(input_path)\
    .toDF("Pregnancles","Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFuction","Age","Outcome")

In [40]:
pima_indian_dataset.show()

+-----------+-------+-------------+-------------+-------+----+-----------------------+---+-------+
|Pregnancles|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFuction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+-----------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                  0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                  0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                  0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                  0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                  2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                  0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                  0.248| 26|      1|
|         

In [43]:
from pyspark.ml.feature import VectorAssembler

In [63]:
features = pima_indian_dataset.columns[0:-1]
label = pima_indian_dataset.columns[-1]

In [64]:
va = VectorAssembler().setInputCols(features).setOutputCol("features")
va.transform(pima_indian_dataset).show()

+-----------+-------+-------------+-------------+-------+----+-----------------------+---+-------+--------------------+
|Pregnancles|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFuction|Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+-----------------------+---+-------+--------------------+
|          6|    148|           72|           35|      0|33.6|                  0.627| 50|      1|[6.0,148.0,72.0,3...|
|          1|     85|           66|           29|      0|26.6|                  0.351| 31|      0|[1.0,85.0,66.0,29...|
|          8|    183|           64|            0|      0|23.3|                  0.672| 32|      1|[8.0,183.0,64.0,0...|
|          1|     89|           66|           23|     94|28.1|                  0.167| 21|      0|[1.0,89.0,66.0,23...|
|          0|    137|           40|           35|    168|43.1|                  2.288| 33|      1|[0.0,137.0,40.0,3...|
|          5|    116|           74|     

In [69]:
from mmlspark.lightgbm import LightGBMClassifier
lgbm = LightGBMClassifier().setFeaturesCol("features").setLabelCol("Outcome")

In [77]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[va, lgbm])

In [75]:
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder()\
    .addGrid(lgbm.learningRate, [0.05, 0.1])\
    .build()

In [76]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()\
    .setLabelCol("Outcome")\
    .setRawPredictionCol("prediction")\
    .setMetricName("areaUnderROC")

In [78]:
from pyspark.ml.tuning import CrossValidator

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          seed=0)

In [79]:
train, test = pima_indian_dataset.randomSplit([.8, .2], 0)

In [80]:
lgbModel = crossval.fit(train)

In [82]:
results = lgbModel.transform(test)
auc = evaluator.evaluate(results)

print("----- AUC -----")
print(f"The Model's AUC is : {auc}")

----- AUC -----
The Model's AUC is : 0.7291482789055606
