<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Data-Preparation" data-toc-modified-id="Data-Preparation-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Data Preparation</a></span></li><li><span><a href="#Modelling" data-toc-modified-id="Modelling-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Modelling</a></span><ul class="toc-item"><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Logistic Regression</a></span></li><li><span><a href="#Different-Models" data-toc-modified-id="Different-Models-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Different Models</a></span></li></ul></li></ul></div>

# Description
MMLSpark GitHub repository: https://github.com/Azure/mmlspark  
MMLSpark LightGBM documentation: https://github.com/Azure/mmlspark/blob/master/docs/lightgbm.md  

# Load the libraries

In [1]:
import numpy as np
import pandas as pd
import os
HOME = os.path.expanduser('~')

import findspark

# LightGBM needs the Spark 2.4.6 install; the Spark 3.0.0 call is kept
# below, disabled, for reference only.
# findspark.init(HOME + "/Softwares/Spark/spark-3.0.0-bin-hadoop2.7")
spark_home = HOME + "/Softwares/Spark/spark-2.4.6-bin-hadoop2.7"
findspark.init(spark_home)

import pyspark
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Configure the session step by step, then create it (or reuse a running one).
session_builder = pyspark.sql.SparkSession.builder.appName("MyApp")

# Settings for Microsoft ML for Spark: package coordinates and the Maven
# repository they are resolved from.
session_builder = session_builder.config(
    "spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc2")
session_builder = session_builder.config(
    "spark.jars.repositories", "https://mmlspark.azureedge.net/maven")

spark = session_builder.getOrCreate()
import mmlspark

# Seed shared by the stochastic steps of the notebook.
SEED = 100

# Empty results table, one column per reported quantity.
df_eval = pd.DataFrame({
    column: []
    for column in ("Model", "Description", "Accuracy", "Precision", "AUC")
})

from pyspark.ml.feature import VectorAssembler

from pyspark.ml.classification import RandomForestClassifier
from mmlspark.lightgbm import LightGBMClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Report the PySpark version that was actually imported.
print(f'pyspark version: {pyspark.__version__}')

pyspark version: 2.4.6


# Load the data

In [2]:
# Read the CSV with a header row and let Spark infer the column types.
sdf = spark.read.csv('affairs.csv', inferSchema=True, header=True)

# (row count, column count)
print((sdf.count(), len(sdf.columns)))

# printSchema() prints the schema itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
sdf.printSchema()
sdf.show(5)

(6366, 6)
root
 |-- rate_marriage: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- yrs_married: double (nullable = true)
 |-- children: double (nullable = true)
 |-- religious: integer (nullable = true)
 |-- affairs: integer (nullable = true)

None
+-------------+----+-----------+--------+---------+-------+
|rate_marriage| age|yrs_married|children|religious|affairs|
+-------------+----+-----------+--------+---------+-------+
|            5|32.0|        6.0|     1.0|        3|      0|
|            4|22.0|        2.5|     0.0|        2|      0|
|            3|32.0|        9.0|     3.0|        3|      1|
|            3|27.0|       13.0|     3.0|        1|      1|
|            4|22.0|        2.5|     0.0|        1|      1|
+-------------+----+-----------+--------+---------+-------+
only showing top 5 rows



In [3]:
from mmlspark.stages import SummarizeData

# Per-column summary statistics, converted to pandas for display.
summarizer = SummarizeData()
summary = summarizer.transform(sdf)
summary.toPandas()

Unnamed: 0,Feature,Count,Unique_Value_Count,Missing_Value_Count,Min,1st_Quartile,Median,3rd_Quartile,Max,Sample_Variance,Sample_Standard_Deviation,Sample_Skewness,Sample_Kurtosis,P0_5,P1,P5,P95,P99,P99_5
0,rate_marriage,6366.0,5.0,0.0,1.0,4.0,4.0,5.0,5.0,0.924347,0.96143,-1.009472,0.557297,1.0,1.0,2.0,5.0,5.0,5.0
1,age,6366.0,6.0,0.0,17.5,22.0,27.0,32.0,42.0,46.893486,6.847882,0.576013,-0.733838,17.5,17.5,22.0,42.0,42.0,42.0
2,yrs_married,6366.0,7.0,0.0,0.5,2.5,6.0,16.5,23.0,53.000147,7.28012,0.725683,-0.791416,0.5,0.5,0.5,23.0,23.0,23.0
3,children,6366.0,6.0,0.0,0.0,0.0,1.0,2.0,5.5,2.054839,1.433471,0.89077,0.244131,0.0,0.0,0.0,4.0,5.5,5.5
4,religious,6366.0,4.0,0.0,1.0,2.0,2.0,3.0,4.0,0.771532,0.878369,-0.028343,-0.73352,1.0,1.0,1.0,4.0,4.0,4.0
5,affairs,6366.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.218526,0.467468,0.759494,-1.423169,0.0,0.0,0.0,1.0,1.0,1.0


# Data Preparation

In [4]:
# Peek at the first two rows before preparing features.
sdf.show(n=2)

+-------------+----+-----------+--------+---------+-------+
|rate_marriage| age|yrs_married|children|religious|affairs|
+-------------+----+-----------+--------+---------+-------+
|            5|32.0|        6.0|     1.0|        3|      0|
|            4|22.0|        2.5|     0.0|        2|      0|
+-------------+----+-----------+--------+---------+-------+
only showing top 2 rows



In [5]:
# Inspect the available column names.
sdf.columns

['rate_marriage', 'age', 'yrs_married', 'children', 'religious', 'affairs']

In [6]:
from mmlspark.featurize import CleanMissingData

# Columns to be imputed in place (input and output columns are the same)
# using the column median.
cols = ['rate_marriage', 'yrs_married', 'children', 'religious']

removeNansMedian = CleanMissingData()
removeNansMedian = removeNansMedian.setCleaningMode("Median")
removeNansMedian = removeNansMedian.setInputCols(cols)
removeNansMedian = removeNansMedian.setOutputCols(cols)

# The imputer is configured but not applied. Presumably it is left disabled
# because the summary table reports zero missing values (verify before enabling).
# sdf = removeNansMedian.fit(sdf).transform(sdf)

In [7]:
from mmlspark.featurize import CleanMissingData

# 'age' would be imputed in place with the column mean.
# NOTE: this rebinds `cols`, which the median imputer cell also uses.
cols = ['age']

removeNansMean = CleanMissingData()
removeNansMean = removeNansMean.setCleaningMode("Mean")
removeNansMean = removeNansMean.setInputCols(cols)
removeNansMean = removeNansMean.setOutputCols(cols)

# Configured but not applied; the transform call is intentionally disabled.
# sdf = removeNansMean.fit(sdf).transform(sdf)

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# Feature columns packed into a single vector column, and the target column.
inputCols = ['rate_marriage', 'age', 'yrs_married', 'children', 'religious']
labelCol = "affairs"
assembler = VectorAssembler(inputCols=inputCols, outputCol="features")

# `sdf` is rebound to the assembled frame, so re-running this cell used to
# fail because the "features" output column already existed. Dropping it
# first (a no-op on the first run) makes the cell safe to re-execute.
sdf = assembler.transform(sdf.drop("features"))

In [10]:
# Keep only the model inputs, then split 60/20/20 with a fixed seed.
# NOTE: later cells use `test` for model selection and `validation` for the
# final report.
split_weights = [0.6, 0.2, 0.2]
model_input = sdf.select(['features', labelCol])
train, test, validation = model_input.randomSplit(split_weights, seed=SEED)

In [11]:
# Row count of the training split.
train.count()

3889

In [12]:
# Class balance of the training split.
train_label_counts = train.groupBy(labelCol).count()
train_label_counts.show()

+-------+-----+
|affairs|count|
+-------+-----+
|      1| 1241|
|      0| 2648|
+-------+-----+



In [13]:
# Class balance of the test split.
test_label_counts = test.groupBy(labelCol).count()
test_label_counts.show()

+-------+-----+
|affairs|count|
+-------+-----+
|      1|  421|
|      0|  799|
+-------+-----+



# Modelling

## Logistic Regression

In [14]:
from mmlspark.train import TrainClassifier
from pyspark.ml.classification import LogisticRegression

model_name = "logreg.pkl"

# Save the *unfitted* TrainClassifier estimator, load it back, and only then
# fit it. Later cells rely on `model` being the fitted result.
logreg_estimator = TrainClassifier(model=LogisticRegression(), labelCol=labelCol, numFeatures=256)
logreg_estimator.write().overwrite().save(model_name)
restored_estimator = TrainClassifier.load(model_name)

model = restored_estimator.fit(train)

In [15]:
from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel

# Score the test split and compute the classification metrics from the
# scored rows.
prediction = model.transform(test)
stats_stage = ComputeModelStatistics()
metrics = stats_stage.transform(prediction)
df_metrics = metrics.toPandas()

df_metrics

Unnamed: 0,evaluation_type,confusion_matrix,accuracy,precision,recall,AUC
0,Classification,"DenseMatrix([[719., 80.],\n [262....",0.719672,0.665272,0.377672,0.726141


In [16]:
# Confusion matrix from the single metrics row (row label 0).
df_metrics.loc[0, 'confusion_matrix']

DenseMatrix(2, 2, [719.0, 262.0, 80.0, 159.0], False)

## Different Models

In [17]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from mmlspark.train import TrainClassifier
import itertools


def fit_candidates(estimators):
    """Wrap each estimator in a TrainClassifier and fit it on ``train``.

    Parameters
    ----------
    estimators : iterable of Spark ML classifiers (unfitted).

    Returns
    -------
    list of fitted models, in the same order as ``estimators``.
    """
    return [TrainClassifier(model=estimator, labelCol=labelCol).fit(train)
            for estimator in estimators]


# Logistic regression: vary the regularization strength.
lrHyperParams       = [0.05, 0.2]
logisticRegressions = [LogisticRegression(regParam=regParam)
                       for regParam in lrHyperParams]
lrmodels            = fit_candidates(logisticRegressions)

# Random forest: (numTrees, maxDepth) grid. The grid is materialized as a
# list so it can still be inspected after the comprehension has consumed it.
# seed=SEED makes the tree ensembles reproducible across kernel restarts.
rfHyperParams       = list(itertools.product([5, 10], [2, 3]))  # [(5,2),(5,3) etc]
randomForests       = [RandomForestClassifier(numTrees=numTrees, maxDepth=maxDepth, seed=SEED)
                       for numTrees, maxDepth in rfHyperParams]
rfmodels            = fit_candidates(randomForests)

# Gradient-boosted trees: (maxBins, maxDepth) grid, seeded like the forests.
gbtHyperParams      = list(itertools.product([8, 16], [2, 3]))
gbtclassifiers      = [GBTClassifier(maxBins=maxBins, maxDepth=maxDepth, seed=SEED)
                       for maxBins, maxDepth in gbtHyperParams]
gbtmodels           = fit_candidates(gbtclassifiers)

# Order matters only for reading the per-model metrics table: LR, RF, GBT.
trainedModels       = lrmodels + rfmodels + gbtmodels

In [18]:
from mmlspark.automl import FindBestModel

# Rank the candidate models by AUC on the `test` split and keep the winner.
model_selector = FindBestModel(evaluationMetric="AUC", models=trainedModels)
bestModel = model_selector.fit(test)

# Show, in order: the first five evaluation-result rows, the best model's
# metrics, and the metric for every candidate.
for report in (bestModel.getEvaluationResults().limit(5),
               bestModel.getBestModelMetrics(),
               bestModel.getAllModelMetrics()):
    display(report.toPandas())

Unnamed: 0,false_positive_rate,true_positive_rate
0,0.0,0.0
1,0.0,0.002375
2,0.0,0.004751
3,0.0,0.007126
4,0.0,0.009501


Unnamed: 0,evaluation_type,confusion_matrix,accuracy,precision,recall,AUC
0,Classification,"DenseMatrix([[739., 60.],\n [308....",0.698361,0.653179,0.268409,0.720258


Unnamed: 0,model_name,metric,parameters
0,TrainClassifier_b58c02ac354f,0.720258,"aggregationDepth: 2, elasticNetParam: 0.0, fam..."
1,TrainClassifier_1406bcdc2152,0.716458,"aggregationDepth: 2, elasticNetParam: 0.0, fam..."
2,TrainClassifier_3217762c976f,0.691793,"cacheNodeIds: false, checkpointInterval: 10, f..."
3,TrainClassifier_94107f5e0d2c,0.691953,"cacheNodeIds: false, checkpointInterval: 10, f..."
4,TrainClassifier_b6d4c3aa4f48,0.691057,"cacheNodeIds: false, checkpointInterval: 10, f..."
5,TrainClassifier_115646e3f6b4,0.692549,"cacheNodeIds: false, checkpointInterval: 10, f..."
6,TrainClassifier_695deb684592,0.625097,"cacheNodeIds: false, checkpointInterval: 10, f..."
7,TrainClassifier_b1a4b9159fea,0.633525,"cacheNodeIds: false, checkpointInterval: 10, f..."
8,TrainClassifier_c17bea260adb,0.625097,"cacheNodeIds: false, checkpointInterval: 10, f..."
9,TrainClassifier_c31eb9314ffa,0.633525,"cacheNodeIds: false, checkpointInterval: 10, f..."


In [19]:
from mmlspark.train import ComputeModelStatistics

# Score the held-out `validation` split with the selected model.
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)

# Fetch the metrics row once: every first() call is a separate Spark action,
# so calling it per metric repeated the work.
validation_row = metrics.first()
print(f"Best model's accuracy on validation set = {validation_row['accuracy'] * 100:.2f}%")
print(f"Best model's AUC on validation set = {validation_row['AUC'] * 100:.2f}%")

Best model's accuracy on validation set = 71.20%
Best model's AUC on validation set = 74.33%


In [20]:
# Convert the validation metrics to pandas for display.
# NOTE: this rebinds `df_metrics`, which earlier held the logistic-regression
# metrics computed on the test split.
df_metrics = metrics.toPandas()
df_metrics

Unnamed: 0,evaluation_type,confusion_matrix,accuracy,precision,recall,AUC
0,Classification,"DenseMatrix([[795., 71.],\n [291....",0.712013,0.584795,0.255754,0.743348


In [21]:
# Pull the confusion matrix out of the single metrics row, print it, and
# show its Python type.
cm = df_metrics.loc[0, 'confusion_matrix']
print(cm)
type(cm)

DenseMatrix([[795.,  71.],
             [291., 100.]])


pyspark.ml.linalg.DenseMatrix

In [22]:
# Public (non-underscore) attributes available on the matrix object.
[attr for attr in dir(cm) if not attr.startswith('_')]

['isTransposed', 'numCols', 'numRows', 'toArray', 'toSparse', 'values']

In [23]:
# Convert the matrix to an array and wrap it in a pandas frame for display.
confusion_counts = cm.toArray()
df_cm = pd.DataFrame(confusion_counts)
df_cm

Unnamed: 0,0,1
0,795.0,71.0
1,291.0,100.0
