## Import Packages

In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [12]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import col,count,when,isnan,isnull

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

## SparkSession Creation

In [3]:
spark = SparkSession.builder.appName('Classification with Spark').getOrCreate()

## Read Dataset

In [4]:
dataset = spark.read.csv('diabetes.csv',header=True)

In [5]:
dataset.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|  31|                   0.248| 26|      1|


In [6]:
dataset.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



## EDA
### Convert String to Float type

In [10]:
new_data = dataset.select(*(col(c).cast('float').alias(c) for c in dataset.columns))

In [11]:
new_data.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



### Null & NAN check

In [15]:
new_data.select([count(when(col(c).isNull(),c)).alias(c) for c in new_data.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



## Feature Engineering

In [16]:
cols = new_data.columns
cols.remove("Outcome")
assembler = VectorAssembler(inputCols = cols, outputCol='features')


In [17]:
data = assembler.transform(new_data)
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|[6.0,148.0,72.0,3...|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|[1.0,85.0,66.0,29...|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|[8.0,183.0,64.0,0...|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|[1.0,89.0,66.0,23...|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|[0.0,137.0,40.0,3...|
|        5.0|  116.0|   

In [19]:
data.select('features','Outcome').show(truncate=False)

+-----------------------------------------------------------------------+-------+
|features                                                               |Outcome|
+-----------------------------------------------------------------------+-------+
|[6.0,148.0,72.0,35.0,0.0,33.599998474121094,0.6269999742507935,50.0]   |1.0    |
|[1.0,85.0,66.0,29.0,0.0,26.600000381469727,0.35100001096725464,31.0]   |0.0    |
|[8.0,183.0,64.0,0.0,0.0,23.299999237060547,0.671999990940094,32.0]     |1.0    |
|[1.0,89.0,66.0,23.0,94.0,28.100000381469727,0.16699999570846558,21.0]  |0.0    |
|[0.0,137.0,40.0,35.0,168.0,43.099998474121094,2.2880001068115234,33.0] |1.0    |
|[5.0,116.0,74.0,0.0,0.0,25.600000381469727,0.20100000500679016,30.0]   |0.0    |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.24799999594688416,26.0]                |1.0    |
|[10.0,115.0,0.0,0.0,0.0,35.29999923706055,0.1340000033378601,29.0]     |0.0    |
|[2.0,197.0,70.0,45.0,543.0,30.5,0.15800000727176666,53.0]              |1.0    |
|[8.0,125.0,96.0

In [20]:
standardscaler = StandardScaler().setInputCol('features').setOutputCol('Scaled_features')
data = standardscaler.fit(data).transform(data)

In [22]:
data.select('features','Outcome','Scaled_features').show(truncate=False)

+-----------------------------------------------------------------------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                               |Outcome|Scaled_features                                                                                                                                          |
+-----------------------------------------------------------------------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,0.0,33.599998474121094,0.6269999742507935,50.0]   |1.0    |[1.7806383732194306,4.628960915766174,3.7198138711154307,2.1940523222807116,0.0,4.261709202425419,1.8923810993699686,4.251616970894646]                  |
|[1.0,85.0,66.0,29.0,0.0,26.600000381469727,0.3510000109

In [24]:
assembled_data = data.select("Scaled_features",'Outcome')
assembled_data.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|Scaled_features                                                                                                                                          |Outcome|
+---------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|[1.7806383732194306,4.628960915766174,3.7198138711154307,2.1940523222807116,0.0,4.261709202425419,1.8923810993699686,4.251616970894646]                  |1.0    |
|[0.29677306220323846,2.658524850271114,3.4098293818558116,1.8179290670325896,0.0,3.373853320188119,1.0593713140527197,2.6360025219546803]                |0.0    |
|[2.3741844976259077,5.723647618818986,3.306501218769272,0.0,0.0,2.955292430788826,2.028197980632078,2.721034861372573]                                   |1.0    |
|[0.296773062203

### Train Test Split

In [25]:
train, test =assembled_data.randomSplit([0.7,0.3])

In [26]:
train.show()

+--------------------+-------+
|     Scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.8...|    0.0|
|(8,[0,1,6,7],[2.9...|    1.0|
|(8,[1,5,6,7],[2.2...|    0.0|
|(8,[1,5,6,7],[3.6...|    0.0|
|(8,[1,5,6,7],[3.7...|    1.0|
|(8,[1,5,6,7],[4.3...|    1.0|
|(8,[1,5,6,7],[4.4...|    1.0|
|(8,[1,6,7],[2.940...|    0.0|
|[0.0,1.7827754878...|    0.0|
|[0.0,2.0955431172...|    0.0|
|[0.0,2.3144804578...|    0.0|
|[0.0,2.4395875096...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.6898016132...|    0.0|
|[0.0,2.8461854279...|    0.0|
|[0.0,2.8461854279...|    0.0|
|[0.0,2.9087389538...|    0.0|
|[0.0,2.9400157167...|    0.0|
+--------------------+-------+
only showing top 20 rows



In [27]:
test.show()

+--------------------+-------+
|     Scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[1.7...|    0.0|
|(8,[0,1,6,7],[2.0...|    0.0|
|(8,[1,5,6,7],[3.0...|    0.0|
|(8,[1,5,6,7],[4.0...|    1.0|
|(8,[1,5,6,7],[4.5...|    1.0|
|(8,[1,5,6,7],[5.2...|    1.0|
|[0.0,2.9087389538...|    0.0|
|[0.0,2.9087389538...|    0.0|
|[0.0,2.9712924797...|    1.0|
|[0.0,3.1276762944...|    0.0|
|[0.0,3.1589530573...|    0.0|
|[0.0,3.1902298203...|    0.0|
|[0.0,3.1902298203...|    0.0|
|[0.0,3.6593812644...|    0.0|
|[0.0,3.6906580274...|    0.0|
|[0.0,3.7532115533...|    0.0|
|[0.0,3.7844883162...|    1.0|
|[0.0,3.8470418421...|    1.0|
|[0.0,4.1910862345...|    0.0|
+--------------------+-------+
only showing top 20 rows



## Modeling
### Logistic Regression

In [30]:
log_reg = LogisticRegression(labelCol="Outcome",featuresCol="Scaled_features",maxIter=40)
lor_model = log_reg.fit(train)

In [31]:
lor_pred = lor_model.transform(test)

In [32]:
lor_pred.show()

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|    0.0|[4.91353615128169...|[0.99270711279712...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[2.78696618194638...|[0.94196742387747...|       0.0|
|(8,[0,1,6,7],[2.0...|    0.0|[2.86484272196646...|[0.94608087009026...|       0.0|
|(8,[1,5,6,7],[3.0...|    0.0|[2.13970458069086...|[0.89470278235402...|       0.0|
|(8,[1,5,6,7],[4.0...|    1.0|[-0.5552463920694...|[0.36464806469510...|       1.0|
|(8,[1,5,6,7],[4.5...|    1.0|[-1.5413513256855...|[0.17633891765067...|       1.0|
|(8,[1,5,6,7],[5.2...|    1.0|[-1.4733542383570...|[0.18643332153367...|       1.0|
|[0.0,2.9087389538...|    0.0|[2.29664159194255...|[0.90859851488323...|       0.0|
|[0.0,2.9087389538...|    0.0|[2.64799194200083...|[0.93388711702161...|    

In [33]:
lor_pred.select("Outcome","prediction").show(10)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       1.0|
|    1.0|       1.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
+-------+----------+
only showing top 10 rows



In [34]:
# Compute row score on the test data
predAndLabels = lor_pred.select("Outcome","prediction").rdd

In [35]:
predAndLabels.collect()

[Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, pr

In [36]:
metrcis = BinaryClassificationMetrics(predAndLabels)

print('Area Under ROC = %s' % metrcis.areaUnderROC)



Area Under ROC = 0.7730644968290056


#### Logistic Regression Model Evaluation

In [38]:
evalutor = MulticlassClassificationEvaluator(labelCol="Outcome",predictionCol='prediction',metricName ='accuracy')
accuracy_LR = evalutor.evaluate(lor_pred)

In [49]:
print("Accuracy LR:-",accuracy_LR)

Accuracy LR:- 0.7929292929292929


### Naive Bayes

In [40]:
naive_bayes = NaiveBayes(featuresCol = "Scaled_features",labelCol = "Outcome", smoothing = 1.0)
## smoothing is used for smooth the categorical variable  to slove the zero probabilities in Naive Bayes.

In [41]:
nb_model = naive_bayes.fit(train)

In [42]:
nb_pred_test = model.transform(test)

In [43]:
nb_pred_test.show()

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|    0.0|[4.91353615128169...|[0.99270711279712...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[2.78696618194638...|[0.94196742387747...|       0.0|
|(8,[0,1,6,7],[2.0...|    0.0|[2.86484272196646...|[0.94608087009026...|       0.0|
|(8,[1,5,6,7],[3.0...|    0.0|[2.13970458069086...|[0.89470278235402...|       0.0|
|(8,[1,5,6,7],[4.0...|    1.0|[-0.5552463920694...|[0.36464806469510...|       1.0|
|(8,[1,5,6,7],[4.5...|    1.0|[-1.5413513256855...|[0.17633891765067...|       1.0|
|(8,[1,5,6,7],[5.2...|    1.0|[-1.4733542383570...|[0.18643332153367...|       1.0|
|[0.0,2.9087389538...|    0.0|[2.29664159194255...|[0.90859851488323...|       0.0|
|[0.0,2.9087389538...|    0.0|[2.64799194200083...|[0.93388711702161...|    

In [44]:
nb_predAndLabel = nb_pred_test.select("Outcome","prediction").rdd

In [47]:
nb_predAndLabel.collect()

[Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, pr

In [57]:
metrcis_NB = BinaryClassificationMetrics(nb_predAndLabel)

print('Naive Bayes Area Under ROC = %s' % metrcis_NB.areaUnderROC)

Naive Bayes Area Under ROC = 0.7730644968290056


#### Naive Bayes Model Evaluation

In [45]:
nb_evalutor = MulticlassClassificationEvaluator(labelCol="Outcome",predictionCol='prediction',metricName ='accuracy')
accuracy_NB = nb_evalutor.evaluate(nb_pred_test)

In [46]:
print("Accuracy of Naive Bayes model:-",accuracy_NB)

Accuracy of Naive Bayes model:- 0.7929292929292929


### GBTClassifier

In [50]:
gradient_boost_class = GBTClassifier(labelCol = "Outcome",featuresCol = "Scaled_features")

In [51]:
gbt_model = gradient_boost_class.fit(train)

In [52]:
gbt_pred_test = gbt_model.transform(test)

In [53]:
gbt_pred_test.show()

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|    0.0|[1.52390334663133...|[0.95468774069736...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[1.10905437163393...|[0.90186393648473...|       0.0|
|(8,[0,1,6,7],[2.0...|    0.0|[-1.5369785944876...|[0.04419436924387...|       1.0|
|(8,[1,5,6,7],[3.0...|    0.0|[1.51347234675561...|[0.95377666104972...|       0.0|
|(8,[1,5,6,7],[4.0...|    1.0|[-1.3787947731125...|[0.05965944946850...|       1.0|
|(8,[1,5,6,7],[4.5...|    1.0|[-1.5745476169039...|[0.04112694331416...|       1.0|
|(8,[1,5,6,7],[5.2...|    1.0|[-1.2533569407447...|[0.07538885022068...|       1.0|
|[0.0,2.9087389538...|    0.0|[1.32056586903105...|[0.93346229166846...|       0.0|
|[0.0,2.9087389538...|    0.0|[1.49953021413660...|[0.95253166205624...|    

In [54]:
gbt_predAndLabel = gbt_pred_test.select('Outcome','prediction').rdd

In [55]:
gbt_predAndLabel.collect()

[Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, pr

In [56]:
metrcis_GBT = BinaryClassificationMetrics(gbt_predAndLabel)

print('GBT Area Under ROC = %s' % metrcis_GBT.areaUnderROC)



GBT Area Under ROC = 0.6752262443438914


#### GBT model evaluation

In [58]:
gbt_evalutor = MulticlassClassificationEvaluator(labelCol="Outcome",predictionCol='prediction',metricName ='accuracy')
accuracy_GBT = gbt_evalutor.evaluate(gbt_pred_test)

In [59]:
print("Accuracy of GBT model:-",accuracy_GBT)

Accuracy of GBT model:- 0.7070707070707071


### RandomForestClassifier

In [60]:
random_forest_classifier = RandomForestClassifier(labelCol = "Outcome",featuresCol = "Scaled_features",numTrees=40)

In [61]:
rf_model = random_forest_classifier.fit(train)

In [63]:
rf_pred_test = rf_model.transform(test)

In [64]:
rf_pred_test.show()

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|    0.0|[36.0775637485086...|[0.90193909371271...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[34.5360570934643...|[0.86340142733660...|       0.0|
|(8,[0,1,6,7],[2.0...|    0.0|[26.3845253246621...|[0.65961313311655...|       0.0|
|(8,[1,5,6,7],[3.0...|    0.0|[37.5358970818419...|[0.93839742704604...|       0.0|
|(8,[1,5,6,7],[4.0...|    1.0|[20.1173400276634...|[0.50293350069158...|       0.0|
|(8,[1,5,6,7],[4.5...|    1.0|[13.6604454045065...|[0.34151113511266...|       1.0|
|(8,[1,5,6,7],[5.2...|    1.0|[9.11596369694374...|[0.22789909242359...|       1.0|
|[0.0,2.9087389538...|    0.0|[34.6582034232591...|[0.86645508558147...|       0.0|
|[0.0,2.9087389538...|    0.0|[37.0799473431565...|[0.92699868357891...|    

In [68]:
rf_predAndLabel = rf_pred_test.select('Outcome','prediction').rdd

In [69]:
rf_predAndLabel.collect()

[Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, pr

In [70]:
metrcis_RF = BinaryClassificationMetrics(rf_predAndLabel)

print('RF Area Under ROC = %s' % metrcis_RF.areaUnderROC)



RF Area Under ROC = 0.765741604477612


#### RandomForest Classifier model evaluation

In [71]:
rf_evalutor = MulticlassClassificationEvaluator(labelCol="Outcome",predictionCol='prediction',metricName ='accuracy')
accuracy_RF = rf_evalutor.evaluate(rf_pred_test)

In [74]:
print("Accuracy of RF model:-",accuracy_RF)

Accuracy of RF model:- 0.7878787878787878


In [73]:
#ROC :- Calculate best value of of threshold by which probability tell H0 or H1.
#AUC :- It captures the area under the curve and compares the relation with the True Positive Rate and False Positive Rate.

In [75]:
print("Accuracy LR:-",accuracy_LR)
print("Accuracy of Naive Bayes model:-",accuracy_NB)
print("Accuracy of GBT model:-",accuracy_GBT)
print("Accuracy of RF model:-",accuracy_RF)

Accuracy LR:- 0.7929292929292929
Accuracy of Naive Bayes model:- 0.7929292929292929
Accuracy of GBT model:- 0.7070707070707071
Accuracy of RF model:- 0.7878787878787878
