In [3]:
import spark
from pyspark.sql import SparkSession

In [4]:
# evaluate model to get model performance
from pyspark.ml.evaluation import RegressionEvaluator

In [7]:
spark = SparkSession.builder.appName('pyspark regression algos').getOrCreate()
spark

In [8]:
df = spark.read.csv('diabetes.csv', header = True)
df.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [9]:
# its a sql df
type(df)

pyspark.sql.dataframe.DataFrame

In [10]:
# to get info on the df
df.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [11]:
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|summary|       Pregnancies|          Glucose|     BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|  count|               768|              768|               768|               768|               768|               768|                     768|               768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.992578124999977|      0.4718763020833327|33.240885416666664|0.3489583333333333|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.95

In [12]:
df.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [13]:
# use to apply sql queries
from pyspark.sql.functions import col

In [14]:
for c in df.columns:
    print(c)

Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
Outcome


In [15]:
# use to apply functions to column
for c in df.columns:
    print(col(c))

Column<'Pregnancies'>
Column<'Glucose'>
Column<'BloodPressure'>
Column<'SkinThickness'>
Column<'Insulin'>
Column<'BMI'>
Column<'DiabetesPedigreeFunction'>
Column<'Age'>
Column<'Outcome'>


In [16]:
# sql query
df.select(*(col(c) for c in df.columns)).show(5)


+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [17]:
df.select(*(col(c).cast('float') for c in df.columns)).show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
only showing top 5 rows



In [18]:
df.select(*(col(c).cast('float').alias(c) for c in df.columns)).show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
only showing top 5 rows



In [19]:
formated_df = df.select(*(col(c).cast('float') for c in df.columns))
formated_df.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+
only showing top 5 rows



In [20]:
formated_df.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [21]:
formated_df.describe().show(5)

+-------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------------+------------------+------------------+
|summary|       Pregnancies|          Glucose|     BloodPressure|     SkinThickness|           Insulin|              BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------------+------------------+------------------+
|  count|               768|              768|               768|               768|               768|              768|                     768|               768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.99257813890775|      0.4718763029280429|33.240885416666664|0.3489583333333333|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.9522175

## Missing Values

In [22]:
from pyspark.sql.functions import count, isnan, when

In [23]:
formated_df.select([count(when(isnan(c), c)).alias(c) for c in formated_df.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [24]:
formated_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in formated_df.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



## Assembler

In [25]:
from pyspark.ml.feature import VectorAssembler

In [26]:
features = formated_df.drop('Outcome')

In [27]:
features.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+----+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+
only showing top 5 rows



In [28]:
features.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [29]:
# combines given list of columns & create a single vector col
assembler = VectorAssembler(inputCols= features.columns, outputCol = 'features')

In [30]:
output = assembler.transform(formated_df)
output.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|[6.0,148.0,72.0,3...|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|[1.0,85.0,66.0,29...|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|[8.0,183.0,64.0,0...|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|[1.0,89.0,66.0,23...|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|[0.0,137.0,40.0,3...|
+-----------+-------+---

In [31]:
output.select('features', 'Outcome').show(5)

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|    1.0|
|[1.0,85.0,66.0,29...|    0.0|
|[8.0,183.0,64.0,0...|    1.0|
|[1.0,89.0,66.0,23...|    0.0|
|[0.0,137.0,40.0,3...|    1.0|
+--------------------+-------+
only showing top 5 rows



In [32]:
output.select('features','Outcome').show(5, truncate= False)

+----------------------------------------------------------------------+-------+
|features                                                              |Outcome|
+----------------------------------------------------------------------+-------+
|[6.0,148.0,72.0,35.0,0.0,33.599998474121094,0.6269999742507935,50.0]  |1.0    |
|[1.0,85.0,66.0,29.0,0.0,26.600000381469727,0.35100001096725464,31.0]  |0.0    |
|[8.0,183.0,64.0,0.0,0.0,23.299999237060547,0.671999990940094,32.0]    |1.0    |
|[1.0,89.0,66.0,23.0,94.0,28.100000381469727,0.16699999570846558,21.0] |0.0    |
|[0.0,137.0,40.0,35.0,168.0,43.099998474121094,2.2880001068115234,33.0]|1.0    |
+----------------------------------------------------------------------+-------+
only showing top 5 rows



In [35]:
assembled_df = output.select('features', 'Outcome')
assembled_df.show(5)

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|    1.0|
|[1.0,85.0,66.0,29...|    0.0|
|[8.0,183.0,64.0,0...|    1.0|
|[1.0,89.0,66.0,23...|    0.0|
|[0.0,137.0,40.0,3...|    1.0|
+--------------------+-------+
only showing top 5 rows



## Scaling

In [36]:
from pyspark.ml.feature import StandardScaler

In [37]:
std_scalar = StandardScaler().setInputCol('features').setOutputCol('scaled_features')
std_scaled_df = std_scalar.fit(assembled_df).transform(assembled_df)

In [38]:
std_scaled_df.show(5)

+--------------------+-------+--------------------+
|            features|Outcome|     scaled_features|
+--------------------+-------+--------------------+
|[6.0,148.0,72.0,3...|    1.0|[1.78063837321943...|
|[1.0,85.0,66.0,29...|    0.0|[0.29677306220323...|
|[8.0,183.0,64.0,0...|    1.0|[2.37418449762590...|
|[1.0,89.0,66.0,23...|    0.0|[0.29677306220323...|
|[0.0,137.0,40.0,3...|    1.0|[0.0,4.2849165233...|
+--------------------+-------+--------------------+
only showing top 5 rows



In [39]:
scaled_df = std_scaled_df.select('scaled_features', 'Outcome')
scaled_df.show(5, truncate= False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|scaled_features                                                                                                                                        |Outcome|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|[1.7806383732194306,4.628960915766174,3.7198138711154307,2.1940523222807116,0.0,4.261709202425419,1.8923810993699686,4.251616970894646]                |1.0    |
|[0.29677306220323846,2.658524850271114,3.4098293818558116,1.8179290670325896,0.0,3.373853320188119,1.0593713140527197,2.6360025219546803]              |0.0    |
|[2.3741844976259077,5.723647618818986,3.306501218769272,0.0,0.0,2.955292430788826,2.028197980632078,2.721034861372573]                                 |1.0    |
|[0.29677306220323846,2.7836

In [40]:
scaled_df.show(5)

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|[1.78063837321943...|    1.0|
|[0.29677306220323...|    0.0|
|[2.37418449762590...|    1.0|
|[0.29677306220323...|    0.0|
|[0.0,4.2849165233...|    1.0|
+--------------------+-------+
only showing top 5 rows



## Train test Split

In [41]:
train_df, test_df = scaled_df.randomSplit([0.7,0.3])

In [42]:
train_df.show(5)

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.8...|    0.0|
|(8,[0,1,6,7],[1.7...|    0.0|
|(8,[0,1,6,7],[2.0...|    0.0|
+--------------------+-------+
only showing top 5 rows



In [43]:
test_df.show(5)

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|(8,[1,5,6,7],[2.2...|    0.0|
|(8,[1,5,6,7],[3.6...|    0.0|
|(8,[1,5,6,7],[4.4...|    1.0|
|[0.0,1.7827754878...|    0.0|
|[0.0,2.4395875096...|    0.0|
+--------------------+-------+
only showing top 5 rows



## Logistic regression

In [44]:
from pyspark.ml.classification import LogisticRegression


#### fit model

In [45]:
log_reg = LogisticRegression(featuresCol = 'scaled_features', labelCol = 'Outcome')
log_reg_model = log_reg.fit(train_df)


#### Predict

In [46]:
log_reg_predictions = log_reg_model.transform(test_df)
log_reg_predictions.show(5)

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[2.2...|    0.0|[2.97837073156098...|[0.95158736795904...|       0.0|
|(8,[1,5,6,7],[3.6...|    0.0|[-1.1548579069412...|[0.23960288546272...|       1.0|
|(8,[1,5,6,7],[4.4...|    1.0|[-1.6333032972623...|[0.16337834497809...|       1.0|
|[0.0,1.7827754878...|    0.0|[2.70422281555887...|[0.93727536325253...|       0.0|
|[0.0,2.4395875096...|    0.0|[2.59081446275094...|[0.93026806934250...|       0.0|
+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [47]:
log_reg_predictions.select('Outcome','prediction').show()

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       1.0|
|    1.0|       1.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
+-------+----------+
only showing top 20 rows



In [48]:
prediction_and_Labels = log_reg_predictions.select("Outcome","prediction").rdd

In [49]:
prediction_and_Labels.collect()

[Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, pr

#### Model Evaluation

In [54]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [56]:
evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
accuracy_LR = evaluator.evaluate(log_reg_predictions)
print ("Accuracy = " ,accuracy_LR)

Accuracy =  0.7743362831858407


## Naive Bayes

In [57]:
from pyspark.ml.classification import NaiveBayes

In [58]:
naive_bayes = NaiveBayes(featuresCol='scaled_features', labelCol='Outcome', smoothing=1.0)

In [59]:
train_df.show(5)

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.8...|    0.0|
|(8,[0,1,6,7],[1.7...|    0.0|
|(8,[0,1,6,7],[2.0...|    0.0|
+--------------------+-------+
only showing top 5 rows



In [60]:
naive_bay_model = naive_bayes.fit(train_df)

In [61]:
naive_prediction = naive_bay_model.transform(test_df)
naive_prediction.show(5)

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[2.2...|    0.0|[-14.971394413191...|[0.58828421632891...|       0.0|
|(8,[1,5,6,7],[3.6...|    0.0|[-27.396299066735...|[0.52876773863154...|       0.0|
|(8,[1,5,6,7],[4.4...|    1.0|[-22.122620098002...|[0.57101956248266...|       0.0|
|[0.0,1.7827754878...|    0.0|[-29.085690086716...|[0.68655152631738...|       0.0|
|[0.0,2.4395875096...|    0.0|[-31.747249850470...|[0.74141771024924...|       0.0|
+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [62]:
predictionAndLabels = naive_prediction.select("Outcome","prediction").rdd

In [63]:
evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
accuracy_NB = evaluator.evaluate(naive_prediction)

In [64]:
print ("Accuracy",accuracy_NB)

Accuracy 0.6769911504424779


## GBTClassifier

In [65]:
from pyspark.ml.classification import GBTClassifier

In [66]:
gradient_boost_class = GBTClassifier(labelCol="Outcome", featuresCol="scaled_features")
gradient_boost_model = gradient_boost_class.fit(train_df)


In [67]:
gbt_prediction = gradient_boost_model.transform(test_df)

In [68]:
gbt_prediction.show(5)

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[2.2...|    0.0|[1.52974106441609...|[0.95519013628407...|       0.0|
|(8,[1,5,6,7],[3.6...|    0.0|[-0.5882999410465...|[0.23566409712830...|       1.0|
|(8,[1,5,6,7],[4.4...|    1.0|[-1.7976180774408...|[0.02672060613007...|       1.0|
|[0.0,1.7827754878...|    0.0|[0.81650635454177...|[0.83658193184099...|       0.0|
|[0.0,2.4395875096...|    0.0|[1.13943442722664...|[0.90711178044169...|       0.0|
+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [69]:
evaluator = MulticlassClassificationEvaluator( labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
accuracy_GBT = evaluator.evaluate(gbt_prediction)

In [70]:
print ("Accuracy",accuracy_GBT)

Accuracy 0.6858407079646017


## RandomForestClassifier

In [71]:
from pyspark.ml.classification import RandomForestClassifier

In [72]:
random_forest_classifier = RandomForestClassifier(labelCol="Outcome", featuresCol="scaled_features", numTrees=40)
rf_model = random_forest_classifier.fit(train_df)
rf_prediction = rf_model.transform(test_df)
rf_prediction.show(5)

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[2.2...|    0.0|[35.9580531440866...|[0.89895132860216...|       0.0|
|(8,[1,5,6,7],[3.6...|    0.0|[15.1580659672269...|[0.37895164918067...|       1.0|
|(8,[1,5,6,7],[4.4...|    1.0|[13.4013064307224...|[0.33503266076806...|       1.0|
|[0.0,1.7827754878...|    0.0|[33.0300483162224...|[0.82575120790556...|       0.0|
|[0.0,2.4395875096...|    0.0|[34.2878340303959...|[0.85719585075989...|       0.0|
+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [73]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator( labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
accuracy_RF= evaluator.evaluate(rf_prediction)

In [74]:
print ("Accuracy",accuracy_RF)

Accuracy 0.7522123893805309
