<a href="https://colab.research.google.com/github/muhammetsnts/SPARK/blob/main/2.ML_with_PySpark_MLlib/Tree_Methods/1.TreeMethods_Documentation_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Environment

In [1]:
# install Java8
!apt-get -q install openjdk-8-jdk-headless -qq > /dev/null

# download spark3.1.1
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

# unzip it
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

# install findspark 
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
#spark = SparkSession.builder.appName('ops').getOrCreate()

# Download and Read the Data

In [2]:
!wget -q https://raw.githubusercontent.com/muhammetsnts/SPARK/main/data/sample_libsvm_data.txt

In [3]:
# Read the downloaded file with Spark's libsvm data source into a DataFrame.
data = (
    spark.read
    .format('libsvm')
    .load('sample_libsvm_data.txt')
)

In [4]:
# Preview the loaded DataFrame (label and features columns; top rows only).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



# Train-Test Split

In [5]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore the
# predictions and accuracy figures computed from it, are repeatable across runs.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Modelling

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

# For regression, we can use this;
# from pyspark.ml.regression import ....

In [9]:
# Instantiate the three tree-based classifiers that are compared below.
dtc = DecisionTreeClassifier() # will use default parameters
# The ensemble learners are given an explicit seed so that their fitted trees
# (and the reported accuracies / feature importances) are reproducible.
rfc = RandomForestClassifier(numTrees=100, seed=42)
gbt = GBTClassifier(seed=42)

In [10]:
# Fit each estimator on the training split, in the order DTC -> RFC -> GBT.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [11]:
# Score the test split with each fitted model, in the order DTC -> RFC -> GBT.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

Let's look at the predictions of each model.

In [12]:
# Decision tree predictions on the test split: the input columns plus
# rawPrediction, probability and prediction.
dtc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[122,123,148...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[127,128,129...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[152,153,154...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[153,154,155...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[99,100,101,...|   [34.0,0.0]|  [1.0,0.0]|       0.0|
|  1.0|(692,[100,101,102...|   [0.0,46.0]|  [0.0,1.0]|       1.0|
|  1.0|(692,[123,124,125...|   [0.0,46.0]|  [0.0,1.0]|       1.0|
|  1.0|(69

In [13]:
# Random forest predictions on the test split: the input columns plus
# rawPrediction, probability and prediction.
rfc_preds.show()

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[121,122,123...|   [96.0,4.0]|[0.96,0.04]|       0.0|
|  0.0|(692,[122,123,148...|  [83.0,17.0]|[0.83,0.17]|       0.0|
|  0.0|(692,[123,124,125...|   [94.0,6.0]|[0.94,0.06]|       0.0|
|  0.0|(692,[124,125,126...|   [91.0,9.0]|[0.91,0.09]|       0.0|
|  0.0|(692,[124,125,126...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[125,126,127...|  [89.0,11.0]|[0.89,0.11]|       0.0|
|  0.0|(692,[127,128,129...|   [96.0,4.0]|[0.96,0.04]|       0.0|
|  0.0|(692,[152,153,154...|   [93.0,7.0]|[0.93,0.07]|       0.0|
|  0.0|(692,[153,154,155...|  [74.0,26.0]|[0.74,0.26]|       0.0|
|  1.0|(692,[99,100,101,...|  [45.0,55.0]|[0.45,0.55]|       1.0|
|  1.0|(692,[100,101,102...|   [2.0,98.0]|[0.02,0.98]|       1.0|
|  1.0|(692,[123,124,125...|   [1.0,99.0]|[0.01,0.99]|       1.0|
|  1.0|(69

In [14]:
# Gradient-boosted trees predictions on the test split: the input columns plus
# rawPrediction, probability and prediction.
gbt_preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[122,123,148...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[125,126,127...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[127,128,129...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[152,153,154...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[153,154,155...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  1.0|(692,[99,

# Evaluation

This is a binary classification task, so we could evaluate with ROC AUC, precision, recall, etc. However, `MulticlassClassificationEvaluator` also works for binary classification, so we use it here to compute accuracy.

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
# Evaluator set to report accuracy; reused for all three models below.
# No column names are passed, so the evaluator's default columns apply.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [20]:
# Accuracy of the decision tree on its test-split predictions.
dtc_accuracy = acc_eval.evaluate(dtc_preds)
print('DTC ACCURACY: ', dtc_accuracy)

DTC ACCURACY:  0.95


In [21]:
# Accuracy of the random forest on its test-split predictions.
rfc_accuracy = acc_eval.evaluate(rfc_preds)
print('RFC ACCURACY: ', rfc_accuracy)

RFC ACCURACY:  1.0


In [22]:
# Accuracy of the gradient-boosted trees on their test-split predictions.
gbt_accuracy = acc_eval.evaluate(gbt_preds)
print('GBT ACCURACY: ', gbt_accuracy)

GBT ACCURACY:  0.95


# Feature Importance
The higher the number, the more important the feature.

In [23]:
# Feature importances of the fitted random forest, shown as the cell output
# (a SparseVector over the 692 features).
rfc_model.featureImportances

SparseVector(692, {102: 0.0006, 131: 0.0006, 155: 0.0004, 156: 0.0005, 176: 0.0005, 202: 0.0005, 204: 0.0005, 205: 0.0008, 213: 0.0005, 214: 0.0032, 231: 0.0011, 234: 0.0138, 236: 0.0009, 241: 0.0006, 242: 0.0004, 243: 0.0079, 260: 0.0005, 263: 0.0156, 264: 0.0041, 270: 0.0011, 271: 0.0086, 272: 0.0088, 273: 0.0174, 274: 0.0004, 290: 0.0172, 291: 0.0034, 299: 0.0012, 300: 0.0116, 301: 0.0239, 303: 0.0013, 317: 0.0345, 318: 0.0003, 322: 0.0009, 323: 0.0092, 327: 0.0006, 328: 0.0167, 331: 0.0006, 342: 0.0004, 346: 0.0005, 347: 0.0006, 350: 0.0083, 351: 0.0186, 353: 0.0004, 354: 0.0023, 355: 0.0048, 356: 0.0168, 357: 0.0093, 358: 0.0072, 369: 0.0015, 373: 0.0089, 374: 0.0075, 375: 0.0012, 377: 0.0051, 378: 0.0201, 379: 0.0122, 381: 0.0022, 385: 0.0071, 387: 0.0005, 397: 0.0014, 398: 0.0063, 400: 0.0149, 401: 0.0252, 402: 0.0009, 405: 0.0169, 406: 0.0812, 407: 0.0005, 409: 0.0012, 410: 0.0005, 412: 0.016, 426: 0.0004, 428: 0.0068, 429: 0.0081, 431: 0.0011, 433: 0.0329, 434: 0.0445, 435: 0.

In [24]:
# Feature importances of the fitted decision tree, shown as the cell output
# (a SparseVector over the 692 features).
dtc_model.featureImportances

SparseVector(692, {434: 1.0})

In [25]:
# Feature importances of the fitted gradient-boosted trees model, shown as the
# cell output (a SparseVector over the 692 features).
gbt_model.featureImportances

SparseVector(692, {434: 0.5926, 462: 0.2651, 490: 0.1423})