In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 44.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=79c51937c190ac37e061c823a965cf495e5457a13ea75e720da7a6f70ce450b1
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
from pyspark.sql import SparkSession

In [3]:
session = SparkSession.builder.appName('multiclass').master('local').getOrCreate()


In [22]:
data = session.read.csv("/content/letter_recognition.csv", header = True, inferSchema = True)

In [23]:
data.show()

+------+----+----+-----+----+-----+----+----+-----+-----+-----+-----+-----+----+-----+----+-----+
|letter|xbox|ybox|width|high|onpix|xbar|ybar|x2bar|y2bar|xybar|x2ybr|xy2br|xege|xegvy|yege|yegvx|
+------+----+----+-----+----+-----+----+----+-----+-----+-----+-----+-----+----+-----+----+-----+
|     T|   2|   8|    3|   5|    1|   8|  13|    0|    6|    6|   10|    8|   0|    8|   0|    8|
|     I|   5|  12|    3|   7|    2|  10|   5|    5|    4|   13|    3|    9|   2|    8|   4|   10|
|     D|   4|  11|    6|   8|    6|  10|   6|    2|    6|   10|    3|    7|   3|    7|   3|    9|
|     N|   7|  11|    6|   6|    3|   5|   9|    4|    6|    4|    4|   10|   6|   10|   2|    8|
|     G|   2|   1|    3|   1|    1|   8|   6|    6|    6|    6|    5|    9|   1|    7|   5|   10|
|     S|   4|  11|    5|   8|    3|   8|   8|    6|    9|    5|    6|    6|   0|    8|   9|    7|
|     B|   4|   2|    5|   4|    4|   8|   7|    6|    6|    7|    6|    6|   2|    8|   7|   10|
|     A|   1|   1|  

In [5]:
from pyspark.ml.feature import StringIndexer

In [25]:
str_obj = StringIndexer(inputCol = 'letter', outputCol = 'newletter')

In [24]:
data.columns

['letter',
 'xbox',
 'ybox',
 'width',
 'high',
 'onpix',
 'xbar',
 'ybar',
 'x2bar',
 'y2bar',
 'xybar',
 'x2ybr',
 'xy2br',
 'xege',
 'xegvy',
 'yege',
 'yegvx']

In [26]:
from pyspark.ml.feature import VectorAssembler
vec = VectorAssembler(inputCols =['xbox','ybox','width','high','onpix','xbar','ybar','x2bar','y2bar','xybar','x2ybr','xy2br','xege','xegvy','yege','yegvx'], outputCol = 'allfeatures' )

In [9]:
from pyspark.ml.classification import DecisionTreeClassifier

In [27]:
tree = DecisionTreeClassifier(featuresCol = 'allfeatures', labelCol = "newletter")

In [12]:
from pyspark.ml import Pipeline

In [29]:
my_pipeline = Pipeline(stages = [str_obj,vec,tree])

In [30]:
training, test = data.randomSplit([0.75,0.25])

In [31]:
treemodel = my_pipeline.fit(training)

In [35]:
results = treemodel.transform(test)

In [37]:
results.show(5)

+------+----+----+-----+----+-----+----+----+-----+-----+-----+-----+-----+----+-----+----+-----+---------+--------------------+--------------------+--------------------+----------+
|letter|xbox|ybox|width|high|onpix|xbar|ybar|x2bar|y2bar|xybar|x2ybr|xy2br|xege|xegvy|yege|yegvx|newletter|         allfeatures|       rawPrediction|         probability|prediction|
+------+----+----+-----+----+-----+----+----+-----+-----+-----+-----+-----+----+-----+----+-----+---------+--------------------+--------------------+--------------------+----------+
|     A|   1|   0|    2|   0|    0|   7|   3|    2|    0|    7|    2|    8|   2|    6|   1|    8|      8.0|[1.0,0.0,2.0,0.0,...|[0.0,0.0,0.0,1.0,...|[0.0,0.0,0.0,0.00...|       8.0|
|     A|   1|   0|    2|   0|    0|   7|   4|    2|    0|    7|    2|    8|   1|    7|   1|    8|      8.0|[1.0,0.0,2.0,0.0,...|[0.0,0.0,0.0,1.0,...|[0.0,0.0,0.0,0.00...|       8.0|
|     A|   1|   0|    2|   0|    0|   8|   3|    2|    0|    7|    2|    8|   2|    6|   1

In [38]:
results.show(5, truncate = False)

+------+----+----+-----+----+-----+----+----+-----+-----+-----+-----+-----+----+-----+----+-----+---------+-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|letter|xbox|ybox|width|high|onpix|xbar|ybar|x2bar|y2bar|xybar|x2ybr|xy2br|xege|xegvy|yege|yegvx|newletter|allfeatures                                                      |rawPrediction                                                                                              |probability                                                                                                                                                 |prediction|
+------+----+----+-----+----+-----+----+----+-----+-----+-----+-----+-----+----+-----+----+-----+---

In [41]:
#Here you can not use binaryclassification evaluator; it is used when there are only two variable
# So multiclassclassifiactionevaluator will be used because we have 26 variables
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [42]:
?MulticlassClassificationEvaluator() #for help

In [45]:
evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'newletter')
evaluator.evaluate(results)

0.33291904559175134

In [46]:
# introducing different metric names
evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'newletter',metricName = 'accuracy')
evaluator.evaluate(results)

0.3739612188365651

In [48]:
evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'newletter', metricName = 'weightedRecall')
evaluator.evaluate(results)

0.37396121883656513

In [None]:
# metric names that can be used for checking precision
# evaluation (f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| logLoss|hammingLoss)')

In [49]:
# another way to check all the metrics
metriclist = ['accuracy','weightedPrecision','weightedRecall','weightedTruePositiveRate', 'weightedFalsePositiveRate','weightedFMeasure','truePositiveRateByLabel', 'falsePositiveRateByLabel','precisionByLabel','recallByLabel','fMeasureByLabel', 'logLoss','hammingLoss']
for metric in metriclist:
  evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'newletter', metricName = metric)
  print(metric,":",evaluator.evaluate(results))

accuracy : 0.3739612188365651
weightedPrecision : 0.41473212462948766
weightedRecall : 0.37396121883656513
weightedTruePositiveRate : 0.37396121883656513
weightedFalsePositiveRate : 0.026334646796937557
weightedFMeasure : 0.33291904559175134
truePositiveRateByLabel : 0.5523255813953488
falsePositiveRateByLabel : 0.013519049569848422
precisionByLabel : 0.5900621118012422
recallByLabel : 0.5523255813953488
fMeasureByLabel : 0.5705705705705705
logLoss : 2.0915989459045528
hammingLoss : 0.6260387811634349
