# Import Libraries

In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=f596f75f6e2d14b29d76c6f5e868fe6ac03ad1c4f1f9510607dad78de5235468
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [7]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,SQLContext

# Create Spark Context

In [3]:
spark = SparkSession \
    .builder \
    .appName("ML_Classifications") \
    .getOrCreate()



In [4]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)



In [5]:
# Upload files (Only in colabs)

In [6]:
from google.colab import files


uploaded = files.upload()


Saving drybeans.csv to drybeans.csv


In [8]:
file='drybeans.csv'
df = spark.read.csv(file,header='true',inferSchema=True)

In [9]:
df.columns

['Area',
 'Perimeter',
 'MajorAxisLength',
 'MinorAxisLength',
 'AspectRation',
 'Eccentricity',
 'ConvexArea',
 'EquivDiameter',
 'Extent',
 'Solidity',
 'roundness',
 'Compactness',
 'ShapeFactor1',
 'ShapeFactor2',
 'ShapeFactor3',
 'ShapeFactor4',
 'Class']

In [10]:
df.printSchema()

root
 |-- Area: integer (nullable = true)
 |-- Perimeter: double (nullable = true)
 |-- MajorAxisLength: double (nullable = true)
 |-- MinorAxisLength: double (nullable = true)
 |-- AspectRation: double (nullable = true)
 |-- Eccentricity: double (nullable = true)
 |-- ConvexArea: integer (nullable = true)
 |-- EquivDiameter: double (nullable = true)
 |-- Extent: double (nullable = true)
 |-- Solidity: double (nullable = true)
 |-- roundness: double (nullable = true)
 |-- Compactness: double (nullable = true)
 |-- ShapeFactor1: double (nullable = true)
 |-- ShapeFactor2: double (nullable = true)
 |-- ShapeFactor3: double (nullable = true)
 |-- ShapeFactor4: double (nullable = true)
 |-- Class: string (nullable = true)



In [11]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Area,13611,53048.284549261625,29324.09571688207,20420,254616
Perimeter,13611,855.2834585996654,214.28969589196151,524.736,1985.37
MajorAxisLength,13611,320.1418673032194,85.6941859593335,183.601165,738.8601535
MinorAxisLength,13611,202.2707140828817,44.97009129411471,122.5126535,460.1984968
AspectRation,13611,1.5832419790188144,0.24667845568580432,1.024867596,2.430306447
Eccentricity,13611,0.750894929372346,0.09200176320620888,0.218951263,0.911422968
ConvexArea,13611,53768.20020571596,29774.915817000012,20684,263261
EquivDiameter,13611,253.06421992490445,59.17712014871156,161.2437642,569.3743583
Extent,13611,0.7497327873564055,0.049086366843964224,0.555314717,0.866194641


In [None]:
df.select(["Area","Perimeter","Solidity","roundness","Compactness","Class"]).show(5)

+-----+---------+-----------+-----------+-----------+-----+
| Area|Perimeter|   Solidity|  roundness|Compactness|Class|
+-----+---------+-----------+-----------+-----------+-----+
|28395|  610.291|0.988855999|0.958027126|0.913357755|SEKER|
|28734|  638.018|0.984985603|0.887033637|0.953860842|SEKER|
|29380|   624.11|0.989558774|0.947849473|0.908774239|SEKER|
|30008|  645.884|0.976695743|0.903936374|0.928328835|SEKER|
|30140|  620.134| 0.99089325|0.984877069|0.970515523|SEKER|
+-----+---------+-----------+-----------+-----------+-----+
only showing top 5 rows



In [None]:
df.groupBy('Class').count().orderBy('count').show()

+--------+-----+
|   Class|count|
+--------+-----+
|  BOMBAY|  522|
|BARBUNYA| 1322|
|    CALI| 1630|
|   HOROZ| 1928|
|   SEKER| 2027|
|    SIRA| 2636|
|DERMASON| 3546|
+--------+-----+



In [12]:
# Convert Class column from string to numerical values



#  Classification

In [68]:
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [14]:
featureColumns =df.columns[:-2]

In [19]:
indexer = StringIndexer(inputCol="Class", outputCol="label")
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")


In [20]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="features", labelCol="label")


In [22]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[indexer,assembler,lr])

In [78]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(lr.fitIntercept, [False, True]) \
    .addGrid(lr.maxIter, [5, 10,20]) \
    .build()

In [79]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(df)

In [80]:
cvModel.avgMetrics


[0.10769577809569286,
 0.11021231577854324,
 0.8410523912546809,
 0.8587447567058512,
 0.913468123684896,
 0.914367917756331]

In [114]:

(trainin, testData) = df.randomSplit([0.8,0.2], seed = 13234 )

In [115]:
predictions = cvModel.transform(testData)

In [116]:
predictions.show()

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|   Class|label|            features|       rawPrediction|         probability|prediction|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
|21397|  535.436|    192.5302973|    141.6521869| 1.359176314| 0.677264398|     21731|  165.0560709|0.772650128|0.984630252|0.9378

In [117]:
predictions.select("probability","prediction", "label").show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-----+
|probability                                                                                                                                               |prediction|label|
+----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-----+
|[0.999999786048775,1.0073424145280991E-7,1.1321489990696637E-7,2.0830094026751393E-12,6.254532123841183E-18,6.153731567601947E-16,9.970643899141754E-17]  |0.0       |0.0  |
|[0.9999996557639533,3.279393242509332E-7,1.6231598495319047E-8,6.512276531874529E-11,1.788571766857678E-17,1.1320598855284279E-15,5.268924072760496E-17]  |0.0       |0.0  |
|[0.9999998177392065,1.6998937232110413E-7,1.2237036658045205E-8,3.438424810467422E-11,6.671245947972873E-18,2.1247358985455892E-1

In [118]:
prediction_save=predictions.select("rawprediction","probability","prediction", "label").show()

+--------------------+--------------------+----------+-----+
|       rawprediction|         probability|prediction|label|
+--------------------+--------------------+----------+-----+
|[24.3548276930700...|[0.99999978604877...|       0.0|  0.0|
|[23.8258393081131...|[0.99999965576395...|       0.0|  0.0|
|[24.6065439278946...|[0.99999981773920...|       0.0|  0.0|
|[23.8494655004945...|[0.99999950918119...|       0.0|  0.0|
|[24.0218520546913...|[0.99999974983603...|       0.0|  0.0|
|[24.4082920599424...|[0.99999877504810...|       0.0|  0.0|
|[22.7044814231854...|[0.99998874305469...|       0.0|  0.0|
|[23.4824999988647...|[0.99999953339581...|       0.0|  0.0|
|[22.6440671849201...|[0.99999914424307...|       0.0|  0.0|
|[22.2233773113037...|[0.99999835859942...|       0.0|  0.0|
|[22.4979797414059...|[0.99999860828675...|       0.0|  0.0|
|[22.4813299872641...|[0.99999771710537...|       0.0|  0.0|
|[22.2479949721322...|[0.99999845795825...|       0.0|  0.0|
|[21.8701174915775...|[0

In [119]:
predictions.select("prediction", "label").write.save(path="predictions",
                                                     format="com.databricks.spark.csv",
                                                     header='true')

In [120]:

76

# Evaluations

In [111]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
def evaluate(result):
    predictionAndLabels = result.select("prediction", "label")
    metrics = ["f1","precisionByLabel","recallByLabel","weightedPrecision","weightedRecall","accuracy"]
    for m in metrics:
        evaluator = MulticlassClassificationEvaluator(metricName=m)
        print(str(m) + ": " + str(evaluator.evaluate(predictionAndLabels)))

In [121]:
evaluate(predictions)

f1: 0.9209945149904254
precisionByLabel: 0.925561797752809
recallByLabel: 0.9052197802197802
weightedPrecision: 0.9216211770958438
weightedRecall: 0.9208711433756805
accuracy: 0.9208711433756805


In [122]:
# Evaluate model performance
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy =", accuracy)


Accuracy = 0.9208711433756805


In [130]:
prediction=predictions.select("prediction", "label")

In [131]:
metrics = MulticlassMetrics(prediction.rdd.map(tuple))



In [132]:
metrics.confusionMatrix().toArray().transpose()

array([[659.,  48.,   4.,   1.,   0.,   0.,   0.],
       [ 61., 468.,  17.,   3.,   3.,   6.,   0.],
       [  7.,   9., 394.,   0.,   0.,   1.,   0.],
       [  1.,  12.,   0., 364.,   5.,   1.,   0.],
       [  0.,   3.,   1.,   3., 313.,  20.,   0.],
       [  0.,   0.,   5.,   0.,   7., 234.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0., 105.]])

In [None]:
sc.stop()