# Import Libraries

In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=3dd97f706bc0d7543e6565b223c933402f4fb11ce486cc3410d16022ac55682a
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,SQLContext

# Create Spark Context

In [8]:
# Build the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("ML_Classifications")
    .getOrCreate()
)



In [9]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext
# NOTE(review): SQLContext is documented as deprecated since Spark 3.0 in favour
# of SparkSession, and `sqlContext` is not referenced by any later visible cell
# -- confirm whether it is still needed.
sqlContext = SQLContext(sc)

In [None]:
# Upload files (only needed when running in Google Colab)

In [10]:
# Upload the dataset through the Colab file picker.
# The google.colab package only exists inside Colab, so outside of it we fall
# back to an empty mapping and expect the CSV to already be in the working
# directory instead of aborting the whole notebook with an ImportError.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()


Saving drybeans.csv to drybeans.csv


In [13]:
# Load the dry-beans CSV: the first row is the header and column types are inferred.
file = 'drybeans.csv'
df = spark.read.options(header='true', inferSchema=True).csv(file)

In [14]:
# List the column names of the loaded DataFrame.
df.columns

['Area',
 'Perimeter',
 'MajorAxisLength',
 'MinorAxisLength',
 'AspectRation',
 'Eccentricity',
 'ConvexArea',
 'EquivDiameter',
 'Extent',
 'Solidity',
 'roundness',
 'Compactness',
 'ShapeFactor1',
 'ShapeFactor2',
 'ShapeFactor3',
 'ShapeFactor4',
 'Class']

In [72]:
# Print each column's inferred type.
# NOTE(review): the recorded output includes a `label` column, which is only
# added by the StringIndexer cell further down -- this cell was executed out of
# order, so a fresh top-to-bottom run will not show `label` here.
df.printSchema()

root
 |-- Area: integer (nullable = true)
 |-- Perimeter: double (nullable = true)
 |-- MajorAxisLength: double (nullable = true)
 |-- MinorAxisLength: double (nullable = true)
 |-- AspectRation: double (nullable = true)
 |-- Eccentricity: double (nullable = true)
 |-- ConvexArea: integer (nullable = true)
 |-- EquivDiameter: double (nullable = true)
 |-- Extent: double (nullable = true)
 |-- Solidity: double (nullable = true)
 |-- roundness: double (nullable = true)
 |-- Compactness: double (nullable = true)
 |-- ShapeFactor1: double (nullable = true)
 |-- ShapeFactor2: double (nullable = true)
 |-- ShapeFactor3: double (nullable = true)
 |-- ShapeFactor4: double (nullable = true)
 |-- Class: string (nullable = true)
 |-- label: double (nullable = false)



In [15]:
# Summary statistics (count, mean, stddev, min, max) converted to pandas and
# transposed so that every DataFrame column becomes one row.
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Area,13611,53048.284549261625,29324.09571688207,20420,254616
Perimeter,13611,855.2834585996654,214.28969589196151,524.736,1985.37
MajorAxisLength,13611,320.1418673032194,85.6941859593335,183.601165,738.8601535
MinorAxisLength,13611,202.2707140828817,44.97009129411471,122.5126535,460.1984968
AspectRation,13611,1.5832419790188144,0.24667845568580432,1.024867596,2.430306447
Eccentricity,13611,0.750894929372346,0.09200176320620888,0.218951263,0.911422968
ConvexArea,13611,53768.20020571596,29774.915817000012,20684,263261
EquivDiameter,13611,253.06421992490445,59.17712014871156,161.2437642,569.3743583
Extent,13611,0.7497327873564055,0.049086366843964224,0.555314717,0.866194641


In [16]:
# Peek at the first five rows of a handful of columns.
df.select(
    "Area",
    "Perimeter",
    "Solidity",
    "roundness",
    "Compactness",
    "Class",
).show(5)

+-----+---------+-----------+-----------+-----------+-----+
| Area|Perimeter|   Solidity|  roundness|Compactness|Class|
+-----+---------+-----------+-----------+-----------+-----+
|28395|  610.291|0.988855999|0.958027126|0.913357755|SEKER|
|28734|  638.018|0.984985603|0.887033637|0.953860842|SEKER|
|29380|   624.11|0.989558774|0.947849473|0.908774239|SEKER|
|30008|  645.884|0.976695743|0.903936374|0.928328835|SEKER|
|30140|  620.134| 0.99089325|0.984877069|0.970515523|SEKER|
+-----+---------+-----------+-----------+-----------+-----+
only showing top 5 rows



In [17]:
# Number of rows per bean class, smallest class first.
(
    df.groupBy('Class')
    .count()
    .orderBy('count')
    .show()
)

+--------+-----+
|   Class|count|
+--------+-----+
|  BOMBAY|  522|
|BARBUNYA| 1322|
|    CALI| 1630|
|   HOROZ| 1928|
|   SEKER| 2027|
|    SIRA| 2636|
|DERMASON| 3546|
+--------+-----+



In [18]:
# Convert the string Class column into a numeric `label` column.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="Class", outputCol="label")
# Drop any `label` left over from a previous run of this cell (drop() ignores
# columns that are not present). Without this, re-running the cell fails
# because the indexer's output column already exists on `df`.
df_unlabelled = df.drop("label")
df = indexer.fit(df_unlabelled).transform(df_unlabelled)

In [19]:
# Row count per numeric label, smallest first, to check the encoding against
# the per-class counts shown earlier.
df.groupBy('label').count().orderBy('count').show()

+-----+-----+
|label|count|
+-----+-----+
|  6.0|  522|
|  5.0| 1322|
|  4.0| 1630|
|  3.0| 1928|
|  2.0| 2027|
|  1.0| 2636|
|  0.0| 3546|
+-----+-----+



#  Classification

In [20]:
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [21]:
# Use every column except the target (`Class`) and its numeric encoding
# (`label`) as a feature. Selecting by name rather than slicing off the last
# two positions stays correct if the column order ever changes.
featureColumns = [name for name in df.columns if name not in ("Class", "label")]

In [22]:
# Pack the feature columns into a single vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=featureColumns,
)
df_assembled = assembler.transform(df)

In [23]:
# Inspect the first 10 rows, including the newly added `features` vector column.
df_assembled.show(10)

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+-----+-----+--------------------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|Class|label|            features|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+-----+-----+--------------------+
|28395|  610.291|    208.1781167|     173.888747| 1.197191424| 0.549812187|     28715|  190.1410973|0.763922518|0.988855999|0.958027126|0.913357755| 0.007331506| 0.003147289| 0.834222388| 0.998723889|SEKER|  2.0|[28395.0,610.291,...|
|28734|  638.018|    200.5247957|    182.7344194| 1.097356461| 0

In [24]:
# 80/20 train/test split using a fixed seed.
trainingData, testData = df_assembled.randomSplit([0.8, 0.2], seed=13234)

In [25]:
# Row counts of the two splits as a (train, test) tuple.
(trainingData.count(), testData.count())

(10856, 2755)

In [26]:
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression

# Fit a logistic-regression classifier on the training split.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
model = lr.fit(trainingData)

In [27]:
#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=5,minInstancesPerNode=20, impurity="gini")
#pipeline = Pipeline(stages=[dt])
#model = pipeline.fit(trainingData)

In [28]:
# Score the held-out test split with the fitted model.
predictions = model.transform(testData)

In [29]:
# Display scored test rows, including the rawPrediction / probability /
# prediction columns added by the model.
predictions.show()

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|   Class|label|            features|       rawPrediction|         probability|prediction|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
|21397|  535.436|    192.5302973|    141.6521869| 1.359176314| 0.677264398|     21731|  165.0560709|0.772650128|0.984630252|0.9378

In [30]:
# Show the model inputs and outputs next to the true label for the first 10 rows.
# The model's column is spelled `rawPrediction` in the full output; the
# lowercase spelling used here still resolved in the recorded run.
predictions.select("features","rawprediction","probability","prediction", "label").show(10)

+--------------------+--------------------+--------------------+----------+-----+
|            features|       rawprediction|         probability|prediction|label|
+--------------------+--------------------+--------------------+----------+-----+
|[21397.0,535.436,...|[22.7904027277230...|[0.99997418942471...|       0.0|  0.0|
|[21706.0,546.187,...|[23.1095114002520...|[0.99994026721222...|       0.0|  0.0|
|[21830.0,543.295,...|[23.9089584754548...|[0.99984677198001...|       0.0|  0.0|
|[21961.0,551.696,...|[23.6182768552975...|[0.99993871558807...|       0.0|  0.0|
|[22158.0,542.436,...|[22.8611735301309...|[0.99997474626916...|       0.0|  0.0|
|[22162.0,568.07,2...|[25.3774634256572...|[0.98561492453775...|       0.0|  0.0|
|[22461.0,544.584,...|[20.4813579055176...|[0.99972670525664...|       0.0|  0.0|
|[22687.0,553.435,...|[22.9897326973708...|[0.99998421459430...|       0.0|  0.0|
|[22721.0,553.6,20...|[21.7195729230553...|[0.99996223979091...|       0.0|  0.0|
|[22777.0,563.86

In [31]:
# DataFrame.show() prints the rows and returns None, so its result must not be
# assigned: the original bound `prediction_save` to None here. The name is
# bound to an actual DataFrame by a later cell before it is used.
predictions.select("features","rawprediction","probability","prediction", "label").show()

+--------------------+--------------------+--------------------+----------+-----+
|            features|       rawprediction|         probability|prediction|label|
+--------------------+--------------------+--------------------+----------+-----+
|[21397.0,535.436,...|[22.7904027277230...|[0.99997418942471...|       0.0|  0.0|
|[21706.0,546.187,...|[23.1095114002520...|[0.99994026721222...|       0.0|  0.0|
|[21830.0,543.295,...|[23.9089584754548...|[0.99984677198001...|       0.0|  0.0|
|[21961.0,551.696,...|[23.6182768552975...|[0.99993871558807...|       0.0|  0.0|
|[22158.0,542.436,...|[22.8611735301309...|[0.99997474626916...|       0.0|  0.0|
|[22162.0,568.07,2...|[25.3774634256572...|[0.98561492453775...|       0.0|  0.0|
|[22461.0,544.584,...|[20.4813579055176...|[0.99972670525664...|       0.0|  0.0|
|[22687.0,553.435,...|[22.9897326973708...|[0.99998421459430...|       0.0|  0.0|
|[22721.0,553.6,20...|[21.7195729230553...|[0.99996223979091...|       0.0|  0.0|
|[22777.0,563.86

In [32]:
# Persist the (prediction, label) pairs as CSV with a header row.
# Uses the built-in CSV writer instead of the legacy "com.databricks.spark.csv"
# format name, and overwrites earlier output so the cell can be re-run (the
# default save mode raises when the "predictions" path already exists).
predictions.select("prediction", "label").write.csv(
    "predictions",
    header=True,
    mode="overwrite",
)

In [33]:
# Keep only the predicted and true label columns, in that order; this frame is
# what the MulticlassMetrics cell below converts to an RDD of tuples.
prediction_save=predictions.select("prediction", "label")

# Evaluations

In [34]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [35]:
# Evaluator comparing the `prediction` column with `label`, reporting accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [36]:
# Compute accuracy on the scored test set and print it in compact %g form.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % accuracy)

Accuracy = 0.931397 


In [37]:
# Evaluate model performance: accuracy on the scored test set, full precision.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy =", accuracy)


Accuracy = 0.9313974591651543


In [38]:
# Weighted recall on the scored test set.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="weightedRecall",
)
recall = evaluator.evaluate(predictions)
print("Recall =", recall)


Recall = 0.9313974591651542


In [39]:
# F1 score on the scored test set.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
f1_score = evaluator.evaluate(predictions)
print("F1 score = ", f1_score)


F1 score =  0.9314690565989844


In [40]:
# Build RDD-based multiclass metrics from the rows of `prediction_save`, each
# row converted to a plain tuple.
# NOTE(review): assumes `prediction_save` holds exactly (prediction, label) in
# that order -- confirm against the cell that defines it.
metrics = MulticlassMetrics(prediction_save.rdd.map(tuple))



In [41]:
# Confusion matrix as a NumPy array, transposed.
# NOTE(review): pyspark documents confusionMatrix() with predicted classes in
# columns; the transpose would therefore put predictions in rows -- confirm
# which orientation is intended before interpreting the off-diagonal cells.
metrics.confusionMatrix().toArray().transpose()

array([[668.,  50.,   2.,   1.,   0.,   0.,   0.],
       [ 51., 473.,  13.,   3.,   3.,   7.,   0.],
       [  6.,   5., 400.,   0.,   0.,   2.,   0.],
       [  3.,   7.,   1., 363.,   4.,   0.,   0.],
       [  0.,   4.,   1.,   3., 317.,  13.,   0.],
       [  0.,   1.,   4.,   1.,   4., 240.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0., 105.]])


# Try with fewer features / try with a Decision Tree

In [42]:
# Stop the SparkContext obtained from the session earlier in the notebook.
sc.stop()

# Exercise: Diabetes data