# Import Libraries

In [19]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,SQLContext
import os

os.environ['JAVA_HOME'] = 'D:\\Java\\jdk1.8.0_202\\'

# Create Spark Context

In [20]:
spark = SparkSession \
    .builder \
    .appName("ML_Classifications") \
    .getOrCreate()



In [21]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [22]:
# Upload files (Only in colabs)

In [23]:
# from google.colab import files


# uploaded = files.upload()


In [24]:
file="\\Users\\kimil\\OneDrive\\Desktop\\MUIC_work\\BigData\\BigData\\SparkML\\data\\drybeans.csv"
df = spark.read.csv(file,header='true',inferSchema=True)

In [25]:
df.columns

['Area',
 'Perimeter',
 'MajorAxisLength',
 'MinorAxisLength',
 'AspectRation',
 'Eccentricity',
 'ConvexArea',
 'EquivDiameter',
 'Extent',
 'Solidity',
 'roundness',
 'Compactness',
 'ShapeFactor1',
 'ShapeFactor2',
 'ShapeFactor3',
 'ShapeFactor4',
 'Class']

In [26]:
df.printSchema()

root
 |-- Area: integer (nullable = true)
 |-- Perimeter: double (nullable = true)
 |-- MajorAxisLength: double (nullable = true)
 |-- MinorAxisLength: double (nullable = true)
 |-- AspectRation: double (nullable = true)
 |-- Eccentricity: double (nullable = true)
 |-- ConvexArea: integer (nullable = true)
 |-- EquivDiameter: double (nullable = true)
 |-- Extent: double (nullable = true)
 |-- Solidity: double (nullable = true)
 |-- roundness: double (nullable = true)
 |-- Compactness: double (nullable = true)
 |-- ShapeFactor1: double (nullable = true)
 |-- ShapeFactor2: double (nullable = true)
 |-- ShapeFactor3: double (nullable = true)
 |-- ShapeFactor4: double (nullable = true)
 |-- Class: string (nullable = true)



In [27]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Area,13611,53048.284549261625,29324.09571688207,20420,254616
Perimeter,13611,855.2834585996654,214.28969589196151,524.736,1985.37
MajorAxisLength,13611,320.1418673032194,85.6941859593335,183.601165,738.8601535
MinorAxisLength,13611,202.2707140828817,44.97009129411471,122.5126535,460.1984968
AspectRation,13611,1.5832419790188144,0.24667845568580432,1.024867596,2.430306447
Eccentricity,13611,0.750894929372346,0.09200176320620888,0.218951263,0.911422968
ConvexArea,13611,53768.20020571596,29774.915817000012,20684,263261
EquivDiameter,13611,253.06421992490445,59.17712014871156,161.2437642,569.3743583
Extent,13611,0.7497327873564055,0.049086366843964224,0.555314717,0.866194641


In [28]:
df.select(["Area","Perimeter","Solidity","roundness","Compactness","Class"]).show(5)

+-----+---------+-----------+-----------+-----------+-----+
| Area|Perimeter|   Solidity|  roundness|Compactness|Class|
+-----+---------+-----------+-----------+-----------+-----+
|28395|  610.291|0.988855999|0.958027126|0.913357755|SEKER|
|28734|  638.018|0.984985603|0.887033637|0.953860842|SEKER|
|29380|   624.11|0.989558774|0.947849473|0.908774239|SEKER|
|30008|  645.884|0.976695743|0.903936374|0.928328835|SEKER|
|30140|  620.134| 0.99089325|0.984877069|0.970515523|SEKER|
+-----+---------+-----------+-----------+-----------+-----+
only showing top 5 rows



In [29]:
df.groupBy('Class').count().orderBy('count').show()

+--------+-----+
|   Class|count|
+--------+-----+
|  BOMBAY|  522|
|BARBUNYA| 1322|
|    CALI| 1630|
|   HOROZ| 1928|
|   SEKER| 2027|
|    SIRA| 2636|
|DERMASON| 3546|
+--------+-----+



In [30]:
# Convert Class column from string to numerical values



#  Classification

In [55]:
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [56]:
featureColumns =df.columns[:-2]

In [None]:
indexer = StringIndexer(inputCol="Class", outputCol="label")
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")


In [None]:
assembled = assembler.transform(df)

In [None]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="features_std", withStd=True, withMean=True)
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

In [34]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="features", labelCol="label")


In [35]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[indexer,assembler,lr])

In [36]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(lr.fitIntercept, [False, True]) \
    .addGrid(lr.maxIter, [5, 10,20]) \
    .build()

In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(df)

In [38]:
cvModel.avgMetrics


[np.float64(0.10771821621773305),
 np.float64(0.11024982232517087),
 np.float64(0.8288508530832397),
 np.float64(0.8579349427695883),
 np.float64(0.9138862962059907),
 np.float64(0.9145236000080829)]

In [39]:

(trainin, testData) = df.randomSplit([0.8,0.2], seed = 13234 )

In [40]:
predictions = cvModel.transform(testData)

In [41]:
predictions.show()

+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
| Area|Perimeter|MajorAxisLength|MinorAxisLength|AspectRation|Eccentricity|ConvexArea|EquivDiameter|     Extent|   Solidity|  roundness|Compactness|ShapeFactor1|ShapeFactor2|ShapeFactor3|ShapeFactor4|   Class|label|            features|       rawPrediction|         probability|prediction|
+-----+---------+---------------+---------------+------------+------------+----------+-------------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+--------+-----+--------------------+--------------------+--------------------+----------+
|21397|  535.436|    192.5302973|    141.6521869| 1.359176314| 0.677264398|     21731|  165.0560709|0.772650128|0.984630252|0.9378

In [42]:
predictions.select("probability","prediction", "label").show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-----+
|probability                                                                                                                                                |prediction|label|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-----+
|[0.999999786048775,1.0073424145441893E-7,1.1321489990514008E-7,2.0830094026652968E-12,6.254532124028324E-18,6.15373156770763E-16,9.970643899322056E-17]    |0.0       |0.0  |
|[0.9999996557639533,3.2793932425477326E-7,1.6231598494916765E-8,6.512276531821016E-11,1.7885717668995145E-17,1.1320598855425688E-15,5.2689240728330136E-17]|0.0       |0.0  |
|[0.9999998177392065,1.699893723229929E-7,1.2237036657813397E-8,3.4384248104377254E-11,6.671245948125081E-18,2.12473589857096

In [43]:
prediction_save=predictions.select("rawprediction","probability","prediction", "label").show()

+--------------------+--------------------+----------+-----+
|       rawprediction|         probability|prediction|label|
+--------------------+--------------------+----------+-----+
|[24.3548276930614...|[0.99999978604877...|       0.0|  0.0|
|[23.8258393081091...|[0.99999965576395...|       0.0|  0.0|
|[24.6065439278903...|[0.99999981773920...|       0.0|  0.0|
|[23.8494655004921...|[0.99999950918119...|       0.0|  0.0|
|[24.0218520546834...|[0.99999974983603...|       0.0|  0.0|
|[24.4082920599468...|[0.99999877504810...|       0.0|  0.0|
|[22.7044814231760...|[0.99998874305469...|       0.0|  0.0|
|[23.4824999988585...|[0.99999953339581...|       0.0|  0.0|
|[22.6440671849137...|[0.99999914424307...|       0.0|  0.0|
|[22.2233773113014...|[0.99999835859942...|       0.0|  0.0|
|[22.4979797414027...|[0.99999860828675...|       0.0|  0.0|
|[22.4813299872630...|[0.99999771710537...|       0.0|  0.0|
|[22.2479949721277...|[0.99999845795825...|       0.0|  0.0|
|[21.8701174915726...|[0

In [44]:
predictions.select("prediction", "label").write.save(path="predictions",
                                                     format="com.databricks.spark.csv",
                                                     header='true')

# Evaluations

In [46]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

def evaluate(result):
    predictionAndLabels = result.select("prediction", "label")
    metrics = ["f1","precisionByLabel","recallByLabel","weightedPrecision","weightedRecall","accuracy"]
    for m in metrics:
        evaluator = MulticlassClassificationEvaluator(metricName=m)
        print(str(m) + ": " + str(evaluator.evaluate(predictionAndLabels)))
evaluate(predictions)

In [47]:
def evaluate(result):
    predictionAndLabels = result.select("prediction", "label")
    metrics = ["f1","precisionByLabel","recallByLabel","weightedPrecision","weightedRecall","accuracy"]
    for m in metrics:
        evaluator = MulticlassClassificationEvaluator(metricName=m)
        print(str(m) + ": " + str(evaluator.evaluate(predictionAndLabels)))

In [48]:
evaluate(predictions)

f1: 0.9209945149904254
precisionByLabel: 0.925561797752809
recallByLabel: 0.9052197802197802
weightedPrecision: 0.9216211770958438
weightedRecall: 0.9208711433756805
accuracy: 0.9208711433756805


In [49]:
# Evaluate model performance
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy =", accuracy)


Accuracy = 0.9208711433756805


In [50]:
prediction=predictions.select("prediction", "label")

In [51]:
metrics = MulticlassMetrics(prediction.rdd.map(tuple))



In [52]:
metrics.confusionMatrix().toArray().transpose()

array([[659.,  48.,   4.,   1.,   0.,   0.,   0.],
       [ 61., 468.,  17.,   3.,   3.,   6.,   0.],
       [  7.,   9., 394.,   0.,   0.,   1.,   0.],
       [  1.,  12.,   0., 364.,   5.,   1.,   0.],
       [  0.,   3.,   1.,   3., 313.,  20.,   0.],
       [  0.,   0.,   5.,   0.,   7., 234.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0., 105.]])

In [53]:
sc.stop()