In [70]:
!pip install pyspark



In [71]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ML project').getOrCreate()

In [72]:
# load data 
df = spark.read.csv('iris.data', header = True, inferSchema = True)
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- class: string (nullable = true)



In [None]:
k=len(df.columns)
for i in range(k):
  old_name = df.columns[i]
  new_name = 'f'+str(i)
  print(old_name,new_name)
  df=df.withColumnRenamed(old_name,new_name)

In [74]:
df.show()

+---+---+---+---+-----------+
| f0| f1| f2| f3|         f4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|
|5.4|3.7|1.5|0.2|Iris-setosa|
|4.8|3.4|1.6|0.2|Iris-setosa|
|4.8|3.0|1.4|0.1|Iris-setosa|
|4.3|3.0|1.1|0.1|Iris-setosa|
|5.8|4.0|1.2|0.2|Iris-setosa|
|5.7|4.4|1.5|0.4|Iris-setosa|
|5.4|3.9|1.3|0.4|Iris-setosa|
|5.1|3.5|1.4|0.3|Iris-setosa|
|5.7|3.8|1.7|0.3|Iris-setosa|
|5.1|3.8|1.5|0.3|Iris-setosa|
+---+---+---+---+-----------+
only showing top 20 rows



In [75]:
# Show Dataset in DataFrame
import pandas as pd
pd.DataFrame(df.take(5), columns=df.columns).transpose()

Unnamed: 0,0,1,2,3,4
f0,5.1,4.9,4.7,4.6,5
f1,3.5,3,3.2,3.1,3.6
f2,1.4,1.4,1.3,1.5,1.4
f3,0.2,0.2,0.2,0.2,0.2
f4,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa


In [76]:
# Data grouping by class 
class_name = df.columns[len(df.columns)-1]
df.groupby(class_name).count().toPandas()

Unnamed: 0,f4,count
0,Iris-virginica,50
1,Iris-setosa,50
2,Iris-versicolor,50


In [77]:
# convert string to numeric 
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()
indexer.setInputCol(class_name).setOutputCol("label")
df1 = indexer.fit(df).transform(df)


In [None]:
df1.show()

+---+---+---+---+-----------+-----+
| f0| f1| f2| f3|         f4|label|
+---+---+---+---+-----------+-----+
|5.1|3.5|1.4|0.2|Iris-setosa|  0.0|
|4.9|3.0|1.4|0.2|Iris-setosa|  0.0|
|4.7|3.2|1.3|0.2|Iris-setosa|  0.0|
|4.6|3.1|1.5|0.2|Iris-setosa|  0.0|
|5.0|3.6|1.4|0.2|Iris-setosa|  0.0|
|5.4|3.9|1.7|0.4|Iris-setosa|  0.0|
|4.6|3.4|1.4|0.3|Iris-setosa|  0.0|
|5.0|3.4|1.5|0.2|Iris-setosa|  0.0|
|4.4|2.9|1.4|0.2|Iris-setosa|  0.0|
|4.9|3.1|1.5|0.1|Iris-setosa|  0.0|
|5.4|3.7|1.5|0.2|Iris-setosa|  0.0|
|4.8|3.4|1.6|0.2|Iris-setosa|  0.0|
|4.8|3.0|1.4|0.1|Iris-setosa|  0.0|
|4.3|3.0|1.1|0.1|Iris-setosa|  0.0|
|5.8|4.0|1.2|0.2|Iris-setosa|  0.0|
|5.7|4.4|1.5|0.4|Iris-setosa|  0.0|
|5.4|3.9|1.3|0.4|Iris-setosa|  0.0|
|5.1|3.5|1.4|0.3|Iris-setosa|  0.0|
|5.7|3.8|1.7|0.3|Iris-setosa|  0.0|
|5.1|3.8|1.5|0.3|Iris-setosa|  0.0|
+---+---+---+---+-----------+-----+
only showing top 20 rows



In [78]:
# Split the data
(training_data, test_data) = df1.randomSplit([0.8,0.2])

In [79]:
training_data.toPandas()

Unnamed: 0,f0,f1,f2,f3,f4,label
0,4.3,3.0,1.1,0.1,Iris-setosa,0.0
1,4.4,2.9,1.4,0.2,Iris-setosa,0.0
2,4.4,3.0,1.3,0.2,Iris-setosa,0.0
3,4.6,3.2,1.4,0.2,Iris-setosa,0.0
4,4.6,3.4,1.4,0.3,Iris-setosa,0.0
...,...,...,...,...,...,...
108,7.4,2.8,6.1,1.9,Iris-virginica,2.0
109,7.7,2.6,6.9,2.3,Iris-virginica,2.0
110,7.7,2.8,6.7,2.0,Iris-virginica,2.0
111,7.7,3.0,6.1,2.3,Iris-virginica,2.0


In [80]:
# features and class 
class_name = 'label'
feature_names = df.columns[:-1]
print(class_name)
print(feature_names)

label
['f0', 'f1', 'f2', 'f3']


In [81]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler()
assembler.setInputCols(feature_names).setOutputCol('features')

transformed_data = assembler.transform(df1)

In [82]:
transformed_data.show()

+---+---+---+---+-----------+-----+-----------------+
| f0| f1| f2| f3|         f4|label|         features|
+---+---+---+---+-----------+-----+-----------------+
|5.1|3.5|1.4|0.2|Iris-setosa|  0.0|[5.1,3.5,1.4,0.2]|
|4.9|3.0|1.4|0.2|Iris-setosa|  0.0|[4.9,3.0,1.4,0.2]|
|4.7|3.2|1.3|0.2|Iris-setosa|  0.0|[4.7,3.2,1.3,0.2]|
|4.6|3.1|1.5|0.2|Iris-setosa|  0.0|[4.6,3.1,1.5,0.2]|
|5.0|3.6|1.4|0.2|Iris-setosa|  0.0|[5.0,3.6,1.4,0.2]|
|5.4|3.9|1.7|0.4|Iris-setosa|  0.0|[5.4,3.9,1.7,0.4]|
|4.6|3.4|1.4|0.3|Iris-setosa|  0.0|[4.6,3.4,1.4,0.3]|
|5.0|3.4|1.5|0.2|Iris-setosa|  0.0|[5.0,3.4,1.5,0.2]|
|4.4|2.9|1.4|0.2|Iris-setosa|  0.0|[4.4,2.9,1.4,0.2]|
|4.9|3.1|1.5|0.1|Iris-setosa|  0.0|[4.9,3.1,1.5,0.1]|
|5.4|3.7|1.5|0.2|Iris-setosa|  0.0|[5.4,3.7,1.5,0.2]|
|4.8|3.4|1.6|0.2|Iris-setosa|  0.0|[4.8,3.4,1.6,0.2]|
|4.8|3.0|1.4|0.1|Iris-setosa|  0.0|[4.8,3.0,1.4,0.1]|
|4.3|3.0|1.1|0.1|Iris-setosa|  0.0|[4.3,3.0,1.1,0.1]|
|5.8|4.0|1.2|0.2|Iris-setosa|  0.0|[5.8,4.0,1.2,0.2]|
|5.7|4.4|1.5|0.4|Iris-setosa

In [83]:
# Split the data
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2], seed =51800364)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 127
Test Dataset Count: 23


In [84]:
training_data.toPandas()

Unnamed: 0,f0,f1,f2,f3,f4,label,features
0,4.3,3.0,1.1,0.1,Iris-setosa,0.0,"[4.3, 3.0, 1.1, 0.1]"
1,4.4,2.9,1.4,0.2,Iris-setosa,0.0,"[4.4, 2.9, 1.4, 0.2]"
2,4.4,3.0,1.3,0.2,Iris-setosa,0.0,"[4.4, 3.0, 1.3, 0.2]"
3,4.5,2.3,1.3,0.3,Iris-setosa,0.0,"[4.5, 2.3, 1.3, 0.3]"
4,4.6,3.1,1.5,0.2,Iris-setosa,0.0,"[4.6, 3.1, 1.5, 0.2]"
...,...,...,...,...,...,...,...
122,7.7,2.6,6.9,2.3,Iris-virginica,2.0,"[7.7, 2.6, 6.9, 2.3]"
123,7.7,2.8,6.7,2.0,Iris-virginica,2.0,"[7.7, 2.8, 6.7, 2.0]"
124,7.7,3.0,6.1,2.3,Iris-virginica,2.0,"[7.7, 3.0, 6.1, 2.3]"
125,7.7,3.8,6.7,2.2,Iris-virginica,2.0,"[7.7, 3.8, 6.7, 2.2]"


In [85]:
# Define the model
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='label', 
                            featuresCol='features',
                            maxDepth=5)

In [86]:
# Fit the model
model = rf.fit(training_data)

In [87]:
# Predict with the test dataset
rf_predictions = model.transform(test_data)

In [88]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'label', metricName = 'accuracy')
print('Random Forest classifier Accuracy:', multi_evaluator.evaluate(rf_predictions))

Random Forest classifier Accuracy: 0.9130434782608695


In [89]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(training_data)
dt_predictions = dtModel.transform(test_data)
dt_predictions.show(10)

+---+---+---+---+---------------+-----+-----------------+--------------+--------------------+----------+
| f0| f1| f2| f3|             f4|label|         features| rawPrediction|         probability|prediction|
+---+---+---+---+---------------+-----+-----------------+--------------+--------------------+----------+
|4.4|3.2|1.3|0.2|    Iris-setosa|  0.0|[4.4,3.2,1.3,0.2]|[45.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|4.9|2.5|4.5|1.7| Iris-virginica|  2.0|[4.9,2.5,4.5,1.7]|[0.0,38.0,0.0]|       [0.0,1.0,0.0]|       1.0|
|5.0|3.4|1.5|0.2|    Iris-setosa|  0.0|[5.0,3.4,1.5,0.2]|[45.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|5.1|3.5|1.4|0.2|    Iris-setosa|  0.0|[5.1,3.5,1.4,0.2]|[45.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|5.2|2.7|3.9|1.4|Iris-versicolor|  1.0|[5.2,2.7,3.9,1.4]|[0.0,38.0,0.0]|       [0.0,1.0,0.0]|       1.0|
|5.2|3.4|1.4|0.2|    Iris-setosa|  0.0|[5.2,3.4,1.4,0.2]|[45.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|5.2|3.5|1.5|0.2|    Iris-setosa|  0.0|[5.2,3.5,1.5,0.2

In [90]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'label', metricName = 'accuracy')
print('Decision Tree Accuracy:', multi_evaluator.evaluate(dt_predictions))

Decision Tree Accuracy: 0.9130434782608695


In [91]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(training_data)
lr_predictions = dtModel.transform(test_data)

In [92]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'label', metricName = 'accuracy')
print('Logistic Regression Accuracy:', multi_evaluator.evaluate(lr_predictions))

Logistic Regression Accuracy: 0.9130434782608695
