In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 63.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=5057a14268d538b65a7b6ab8ddf0838cbf4b45d76ad6363684ea1c1d703a5814
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [3]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()
# Load training data
data = spark.read.csv("Iris.csv",inferSchema=True,header=True)


In [4]:
data.dtypes

[('Id', 'int'),
 ('SepalLengthCm', 'double'),
 ('SepalWidthCm', 'double'),
 ('PetalLengthCm', 'double'),
 ('PetalWidthCm', 'double'),
 ('Species', 'string')]

In [5]:
data.show(3)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 3 rows



In [18]:
data.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [6]:

from pyspark.ml.feature import VectorAssembler

cols = data.columns[1:5] # our features

assembler = VectorAssembler(inputCols=cols, outputCol='features')
assembled = assembler.transform(data)
assembled.show(5)

+---+-------------+------------+-------------+------------+-----------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|
+---+-------------+------------+-------------+------------+-----------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+---+-------------+------------+-------------+------------+-----------+-----------------+
only showing top 5 rows



In [7]:
from pyspark.ml.feature import( OneHotEncoder,StringIndexer)
indexer = StringIndexer(inputCol="Species", outputCol="label")

indexed = indexer.fit(assembled).transform(assembled)


In [8]:
# remove the unnecessary column
indexed = indexed.drop('Species')


In [9]:
#onehot encoding
encoder = OneHotEncoder(inputCols=["label"], outputCols=["label_en"])
encoded = encoder.fit(indexed).transform(indexed)
encoded.show(2)


+---+-------------+------------+-------------+------------+-----------------+-----+-------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|         features|label|     label_en|
+---+-------------+------------+-------------+------------+-----------------+-----+-------------+
|  1|          5.1|         3.5|          1.4|         0.2|[5.1,3.5,1.4,0.2]|  0.0|(2,[0],[1.0])|
|  2|          4.9|         3.0|          1.4|         0.2|[4.9,3.0,1.4,0.2]|  0.0|(2,[0],[1.0])|
+---+-------------+------------+-------------+------------+-----------------+-----+-------------+
only showing top 2 rows



In [10]:
#keeping just the culmns we need to build our models

final_data=indexed.select("features","label")


In [11]:
# Split the data into train and test
train, test= final_data.randomSplit([0.8, 0.2], seed=41)
train

DataFrame[features: vector, label: double]

In [17]:
#model1
from pyspark.ml.classification import RandomForestClassifier
cl = RandomForestClassifier(labelCol='label',   featuresCol='features', maxDepth=5)

#training
model = cl.fit(train)

#predicting
predicted=model.transform(test)
predicted

DataFrame[features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [13]:
#accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')

accuracy = evaluator.evaluate(predicted)
print("Random Forest classifier ")
print('Test Accuracy is ', accuracy)

Random Forest classifier 
Test Accuracy is  0.9354838709677419


In [14]:
#model2

from pyspark.ml.classification import DecisionTreeClassifier
cl2 = DecisionTreeClassifier(labelCol='label',   featuresCol='features', maxDepth=4)

#training
model2 = cl2.fit(train)

#predicting
predicted2=model.transform(test)

In [15]:

#accuracy

evaluator2 = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')

accuracy2 = evaluator.evaluate(predicted)
print("decision tree classifier ")
print('Test Accuracy is ', accuracy2)

decision tree classifier 
Test Accuracy is  0.9354838709677419


In [21]:
#stats on y
data.printSchema() #have a look on our variables (all have the typr str)
data.groupBy("Species").count().show()
data.describe().toPandas()

#test.count() #train.count()

#statistics max min av 

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)

+---------------+-----+
|        Species|count|
+---------------+-----+
| Iris-virginica|   50|
|    Iris-setosa|   50|
|Iris-versicolor|   50|
+---------------+-----+



Unnamed: 0,summary,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,count,150.0,150.0,150.0,150.0,150.0,150
1,mean,75.5,5.843333333333335,3.0540000000000007,3.758666666666669,1.1986666666666672,
2,stddev,43.44536799245692,0.8280661279778637,0.4335943113621737,1.764420419952262,0.7631607417008414,
3,min,1.0,4.3,2.0,1.0,0.1,Iris-setosa
4,max,150.0,7.9,4.4,6.9,2.5,Iris-virginica
