In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.functions import expr, col
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression

In [16]:
# Obtain the shared Spark session (an already-running one is reused if present).
spark = (
    SparkSession.builder
    .appName('dataframe app')
    .getOrCreate()
)

21/07/21 17:58:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [17]:
# Read the zoo CSV (first row is the header, column types are inferred).
zoo_raw = spark.read.csv("zoo.csv", header=True, inferSchema=True)

# Add a binary target column: 1 when Type == 1, otherwise 0.
is_mammal_flag = expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END")
zoo_data = zoo_raw.withColumn("IsMammal", is_mammal_flag)

In [18]:
# Build the `features` / `label` columns with RFormula.
# Left of `~` is the target column, right of it are the predictor columns.
zoo_formula = RFormula(
    formula=(
        "IsMammal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic +"
        " Predator + Toothed + Backbone + Venomous + Fins + Legs+"
        "Tail + Domestic + Catsize"
    )
)

# Fit the formula on the zoo frame, then apply it to that same frame.
zoo_formula_model = zoo_formula.fit(zoo_data)
preprocessed_data = zoo_formula_model.transform(zoo_data)

In [19]:
# Split the preprocessed frame into roughly 70% training / 30% test rows.
# A fixed seed is passed so that re-running the notebook on the same input
# yields the same partition (without it every run trains and evaluates on a
# different random subset, making results non-reproducible).
SPLIT_SEED = 42
train, test = preprocessed_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [20]:
# Set up a logistic-regression estimator that reads the `features` vector
# column and the `label` column.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

In [21]:
# Fit the estimator on the training split; the result is the trained model.
fittedLR = lr.fit(dataset=train)

In [22]:
# Score every row of `preprocessed_data` with the trained model.
# NOTE: this is the full frame (training rows included), not only the
# held-out `test` split, so the predictions below are not a pure
# out-of-sample evaluation.
result = fittedLR.transform(preprocessed_data)

# Spot-check a handful of animals: true label next to the model's prediction.
spot_check_filter = expr("AnimalName in ('lobster', 'hawk', 'goat', 'crayfish', 'clam', 'hamster')")
(
    result
    .where(spot_check_filter)
    .select('AnimalName', 'label', 'prediction')
    .toPandas()
)

Unnamed: 0,AnimalName,label,prediction
0,clam,0.0,0.0
1,crayfish,0.0,0.0
2,goat,1.0,1.0
3,hamster,1.0,1.0
4,hawk,0.0,0.0
5,lobster,0.0,0.0


In [24]:
# Keep only the animal name, the true target and the predicted class,
# then print up to 200 rows of that projection.
shown_columns = ["AnimalName", "IsMammal", "prediction"]
result_extracted = result.select(*shown_columns)
result_extracted.show(n=200)

+----------+--------+----------+
|AnimalName|IsMammal|prediction|
+----------+--------+----------+
|  aardvark|       1|       1.0|
|  antelope|       1|       1.0|
|      bass|       0|       0.0|
|      bear|       1|       1.0|
|      boar|       1|       1.0|
|   buffalo|       1|       1.0|
|      calf|       1|       1.0|
|      carp|       0|       0.0|
|   catfish|       0|       0.0|
|      cavy|       1|       1.0|
|   cheetah|       1|       1.0|
|   chicken|       0|       0.0|
|      chub|       0|       0.0|
|      clam|       0|       0.0|
|      crab|       0|       0.0|
|  crayfish|       0|       0.0|
|      crow|       0|       0.0|
|      deer|       1|       1.0|
|   dogfish|       0|       0.0|
|   dolphin|       1|       1.0|
|      dove|       0|       0.0|
|      duck|       0|       0.0|
|  elephant|       1|       1.0|
|  flamingo|       0|       0.0|
|      flea|       0|       0.0|
|      frog|       0|       0.0|
|      frog|       0|       0.0|
|  fruitba