In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Ign:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,450 kB]
Get:14 http://a

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, FloatType, IntegerType, StringType

# Start (or reuse) a Spark session with a local master
spark = (
    SparkSession.builder
    .master("local")
    .appName("ITESO-LogisticRegression")
    .getOrCreate()
)

# Keep a handle on the Spark context and restrict logging to errors
sc = spark.sparkContext
sc.setLogLevel("ERROR")


# Explicit schema for the iris CSV: an integer id, four float measurements
# and the species name; every field is nullable.
iris_prediction_schema = StructType(
    [StructField("Id", IntegerType(), True)]
    + [
        StructField(measurement, FloatType(), True)
        for measurement in (
            "SepalLengthCm",
            "SepalWidthCm",
            "PetalLengthCm",
            "PetalWidthCm",
        )
    ]
    + [StructField("Species", StringType(), True)]
)


# Read the iris CSV using the explicit schema (header row present, permissive
# parsing), then preview the schema and the first 10 rows.
iris_df = (
    spark.read.format("csv")
    .options(header="true", mode="permissive")
    .schema(iris_prediction_schema)
    .load("/content/sample_data/iris.csv")
)
iris_df.printSchema()
iris_df.show(n=10, truncate=False)

# `when` is imported here because this cell uses it, while the only other import
# of it sits in a later cell; without this line a fresh "Restart & Run All"
# fails with NameError.
from pyspark.sql.functions import when

# Add a numeric code column for Species:
#   Iris-setosa -> 1, Iris-versicolor -> 2, Iris-virginica -> 3, anything else -> null.
# NOTE(review): the codes start at 1, and the decision tree trained from them later
# in the notebook reports numClasses=4 (an unused class 0). 0-based codes would
# avoid that — confirm before changing, since it alters the downstream outputs.
species = iris_df["Species"]
iris_df = iris_df.withColumn(
    "SpeciesCode",
    when(species == "Iris-setosa", 1)
    .when(species == "Iris-versicolor", 2)
    .when(species == "Iris-virginica", 3)
    .otherwise(None)
)

iris_df.show()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: float (nullable = true)
 |-- SepalWidthCm: float (nullable = true)
 |-- PetalLengthCm: float (nullable = true)
 |-- PetalWidthCm: float (nullable = true)
 |-- Species: string (nullable = true)

+---+-------------+------------+-------------+------------+-----------+
|Id |SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species    |
+---+-------------+------------+-------------+------------+-----------+
|1  |5.1          |3.5         |1.4          |0.2         |Iris-setosa|
|2  |4.9          |3.0         |1.4          |0.2         |Iris-setosa|
|3  |4.7          |3.2         |1.3          |0.2         |Iris-setosa|
|4  |4.6          |3.1         |1.5          |0.2         |Iris-setosa|
|5  |5.0          |3.6         |1.4          |0.2         |Iris-setosa|
|6  |5.4          |3.9         |1.7          |0.4         |Iris-setosa|
|7  |4.6          |3.4         |1.4          |0.3         |Iris-setosa|
|8  |5.0          |3.4    

In [25]:
# Count, per column, the rows whose value is null or NaN
from pyspark.sql.functions import col, isnan, when, count

iris_df.select(
    *[
        count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in iris_df.columns
    ]
).show()

+---+-------------+------------+-------------+------------+-------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species|SpeciesCode|
+---+-------------+------------+-------------+------------+-------+-----------+
|  0|            0|           0|            0|           0|      0|          0|
+---+-------------+------------+-------------+------------+-------+-----------+



In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Expose the numeric species code under the column name "label"
iris_df = iris_df.withColumnRenamed("SpeciesCode", "label")

# Pack the four measurement columns into a single "features" vector column
assembler = VectorAssembler(
    inputCols=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    outputCol="features",
)

# Apply the assembler and keep only the label and the feature vector
data_with_features = assembler.transform(iris_df).select("label", "features")

# Random train/test split with weights 0.8/0.2 and a fixed seed
train_data, test_data = data_with_features.randomSplit([0.8, 0.2], seed=29)

# Create and fit the decision tree classifier on the training split
dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dtc_model = dtc.fit(train_data)

# Run the fitted model over the test split
predictions = dtc_model.transform(test_data)


Exception ignored in: <function JavaWrapper.__del__ at 0x7852f7683b50>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'DecisionTreeClassifier' object has no attribute '_java_obj'


In [28]:

# Display the first 20 test rows: true label, predicted label and class probabilities
predictions.select(
    "label",
    "prediction",
    "probability",
).show(n=20, truncate=False)


+-----+----------+-----------------+
|label|prediction|probability      |
+-----+----------+-----------------+
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|1    |1.0       |[0.0,1.0,0.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
|2    |2.0       |[0.0,0.0,1.0,0.0]|
+-----+----------+-----------------+
only showing top 20 rows



In [30]:
# Print the fitted tree's rules as text. Feature indices follow the assembler's
# inputCols order: 0 = SepalLengthCm, 1 = SepalWidthCm, 2 = PetalLengthCm, 3 = PetalWidthCm.
tree_description = dtc_model.toDebugString
print(tree_description)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_1114f601b0ae, depth=5, numNodes=17, numClasses=4, numFeatures=4
  If (feature 2 <= 2.449999988079071)
   Predict: 1.0
  Else (feature 2 > 2.449999988079071)
   If (feature 3 <= 1.75)
    If (feature 2 <= 4.950000047683716)
     If (feature 3 <= 1.6500000357627869)
      Predict: 2.0
     Else (feature 3 > 1.6500000357627869)
      Predict: 3.0
    Else (feature 2 > 4.950000047683716)
     If (feature 3 <= 1.550000011920929)
      Predict: 3.0
     Else (feature 3 > 1.550000011920929)
      If (feature 0 <= 6.75)
       Predict: 2.0
      Else (feature 0 > 6.75)
       Predict: 3.0
   Else (feature 3 > 1.75)
    If (feature 2 <= 4.8500001430511475)
     If (feature 0 <= 5.950000047683716)
      Predict: 2.0
     Else (feature 0 > 5.950000047683716)
      Predict: 3.0
    Else (feature 2 > 4.8500001430511475)
     Predict: 3.0



In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator is reused for every metric; the metric name is overridden per call
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Evaluate the test predictions for each metric, in the order the names are unpacked
accuracy, precision, recall, f1 = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
)

print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')

Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1: 1.0
