# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Multi classification (Decision Trees & SVM)** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

---
**Alumnos**: David Abraham Naranjo, Benjamin Zarate y Angel Cortes

In [None]:
# findspark.init() runs before the first `pyspark` import in this notebook.
# NOTE(review): presumably it locates the local Spark installation and exposes
# PySpark on sys.path — confirm against the findspark documentation.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [2]:
from pyspark.sql import SparkSession

# Describe the session one option per line, then obtain it with getOrCreate().
session_builder = (
    SparkSession.builder
    .appName("Multi classification (Decision Trees & SVM)")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
)
spark = session_builder.getOrCreate()

# Handle to the underlying SparkContext; the last cell uses it to stop Spark.
sc = spark.sparkContext

# Session-level SQL setting: 5 shuffle partitions.
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/10 20:26:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/jovyan


### Leer el csv

In [4]:
from team_name.spark_utils import SparkUtils

# (column name, type name) pairs handed to the project's schema helper.
iris_columns = [
    ("Id", "integer"),
    ("SepalLengthCm", "float"),
    ("SepalWidthCm", "float"),
    ("PetalLengthCm", "float"),
    ("PetalWidthCm", "float"),
    ("Species", "string"),
]
schema = SparkUtils.generate_schema(iris_columns)

# Read the CSV with the explicit schema and the header option enabled.
csv_reader = spark.read.schema(schema).option("header", "true")
data = csv_reader.csv("/home/jovyan/notebooks/data/Iris.csv")

# Quick sanity check: column types plus the first five rows.
data.printSchema()
data.show(5)

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: float (nullable = true)
 |-- SepalWidthCm: float (nullable = true)
 |-- PetalLengthCm: float (nullable = true)
 |-- PetalWidthCm: float (nullable = true)
 |-- Species: string (nullable = true)



                                                                                

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



### Assemble the features into a single vector column

In [5]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Turn the string "Species" column into a numeric "label" column.
data_indexer = StringIndexer(inputCol="Species", outputCol="label")
indexer_model = data_indexer.fit(data)
iris_indexed = indexer_model.transform(data)

# Combine the four measurement columns into a single "features" vector column.
measurement_cols = [
    "SepalLengthCm",
    "SepalWidthCm",
    "PetalLengthCm",
    "PetalWidthCm",
]
data_assembler = VectorAssembler(inputCols=measurement_cols, outputCol="features")
iris_assembled = data_assembler.transform(iris_indexed)

# Keep only the two columns the classifiers consume.
data_iris = iris_assembled.select("label", "features")

                                                                                

### Split the data into training and test sets (80% training, 20% testing)

In [6]:
# Split weights: 80% for training, 20% for testing; seed 42 is passed to randomSplit.
split_weights = [0.8, 0.2]
train_df, test_df = data_iris.randomSplit(split_weights, seed=42)

### Show the training set

In [7]:
# Preview the training split. show() prints the top 20 rows, so the output
# stays bounded. The raw dataset is already previewed where it is loaded, so
# no separate "Original Dataset" header is printed here.
print("train set")
train_df.show()

Original Dataset
train set
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[4.30000019073486...|
|  0.0|[4.40000009536743...|
|  0.0|[4.40000009536743...|
|  0.0|[4.5,2.2999999523...|
|  0.0|[4.59999990463256...|
|  0.0|[4.59999990463256...|
|  0.0|[4.69999980926513...|
|  0.0|[4.69999980926513...|
|  0.0|[4.80000019073486...|
|  0.0|[4.80000019073486...|
|  0.0|[4.80000019073486...|
|  0.0|[4.80000019073486...|
|  0.0|[4.90000009536743...|
|  0.0|[4.90000009536743...|
|  0.0|[4.90000009536743...|
|  0.0|[5.0,3.0,1.600000...|
|  0.0|[5.0,3.2000000476...|
|  0.0|[5.0,3.2999999523...|
|  0.0|[5.0,3.4000000953...|
|  0.0|[5.0,3.5,1.299999...|
+-----+--------------------+
only showing top 20 rows



### Create a Decision Tree model

In [8]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the (still untrained) Decision Tree estimator: it reads the
# target from "label" and the inputs from "features".
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
)

# TRAINING

In [9]:
# Fit the Decision Tree on the training split.
dt_model = dt.fit(train_df)

# Print the learned tree structure (the model's debug string).
tree_description = dt_model.toDebugString
print(f"Decision Tree model summary:{tree_description}")

Decision Tree model summary:DecisionTreeClassificationModel: uid=DecisionTreeClassifier_386aa629348f, depth=5, numNodes=15, numClasses=3, numFeatures=4
  If (feature 2 <= 2.449999988079071)
   Predict: 0.0
  Else (feature 2 > 2.449999988079071)
   If (feature 2 <= 4.75)
    Predict: 1.0
   Else (feature 2 > 4.75)
    If (feature 3 <= 1.75)
     If (feature 2 <= 4.950000047683716)
      Predict: 1.0
     Else (feature 2 > 4.950000047683716)
      If (feature 3 <= 1.6500000357627869)
       Predict: 2.0
      Else (feature 3 > 1.6500000357627869)
       Predict: 1.0
    Else (feature 3 > 1.75)
     If (feature 2 <= 4.8500001430511475)
      If (feature 0 <= 5.950000047683716)
       Predict: 1.0
      Else (feature 0 > 5.950000047683716)
       Predict: 2.0
     Else (feature 2 > 4.8500001430511475)
      Predict: 2.0



# PREDICTIONS

In [10]:
# Score the held-out test split with the fitted Decision Tree.
predictions = dt_model.transform(test_df)

# Display each feature vector next to the class predicted for it.
predicted_view = predictions.select("features", "prediction")
predicted_view.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[4.40000009536743...|       0.0|
|[4.59999990463256...|       0.0|
|[4.59999990463256...|       0.0|
|[4.80000019073486...|       0.0|
|[4.90000009536743...|       0.0|
|[5.0,3.4000000953...|       0.0|
|[5.09999990463256...|       0.0|
|[5.09999990463256...|       0.0|
|[5.5,3.5,1.299999...|       0.0|
|[5.5,4.1999998092...|       0.0|
|[5.69999980926513...|       0.0|
|[5.80000019073486...|       0.0|
|   [5.0,2.0,3.5,1.0]|       1.0|
|[5.40000009536743...|       1.0|
|[5.59999990463256...|       1.0|
|[5.69999980926513...|       1.0|
|[6.0,2.7000000476...|       2.0|
|[4.90000009536743...|       1.0|
|[6.19999980926513...|       2.0|
|[6.30000019073486...|       2.0|
+--------------------+----------+
only showing top 20 rows



# MODEL TESTING

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Evaluator metric name -> label used in the printed report, in report order.
metric_labels = {
    "accuracy": "Accuracy",
    "weightedPrecision": "Precision",
    "weightedRecall": "Recall",
    "f1": "F1 Score",
}

# Evaluate and report one metric at a time, overriding metricName per call.
scores = {}
for metric_name, report_label in metric_labels.items():
    scores[metric_name] = evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    print(f"{report_label}: {scores[metric_name]}")

# Expose each score under its own name; `f1` is reused in the final comparison.
accuracy = scores["accuracy"]
precision = scores["weightedPrecision"]
recall = scores["weightedRecall"]
f1 = scores["f1"]

Accuracy: 0.9166666666666666
Precision: 0.9166666666666666
Recall: 0.9166666666666666
F1 Score: 0.9166666666666667


### SVM Model with OneVsRest

In [12]:
from pyspark.ml.classification import LinearSVC, OneVsRest

# Base linear SVM: 10 iterations at most, regularization 0.1.
svm = LinearSVC(
    maxIter=10,
    regParam=0.1,
)

# Wrap the SVM in OneVsRest so it can be fitted on the three-class labels.
ovr = OneVsRest(classifier=svm)

# Train on the training split, then score the test split.
model_ovr = ovr.fit(train_df)
predictions_svm = model_ovr.transform(test_df)

25/05/10 20:27:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/10 20:27:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


# MODEL TESTING SVM

In [17]:
# Evaluate the SVM predictions with a dedicated evaluator. The metric is set
# on this evaluator itself rather than through a Param borrowed from the
# Decision Tree cell's `evaluator`: a Param belongs to the instance that
# created it, so this cell should not rely on another evaluator to select F1.
evaluator_svm = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

f1_svm = evaluator_svm.evaluate(predictions_svm)

print(f"F1 Score: {f1_svm}")

[Stage 118:>                                                        (0 + 1) / 1]

F1 Score: 0.9166666666666667


                                                                                

## DT Model vs SVM Model

In [18]:
# Side-by-side F1 report for the two models, one line per model.
comparison_lines = [
    f"Decision Tree - F1: {f1}",
    f"SVM (OvA)     - F1: {f1_svm}",
]
print("\n".join(comparison_lines))

Decision Tree - F1: 0.9166666666666667
SVM (OvA)     - F1: 0.9166666666666667


In [19]:
# Stop the SparkContext now that the notebook is done with it.
sc.stop()