# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Aprendizaje Automático (Machine Learning): Decision Trees** </center>

---
**Equipo**:
- Luis Raúl Acosta Mendoza
- Samantha Abigail Quintero Valadez 
- Arturo Benjamin Vergara Romo

**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
# findspark.init() is called before the first pyspark import (next code cell);
# per the findspark documentation it locates the Spark installation and adds
# its pyspark package to sys.path.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [2]:
import os

from pyspark.sql import SparkSession

# The master URL embeds a container hostname, which changes whenever the Spark
# cluster is recreated. Allow overriding it through the SPARK_MASTER_URL
# environment variable; the original address remains the default.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://be6296989c4d:7077")

# Build (or reuse) the session for this notebook.
spark = (
    SparkSession.builder
    .appName("MLSpark-Decision-Trees")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext

# Use 5 partitions for Spark SQL shuffles in this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/01 03:18:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Preparación de Datos

In [3]:

import gatubelxs.spark_utils

# (column name, Spark type name) pairs handed to the schema helper
# (presumably listed in the same order as the CSV columns — confirm).
# NOTE(review): "SepalLenghtCm" is misspelled ("Lenght"); the feature-assembly
# cell uses the same spelling, so both places must be renamed together.
columns_info = [
    ("Id", "integer"),
    ("SepalLenghtCm", "double"),
    ("SepalWidthCm", "double"),
    ("PetalLengthCm", "double"),
    ("PetalWidthCm", "double"),
    ("Species", "string"),
]

schema = gatubelxs.spark_utils.SparkUtils.generate_schema(columns_info)

# Load the Iris CSV using the explicit schema; the file has a header row.
# NOTE(review): the name `strokes_df` does not match the Iris data it holds;
# later cells reference it, so it is kept unchanged here.
strokes_df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/Iris.csv")
)

### Assemble the features into a single vector column

In [4]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Encode the string "Species" column as a numeric "label" column.
label_indexer = StringIndexer(inputCol="Species", outputCol="label")
df_label = label_indexer.fit(strokes_df).transform(strokes_df)

# Pack the four measurement columns into a single "features" vector and keep
# only the two columns the classifiers consume.
assembler = VectorAssembler(
    inputCols=[
        "SepalLenghtCm",
        "SepalWidthCm",
        "PetalLengthCm",
        "PetalWidthCm",
    ],
    outputCol="features",
)
data_with_features = assembler.transform(df_label).select("label", "features")

                                                                                

### Split the data into training and test sets (80% training, 20% testing)

In [5]:
# 80/20 train/test split with a fixed seed.
train_df, test_df = data_with_features.randomSplit(weights=[0.8, 0.2], seed=97)

### Show the full dataset and the training set

In [6]:
# Print a title followed by the first rows of each DataFrame:
# first the full labelled dataset, then the training split.
for title, frame in (
    ("Original Dataset", data_with_features),
    ("train set", train_df),
):
    print(title)
    frame.show()

Original Dataset
+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[5.1,3.5,1.4,0.2]|
|  0.0|[4.9,3.0,1.4,0.2]|
|  0.0|[4.7,3.2,1.3,0.2]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[5.0,3.6,1.4,0.2]|
|  0.0|[5.4,3.9,1.7,0.4]|
|  0.0|[4.6,3.4,1.4,0.3]|
|  0.0|[5.0,3.4,1.5,0.2]|
|  0.0|[4.4,2.9,1.4,0.2]|
|  0.0|[4.9,3.1,1.5,0.1]|
|  0.0|[5.4,3.7,1.5,0.2]|
|  0.0|[4.8,3.4,1.6,0.2]|
|  0.0|[4.8,3.0,1.4,0.1]|
|  0.0|[4.3,3.0,1.1,0.1]|
|  0.0|[5.8,4.0,1.2,0.2]|
|  0.0|[5.7,4.4,1.5,0.4]|
|  0.0|[5.4,3.9,1.3,0.4]|
|  0.0|[5.1,3.5,1.4,0.3]|
|  0.0|[5.7,3.8,1.7,0.3]|
|  0.0|[5.1,3.8,1.5,0.3]|
+-----+-----------------+
only showing top 20 rows

train set
+-----+-----------------+
|label|         features|
+-----+-----------------+
|  0.0|[4.3,3.0,1.1,0.1]|
|  0.0|[4.4,2.9,1.4,0.2]|
|  0.0|[4.4,3.0,1.3,0.2]|
|  0.0|[4.4,3.2,1.3,0.2]|
|  0.0|[4.6,3.1,1.5,0.2]|
|  0.0|[4.6,3.2,1.4,0.2]|
|  0.0|[4.6,3.4,1.4,0.3]|
|  0.0|[4.6,3.6,1.0,0.2]|
|  0.0|[4.7,3.2,1.6,0.2]|
|  0.0|[4.8

### Create a Decision Tree model

In [7]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the Decision Tree estimator; it is not trained in this cell
# (fitting happens in the next one).
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

# TRAINING

In [8]:
# Fit the Decision Tree on the training split.
dt_model = dt.fit(train_df)

# Print the learned tree (header line plus the if/else split rules).
print(f"Decision Tree model summary:{dt_model.toDebugString}")

Decision Tree model summary:DecisionTreeClassificationModel: uid=DecisionTreeClassifier_f067db00aaad, depth=5, numNodes=15, numClasses=3, numFeatures=4
  If (feature 2 <= 2.5999999999999996)
   Predict: 0.0
  Else (feature 2 > 2.5999999999999996)
   If (feature 2 <= 4.85)
    If (feature 3 <= 1.65)
     Predict: 1.0
    Else (feature 3 > 1.65)
     If (feature 1 <= 2.8499999999999996)
      Predict: 2.0
     Else (feature 1 > 2.8499999999999996)
      Predict: 1.0
   Else (feature 2 > 4.85)
    If (feature 3 <= 1.75)
     If (feature 0 <= 6.35)
      Predict: 2.0
     Else (feature 0 > 6.35)
      If (feature 0 <= 6.95)
       Predict: 1.0
      Else (feature 0 > 6.95)
       Predict: 2.0
    Else (feature 3 > 1.75)
     Predict: 2.0



# PREDICTIONS

In [9]:
# Use the trained model to make predictions on the test data
predictions = dt_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction").show()

+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[4.5,2.3,1.3,0.3]|       0.0|
|[4.7,3.2,1.3,0.2]|       0.0|
|[4.9,3.1,1.5,0.1]|       0.0|
|[5.0,3.3,1.4,0.2]|       0.0|
|[5.1,3.3,1.7,0.5]|       0.0|
|[5.1,3.8,1.5,0.3]|       0.0|
|[5.2,3.4,1.4,0.2]|       0.0|
|[5.2,4.1,1.5,0.1]|       0.0|
|[5.3,3.7,1.5,0.2]|       0.0|
|[5.4,3.9,1.3,0.4]|       0.0|
|[5.5,3.5,1.3,0.2]|       0.0|
|[5.7,3.8,1.7,0.3]|       0.0|
|[5.7,4.4,1.5,0.4]|       0.0|
|[5.0,2.3,3.3,1.0]|       1.0|
|[5.1,2.5,3.0,1.1]|       1.0|
|[5.5,2.4,3.7,1.0]|       1.0|
|[5.6,2.9,3.6,1.3]|       1.0|
|[5.6,3.0,4.1,1.3]|       1.0|
|[5.7,2.6,3.5,1.0]|       1.0|
|[5.8,2.7,4.1,1.0]|       1.0|
+-----------------+----------+
only showing top 20 rows



In [8]:
pip install numpy

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:




# Show predictions
# NOTE(review): `predictions_ovr` is only assigned in the One-vs-Rest cell
# further down, so this cell fails on a fresh "Restart & Run All". It also only
# builds the selection without calling .show(), so just the DataFrame schema
# repr is displayed.
predictions_ovr.select("features", "prediction")

DataFrame[features: vector, prediction: double]

# MODEL TESTING

In [10]:
# Decision Tree: evaluate the test-set predictions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# One evaluate() call per metric, overriding metricName for that call only.
accuracy, precision, recall, f1 = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9117647058823529
Precision: 0.9158496732026143
Recall: 0.9117647058823528
F1 Score: 0.9125951557093426


In [11]:

from pyspark.ml.classification import LinearSVC, OneVsRest

# LinearSVC is the binary base classifier; OneVsRest wraps it so it can be
# used for multi-class classification.
lsvc = LinearSVC(regParam=0.01, maxIter=10)
ovr = OneVsRest(classifier=lsvc)

# Fit on the training split, then score the test split.
ovr_model = ovr.fit(train_df)
predictions_ovr = ovr_model.transform(test_df)

# Report the F1 score of the One-vs-Rest predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)
f1 = evaluator.evaluate(predictions_ovr, {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}")

25/05/01 03:19:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/01 03:19:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
[Stage 122:>                                                        (0 + 1) / 1]

F1 Score: 0.9414532871972319


                                                                                

In [10]:
# Stop the SparkSession rather than only its SparkContext: this still stops the
# underlying context and also clears the cached session state.
spark.stop()