# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Machine Learning: Decision Trees and Random Forest** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [1]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession

# The default master URL contains what looks like an auto-generated container
# hostname, which stops resolving once the cluster is recreated. Allow it to be
# overridden through the SPARK_MASTER_URL environment variable; when the
# variable is unset the original address is used, so existing setups keep working.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://13f256fd17f9:7077")

# Build (or reuse) the session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("ML: Decision Trees & Random Forest")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/24 00:49:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Decision Trees
## Collect Data

In [2]:
from pcamarillor.spark_utils import SparkUtils

# Toy binary-classification sample; every tuple is (label, x1, x2).
data = [
    (0, 1.0, 0.5),
    (1, 2.0, 1.5),
    (0, 1.5, 0.2),
    (1, 2.2, 1.0),
    (0, 1.0, -0.3),
    (1, 2.5, 1.0),
]

# Column name/type pairs handed to the schema helper.
toy_columns = [("label", "int"), ("x1", "float"), ("x2", "float")]
schema = SparkUtils.generate_schema(toy_columns)

# Materialise the Python list as a Spark DataFrame with that schema.
df = spark.createDataFrame(data=data, schema=schema)

### Assemble the features into a single vector column

In [3]:
from pyspark.ml.feature import VectorAssembler

# Pack x1 and x2 into one vector column called "features", then keep only
# the two columns the classifiers are configured to read.
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
data_with_features = (
    assembler
    .transform(df)
    .select("label", "features")
)

## Data splitting
### 80% training data and 20% testing data

In [4]:
# 80/20 split weights; the explicit seed is passed so the split can be repeated.
split_weights = [0.8, 0.2]
train_df, test_df = data_with_features.randomSplit(split_weights, seed=57)

### Show dataset (for debugging)

In [5]:
# Debug view: the raw rows first, then the training partition.
for title, frame in (("Original Dataset", df), ("train set", train_df)):
    print(title)
    frame.show()

Original Dataset


                                                                                

+-----+---+----+
|label| x1|  x2|
+-----+---+----+
|    0|1.0| 0.5|
|    1|2.0| 1.5|
|    0|1.5| 0.2|
|    1|2.2| 1.0|
|    0|1.0|-0.3|
|    1|2.5| 1.0|
+-----+---+----+

train set


[Stage 3:>                                                          (0 + 2) / 2]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|           [1.0,0.5]|
|    1|           [2.0,1.5]|
|    0|[1.5,0.2000000029...|
|    0|[1.0,-0.300000011...|
|    1|           [2.5,1.0]|
+-----+--------------------+



                                                                                

## Create ML Model

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the Decision Tree estimator; it is not trained in this cell.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

## Train ML Model

In [7]:
# Fit the estimator on the training partition.
dt_model = dt.fit(train_df)

# Print the textual description of the learned tree.
tree_description = dt_model.toDebugString
print("Decision Tree model summary:{0}".format(tree_description))

25/10/24 00:51:45 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 5 (= number of training instances)


Decision Tree model summary:DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7a7784f7b230, depth=1, numNodes=3, numClasses=2, numFeatures=2
  If (feature 0 <= 1.75)
   Predict: 0.0
  Else (feature 0 > 1.75)
   Predict: 1.0



## Persist the model

In [8]:
# Destination of the persisted Decision Tree; any model already stored at
# this path is overwritten.
model_path = "/opt/spark/work-dir/data/mlmodels/dt/dt1"
dt_writer = dt_model.write().overwrite()
dt_writer.save(model_path)

                                                                                

## Predictions

In [9]:
# Use the trained model to make predictions on the test data
predictions = dt_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction").show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[2.20000004768371...|       1.0|
+--------------------+----------+



In [10]:
from pyspark.ml.classification import DecisionTreeClassificationModel

# Retrieve the Decision Tree that was persisted at model_path.
saved_dt_model = DecisionTreeClassificationModel.load(model_path)

# Score the held-out rows again, this time with the reloaded model.
predictions = saved_dt_model.transform(test_df)

# Display only the input vector next to the predicted class.
reloaded_view = predictions.select("features", "prediction")
reloaded_view.show()

                                                                                

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[2.20000004768371...|       1.0|
+--------------------+----------+



## Test ML Model

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator serves both metrics; the metric name is overridden per call
# through the parameter map passed to evaluate().
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

accuracy_params = {evaluator.metricName: "accuracy"}
accuracy = evaluator.evaluate(predictions, accuracy_params)
print(f"Accuracy: {accuracy}")

f1_params = {evaluator.metricName: "f1"}
f1 = evaluator.evaluate(predictions, f1_params)
print(f"F1 Score: {f1}")

Accuracy: 1.0
F1 Score: 1.0


# Random Forest

In [13]:
from pyspark.ml.classification import RandomForestClassifier

# Settings for the forest: 3 trees, maximum depth 5, fixed seed.
rf_settings = {
    "labelCol": "label",
    "featuresCol": "features",
    "numTrees": 3,
    "maxDepth": 5,
    "seed": 42,
}
rf = RandomForestClassifier(**rf_settings)

# Fit on the training partition.
rf_model = rf.fit(train_df)

# Persist the fitted forest (overwriting any previous copy), then print its
# textual description.
rf_path = "/opt/spark/work-dir/data/mlmodels/rf/rf1"
rf_model.write().overwrite().save(rf_path)
forest_description = rf_model.toDebugString
print(f"Random forest model generated: {forest_description}")

25/10/24 01:01:19 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 5 (= number of training instances)


Random forest model generated: RandomForestClassificationModel: uid=RandomForestClassifier_a45ac1880507, numTrees=3, numClasses=2, numFeatures=2
  Tree 0 (weight 1.0):
    Predict: 1.0
  Tree 1 (weight 1.0):
    If (feature 0 <= 1.75)
     Predict: 0.0
    Else (feature 0 > 1.75)
     Predict: 1.0
  Tree 2 (weight 1.0):
    Predict: 0.0



In [14]:
from pyspark.ml.classification import RandomForestClassificationModel

# Reload the persisted forest through the model class rather than through the
# in-memory rf_model instance, so this cell really exercises the saved copy
# and the import above is actually used.
rf_model_saved = RandomForestClassificationModel.load(rf_path)

# Make predictions on test data
predictions_rf = rf_model_saved.transform(test_df)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# Report the Random Forest metrics. The previous version printed `accuracy`
# and `f1`, which hold the Decision Tree results from an earlier cell.
accuracy_rf = evaluator.evaluate(predictions_rf,
                                 {evaluator.metricName: "accuracy"})
print(f"Accuracy of Random Forest: {accuracy_rf}")

f1_rf = evaluator.evaluate(predictions_rf,
                           {evaluator.metricName: "f1"})
print(f"F1 Score of Random Forest: {f1_rf}")

Accuracy of Random Forest: 1.0
F1 Score of Random Forest: 1.0


# Lab 12: Classifying Iris Dataset

![image.png](attachment:image.png)

# Data collection

In [16]:
# Column name/type pairs describing the Iris CSV.
iris_columns = [
    ("Id", "int"),
    ("SepalLengthCm", "float"),
    ("SepalWidthCm", "float"),
    ("PetalLengthCm", "float"),
    ("PetalWidthCm", "float"),
    ("Species", "string"),
]
iris_schema = SparkUtils.generate_schema(iris_columns)

# Source: https://raw.githubusercontent.com/selva86/datasets/master/Iris.csv

# Read the CSV data with a header row and the explicit schema defined above.
iris_reader = spark.read.option("header", "true").schema(iris_schema)
iris_df = iris_reader.csv("/opt/spark/work-dir/data/ml/decision_trees/")

# Quick sanity check: column types followed by the first five rows.
iris_df.printSchema()
iris_df.show(5)


root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: float (nullable = true)
 |-- SepalWidthCm: float (nullable = true)
 |-- PetalLengthCm: float (nullable = true)
 |-- PetalWidthCm: float (nullable = true)
 |-- Species: string (nullable = true)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows


                                                                                

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# The four measurement columns used as model inputs.
feature_cols = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

# Stage 1: turn the Species strings into a numeric "label" column.
label_indexer = StringIndexer(inputCol="Species", outputCol="label")

# Stage 2: pack the measurements into a single "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Stage 3: the Random Forest classifier (100 trees, depth 5, fixed seed).
rf_settings = {
    "labelCol": "label",
    "featuresCol": "features",
    "numTrees": 100,
    "maxDepth": 5,
    "seed": 42,
}
rf = RandomForestClassifier(**rf_settings)

# Chain the three stages so they are fitted and applied together.
stages = [label_indexer, assembler, rf]
pipeline = Pipeline(stages=stages)

# 80/20 train/test split of the Iris rows with a fixed seed.
train_df, test_df = iris_df.randomSplit([0.8, 0.2], seed=42)

# Fit every stage on the training rows.
model = pipeline.fit(train_df)

# Score the test rows and display the resulting DataFrame.
predictions = model.transform(test_df)
predictions.show()


## Decision Trees

### Data Splitting

### Create ML Model

### Train ML Model

### Persist ML Model

### Test ML Model

## Random Forest

### Data Splitting

### Create ML Model

### Train ML Model

### Persist ML Model

### Test ML Model

## Compare Decision Trees vs Random Forest

In [None]:
# Stop through the SparkSession rather than the bare SparkContext: this stops
# the underlying context as well and also clears the session singletons, so a
# later getOrCreate() starts from a clean state.
spark.stop()