# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Aprendizaje Automático (Machine Learning): Decision Trees** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [None]:
# Run findspark before the pyspark imports in the cells below.
# NOTE(review): per the findspark README, init() locates the local Spark
# installation and makes pyspark importable — confirm SPARK_HOME is set in
# this environment if the call fails.
import findspark
findspark.init()

#### Creación de la conexión con el cluster de Spark


In [None]:
import os

from pyspark.sql import SparkSession

# Master URL of the Spark cluster. The default is the address this notebook
# was originally written against; its host part looks like an auto-generated
# container hostname, so it can change when the cluster is recreated.
# Set the SPARK_MASTER_URL environment variable to target another cluster
# without editing this cell.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://078b2e28e517:7077")

# Build the session, or reuse the one already active in this kernel.
spark = (
    SparkSession.builder
    .appName("MLSpark-Decision-Trees")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext

# Set Spark SQL's shuffle partition count to 5; the toy dataset used in this
# notebook has only six rows.
spark.conf.set("spark.sql.shuffle.partitions", "5")

### Preparación de Datos

In [None]:
from team_name.spark_utils import SparkUtils

# Toy dataset: every tuple is one row laid out as (label, x1, x2).
data = [
    (0, 1.0, 0.5),
    (1, 2.0, 1.5),
    (0, 1.5, 0.2),
    (1, 2.2, 1.0),
    (0, 1.0, -0.3),
    (1, 2.5, 1.0),
]

# (column name, type name) pairs handed to the project's schema helper.
# NOTE(review): generate_schema is defined outside this notebook; it is
# presumably returning a Spark StructType — confirm against spark_utils.
column_spec = [("label", "integer"), ("x1", "float"), ("x2", "float")]
schema = SparkUtils.generate_schema(column_spec)

# Turn the in-memory rows into a Spark DataFrame with that schema.
df = spark.createDataFrame(data, schema=schema)

### Assemble the features into a single vector column

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine the two numeric input columns into a single "features" column.
feature_columns = ["x1", "x2"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Apply the assembler, then keep only the two columns used from here on.
assembled_df = assembler.transform(df)
data_with_features = assembled_df.select("label", "features")

### Split the data into training and test sets (80% training, 20% testing)

In [None]:
# Relative weights for the (train, test) split; the fixed seed makes the
# random assignment of rows repeatable across runs.
split_weights = [0.8, 0.2]
train_df, test_df = data_with_features.randomSplit(split_weights, seed=57)

### Show the whole dataset

In [None]:
# Print a caption followed by the rows: first the full dataset, then the
# training subset produced by the split.
for caption, frame in (
    ("Original Dataset", data_with_features),
    ("train set", train_df),
):
    print(caption)
    frame.show()

### Create a Decision Tree model

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the Decision Tree estimator only; nothing is trained in this
# cell. It reads inputs from "features" and targets from "label".
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)

# TRAINING

In [None]:
# Fit the estimator on the training split to obtain the trained model.
dt_model = dt.fit(train_df)

# Print the model's debug string after a short heading.
tree_description = dt_model.toDebugString
print("Decision Tree model summary:{0}".format(tree_description))

# PREDICTIONS

In [None]:
# Run the trained model over the held-out test rows; the result includes a
# "prediction" column alongside the input columns.
predictions = dt_model.transform(test_df)

# Display each feature vector next to the class the model predicted for it.
prediction_view = predictions.select("features", "prediction")
prediction_view.show()

# MODEL TESTING

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator is reused for every metric; it compares the "label" column
# with the "prediction" column of the DataFrame it is given.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# The metric is chosen per call through a param map, so the same evaluator
# and the same predictions are scored once for each metric name.
metric_names = ("accuracy", "weightedPrecision", "weightedRecall", "f1")
accuracy, precision, recall, f1 = [
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in metric_names
]

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

In [None]:
# Shut down at the session level once the notebook is finished.
# SparkSession.stop() also stops the underlying SparkContext (`sc`), so no
# separate sc.stop() call is needed.
spark.stop()