# In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Initialize Spark session
spark = SparkSession.builder.appName("DecisionTreeTuning").getOrCreate()

# Everything after session creation runs inside try/finally so the session is
# stopped even when DataFrame creation, cross-validation, or prediction raises.
try:
    # Sample data: an integer label followed by three numeric features per row
    data = [
        (0, 2.0, 1.0, 1.0),
        (1, 3.0, 2.0, 0.0),
        (0, 4.0, 3.0, 0.0),
        (1, 5.0, 4.0, 1.0),
    ]
    columns = ["label", "feature1", "feature2", "feature3"]
    df = spark.createDataFrame(data, columns)

    # Assemble the three feature columns into a single vector column
    assembler = VectorAssembler(
        inputCols=["feature1", "feature2", "feature3"],
        outputCol="features",
    )
    df = assembler.transform(df)

    # Create decision tree classifier reading the assembled vector and label
    dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

    # Define parameter grid for grid search: 3 depths x 3 minimum node sizes
    paramGrid = (
        ParamGridBuilder()
        .addGrid(dt.maxDepth, [2, 3, 4])
        .addGrid(dt.minInstancesPerNode, [1, 2, 3])
        .build()
    )

    # Create cross-validator; evaluator settings are spelled out explicitly
    # (they match the library defaults) so the selection metric is visible.
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        metricName="areaUnderROC",
    )
    cv = CrossValidator(
        estimator=dt,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3,
    )

    # Run cross-validation
    # NOTE(review): only four rows are divided across three folds, so a
    # training split may presumably end up with a single class or no rows,
    # making fit/evaluation fail or the metric meaningless -- confirm with a
    # larger dataset, and consider passing seed= for reproducible folds.
    cvModel = cv.fit(df)

    # Get best model from cross-validation
    bestModel = cvModel.bestModel

    # Make predictions using the best model (on the same rows it was fit on)
    predictions = bestModel.transform(df)
    predictions.select("features", "label", "prediction").show()
finally:
    # Stop Spark session
    spark.stop()
