# ML Tuning: model selection and hyperparameter tuning
## Ejemplo: Cross-Validation

Código base: https://spark.apache.org/docs/latest/ml-tuning.html

___
## Preparar el ambiente
___

In [1]:
# Environment bootstrap for the notebook.
# findspark.init() is deliberately called before `import pyspark`;
# presumably it locates the local Spark installation and makes pyspark
# importable -- confirm against the findspark documentation.
import findspark
findspark.init()

# Imported only after findspark.init() has run (order matters here).
import pyspark

In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every later cell: reuse an existing
# session if there is one, otherwise create it under this application name.
spark = (
    SparkSession.builder
    .appName("ejemplo cross-validation")
    .getOrCreate()
)

___
## Primera clasificación: train/test
___

In [3]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

# Read the whole labelled dataset (libsvm format) into a DataFrame.
data = spark.read.format("libsvm").load(
    "sample_multiclass_classification_data.txt"
)

# Seeded random split with weights 0.6 (train) and 0.4 (test).
splits = data.randomSplit([0.6, 0.4], seed=1234)
train, test = splits

# Network topology: 4 input features, hidden layers of 5 and 4 units,
# and 3 output classes.
layers = [4, 5, 4, 3]

# Configure the multilayer perceptron; the fixed seed keeps runs repeatable.
trainer = MultilayerPerceptronClassifier(
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Fit on the training split only.
model = trainer.fit(train)

# Score the held-out split and report accuracy.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(f"Test set accuracy = {evaluator.evaluate(predictionAndLabels)}")

Test set accuracy = 0.9523809523809523


____

## Segunda clasificación: cross-validation
___

In [4]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Load the full labelled dataset (libsvm format).
data = spark.read.format("libsvm")\
    .load("sample_multiclass_classification_data.txt")

# Split the data into train-val (0.8) and test (0.2) with a fixed seed.
# Cross-validation only ever sees `train`; `test` is kept for the final score.
splits = data.randomSplit([0.8, 0.2], 1234)
train = splits[0]
test = splits[1]

# Network topology: 4 input features, hidden layers of 5 and 4 units,
# and 3 output classes.
layers = [4, 5, 4, 3]

# Estimator to be tuned. Its hyperparameters are supplied through the
# param grid below rather than through the constructor.
lr = MultilayerPerceptronClassifier()

# Single-point grid (one candidate per parameter). Add more values to any
# list to make the cross-validator actually search.
grid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [100]) \
    .addGrid(lr.layers, [layers]) \
    .addGrid(lr.blockSize, [128]) \
    .addGrid(lr.seed, [1234]) \
    .build()

# No metricName is given here, so the evaluator's default metric drives
# model selection during cross-validation.
evaluator = MulticlassClassificationEvaluator()

# The estimator must be the same instance whose params were used to build
# the grid: the grid entries are keyed by `lr`'s Param objects, so handing a
# different estimator instance to CrossValidator would leave the grid values
# unapplied and make this cell depend on state from an earlier cell.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=grid,
                          evaluator=evaluator,
                          parallelism=2,
                          numFolds=4)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

# Compute accuracy on the held-out test set using the best model found.
result = cvModel.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.972972972972973
