# ML lib hands-on:

RandomForest para predecir la magnitud del retraso

## Descripción de las variables

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType

from pyspark.ml.feature import *
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

In [3]:
# Spark Context
spark = SparkSession.builder.appName("flights").config("spark.sql.caseSensitive", "True").getOrCreate()

In [4]:
# This does nothing: Spark is lazy so the read operation will be deferred until an action is executed
flightsDF = spark.read.option("header", "true")\
                      .option("inferSchema", "true")\
                      .csv("/data/flights_jan08.csv")

flightsDF.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Ca

Planteamiento inicial

* StringIndexer para convertir variables tipo String en variables categóricas pero cuyos valores son números reales con la parte decimal a 0, tal como necesitan los algoritmos de Spark.
* Bucketizer para discretizar la columna de ArrDelay sin dar nombre a las categorías, solo numeros. Será nuestra variable target.
* VectorAssembler para unir las columnas de las features en una sola de tipo vector
* RandomForest que es un estimador, como algoritmo de predicción de la severidad
* Pipeline, que es un estimador y que incluirá todos los elementos anteriores.

## Preprocesamiento

In [5]:
# Se limpia el DF de NAs y se convierte en el type adecuado

cleanFlightsDF = flightsDF.where("ArrDelay != 'NA' and DepDelay != 'NA' and DepTime != 'NA' and ArrTime != 'NA'")\
                          .withColumn("ArrDelay", F.col("ArrDelay").cast(IntegerType()))\
                          .withColumn("DepDelay", F.col("DepDelay").cast(IntegerType()))\
                          .withColumn("ArrTime", F.col("ArrTime").cast(IntegerType()))\
                          .withColumn("DepTime", F.col("DepTime").cast(IntegerType()))

# Se definen tres categorías: <15, entre 15 y 60, >60
splitsDelays = [-float("inf"), 15, 60, float("inf")]
arrDelayBucketizer = Bucketizer(splits=splitsDelays, inputCol="ArrDelay", outputCol="ArrDelayBucketed")

# Se definen varias franjas: 00:00 - 06:00, 06:00 - 12:00, 12:00 - 18:00, 18:00 - 22:00, 22:00 - 00:00
splitsDepTime = [-1, 600, 1200, 1800, 2200, 2500]
depTimeBucketizer = Bucketizer(splits=splitsDepTime, inputCol="DepTime", outputCol="DepTimeBucketed")

### Feature Engineering and preparation

In [6]:
originIndexer = StringIndexer(inputCol = "Origin", outputCol="OriginIndexed", handleInvalid="skip")
destIndexer = StringIndexer(inputCol = "Dest", outputCol="DestIndexed", handleInvalid="skip")
dowIndexer = StringIndexer(inputCol = "DayOfWeek", outputCol="DayOfWeekIndexed", handleInvalid="skip")

vectorAssembler = VectorAssembler(inputCols = ["DepDelay", "DepTimeBucketed", "OriginIndexed", "DestIndexed", "DayOfWeekIndexed"],
                                  outputCol = "featuresVector")

### Modelo (Estimador) escogido

In [7]:
randomForest = RandomForestClassifier(featuresCol = "featuresVector", labelCol = "ArrDelayBucketed", numTrees = 50, maxBins=100)

### Pipeline creation

In [8]:
pipeline = Pipeline(stages=[arrDelayBucketizer, depTimeBucketizer, dowIndexer, originIndexer, destIndexer,
                            vectorAssembler, randomForest])

### Train-test split

In [9]:
splits = cleanFlightsDF.randomSplit([0.7, 0.3], seed = 123)
trainDF = splits[0].cache()
testDF = splits[1].cache()

In [10]:
print("Hay {0} ejemplos de entrenamiento".format(trainDF.count()))

Hay 37208 ejemplos de entrenamiento


## Modelización

In [11]:
pipelineModel = pipeline.fit(trainDF)
predictionsDF = pipelineModel.transform(testDF)

In [12]:
predictionsDF.show()

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+----------------+---------------+----------------+-------------+-----------+--------------------+--------------------+--------------------+----------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|ArrDelayBucketed|DepTimeBucketed|DayOfWeekIndexed|OriginIndexed|DestIndexed|      featuresVector|       rawPrediction|         probability|prediction|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-

### Evaluación del modelo

Se evalúan las predicciones que hemos hecho sobre el DF de test

In [13]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="ArrDelayBucketed", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictionsDF)
print("Accuracy = %g " % (accuracy))

Accuracy = 0.926591 


### Ajuste de híper-parámetros utilizando Cross Validation sobre el subconjunto de train

In [14]:
paramGrid = ParamGridBuilder() \
    .addGrid(randomForest.numTrees, [50])\
    .build()
    # .addGrid(randomForest.maxDepth, [3, 4, 5]) \
    # .build()


crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cvModel = crossval.fit(trainDF)
cvModel.bestModel

PipelineModel_e23c98914245

In [15]:
cvModel.extractParamMap()

{Param(parent='CrossValidatorModel_7acb0afe5fe8', name='seed', doc='random seed.'): -2814150791527038033,
 Param(parent='CrossValidatorModel_7acb0afe5fe8', name='numFolds', doc='number of folds for cross validation'): 3,
 Param(parent='CrossValidatorModel_7acb0afe5fe8', name='foldCol', doc="Param for the column name of user specified fold number. Once this is specified, :py:class:`CrossValidator` won't do random k-fold split. Note that this column should be integer type with range [0, numFolds) and Spark will throw exception on out-of-range fold numbers."): '',
 Param(parent='CrossValidatorModel_7acb0afe5fe8', name='estimator', doc='estimator to be cross-validated'): Pipeline_93f9fa8699a9,
 Param(parent='CrossValidatorModel_7acb0afe5fe8', name='estimatorParamMaps', doc='estimator param maps'): [{Param(parent='RandomForestClassifier_29866d1ae2a5', name='numTrees', doc='Number of trees to train (>= 1).'): 50}],
 Param(parent='CrossValidatorModel_7acb0afe5fe8', name='evaluator', doc='eval

In [16]:
cvModel.getEstimatorParamMaps()

[{Param(parent='RandomForestClassifier_29866d1ae2a5', name='numTrees', doc='Number of trees to train (>= 1).'): 50}]

In [17]:
rf = cvModel.bestModel.stages[6]
print("Número óptimo de árboles: {0}".format(rf.getNumTrees))
print("Max depth óptimo: {0}".format(rf.getOrDefault('maxDepth')))

Número óptimo de árboles: 50
Max depth óptimo: 5
