# Árboles de Regresión

In [34]:
# Spark bootstrap, left disabled. The cells below use `sqlContext` without
# defining it, so it is presumably provided by the environment (e.g. a
# PySpark shell/kernel) — uncomment these lines where it is not predefined.
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [35]:
# Read bd5.csv through the spark-csv data source, treating the first row as
# the header and letting Spark infer the column types.
bd5 = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("bd5.csv", inferSchema=True)
)

# Make the DataFrame queryable from SQL under the table name "bd5".
sqlContext.registerDataFrameAsTable(bd5, "bd5")

In [36]:
# List each column name with the Spark type inferred while reading the CSV.
bd5.dtypes

[('Year', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('CRSDepTime', 'int'),
 ('UniqueCarrier', 'string'),
 ('TailNum', 'string'),
 ('ArrDelay', 'double'),
 ('DepDelay', 'double'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('Distance', 'double'),
 ('Cancelled', 'double'),
 ('Diverted', 'double'),
 ('CarrierDelay', 'double'),
 ('WeatherDelay', 'double'),
 ('NASDelay', 'double'),
 ('SecurityDelay', 'double'),
 ('LateAircraftDelay', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double'),
 ('Horario', 'int')]

In [37]:
from pyspark.ml.feature import StringIndexer

# Encode the carrier code as a numeric column; note the index starts at 0.
indexer = StringIndexer(
    inputCol='UniqueCarrier',
    outputCol='IndexUniqueCarrier',
)
bd6 = indexer.fit(bd5).transform(bd5)

# Show the carrier -> index mapping together with the row count per carrier.
(
    bd6.groupBy('UniqueCarrier', 'IndexUniqueCarrier')
    .count()
    .sort('IndexUniqueCarrier')
    .show()
)


+-------------+------------------+-----+
|UniqueCarrier|IndexUniqueCarrier|count|
+-------------+------------------+-----+
|           AA|               0.0| 8853|
|           UA|               1.0| 6112|
|           WN|               2.0| 5395|
|           DL|               3.0| 4239|
|           VX|               4.0| 1703|
|           NK|               5.0| 1581|
|           F9|               6.0| 1295|
|           OO|               7.0| 1166|
|           B6|               8.0|  121|
|           EV|               9.0|    1|
+-------------+------------------+-----+



## Ajuste del modelo

In [38]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Pack the five predictor columns into a single vector column, 'features'.
a1 = VectorAssembler(
    inputCols=[
        'DepDelay',
        'Distance',
        'DayOfWeek',
        'CRSDepTime',
        'IndexUniqueCarrier',
    ],
    outputCol='features',
)

# Keep only what the model needs: ArrDelay (renamed to 'label') and the
# assembled feature vector.
bd7 = a1.transform(bd6).select(col("ArrDelay").alias("label"), 'features')

### Partición Test - Train

In [39]:
# Random 70/30 split of bd7 into training and test partitions (fixed seed).
bd_train, bd_test = bd7.randomSplit([0.7, 0.3], seed=123)

# Report the size of each partition, one count per line.
print(bd_train.count(), bd_test.count(), sep="\n")

21334
9132


In [40]:
from pyspark.ml.regression import DecisionTreeRegressor as DTR

# Regression tree capped at depth 5.
rt = DTR(maxDepth=5)

# Fit on the training partition only.
model = rt.fit(bd_train)
# NOTE: predictions are produced for bd7, i.e. every row (training and test
# together), so any metric computed on `pred` is not an out-of-sample estimate.
pred = model.transform(bd7)

In [41]:
# Preview the first rows: label, feature vector and the tree's prediction.
pred.show()

+-----+--------------------+-------------------+
|label|            features|         prediction|
+-----+--------------------+-------------------+
| -7.0|[-5.0,1235.0,4.0,...| -8.462875769648678|
| -3.0|[5.0,1235.0,5.0,8...| -4.140765233821446|
| -3.0|[-3.0,1235.0,6.0,...| -8.462875769648678|
| -2.0|[-7.0,1235.0,7.0,...|-12.166792168674698|
| -2.0|[-6.0,1235.0,1.0,...|-12.166792168674698|
|  0.0|[-1.0,1235.0,2.0,...| -4.140765233821446|
| -6.0|[0.0,1235.0,3.0,8...| -4.140765233821446|
|  7.0|[0.0,1235.0,4.0,8...| -4.140765233821446|
| -9.0|[-1.0,1235.0,5.0,...| -4.140765233821446|
| -2.0|[-1.0,1235.0,6.0,...| -4.140765233821446|
| 13.0|[1.0,1235.0,7.0,8...| -4.140765233821446|
|-16.0|[-2.0,1235.0,1.0,...| -8.462875769648678|
|-21.0|[-4.0,1235.0,2.0,...| -8.462875769648678|
|-16.0|[-1.0,1235.0,3.0,...| -4.140765233821446|
|-11.0|[0.0,1235.0,4.0,8...| -4.140765233821446|
| -8.0|[0.0,1235.0,5.0,8...| -4.140765233821446|
| -4.0|[13.0,1235.0,6.0,...|  9.349387755102041|
|  0.0|[17.0,1235.0,

In [42]:
# Count how many rows received each distinct predicted value (up to 50 rows shown).
pred.groupBy('prediction').count().show(50)


+-------------------+-----+
|         prediction|count|
+-------------------+-----+
|  75.76502732240438|  275|
|              118.0|    8|
| -4.140765233821446| 6016|
|  88.06382978723404|   66|
| 112.08609271523179|  208|
| 1.1689497716894977| 1266|
|  270.8888888888889|   14|
|  32.38095238095238|  260|
|  68.85744234800839|  714|
| 34.535545023696685|  620|
| 354.55555555555554|   35|
| 100.23643410852713|  359|
|  148.2941176470588|   23|
|  188.1904761904762|  208|
|  14.88088088088088| 1449|
|  99.46666666666667|   59|
| 109.22222222222223|   38|
|   48.2593984962406|  800|
| 212.28961748633878|  240|
| 24.178368121442126|  772|
|  234.5977653631285|  252|
| 22.905405405405407|  408|
|  9.349387755102041| 1725|
|  56.70224719101124|  241|
| 179.04794520547946|  208|
|              164.0|    9|
|  3.354341736694678| 2076|
| 42.541666666666664|  291|
| 106.39495798319328|  164|
| -8.462875769648678| 7864|
|-12.166792168674698| 3791|
|  542.2857142857143|    7|
+-------------------

In [43]:
from pyspark.ml.evaluation import RegressionEvaluator
# R² of the predictions held in `pred` (evaluator created with metricName="r2"
# and no explicit column names).
print(RegressionEvaluator(metricName="r2").evaluate(pred))

0.7229091634348275


## Ajuste de hiperparámetros

In [44]:
# Reference only (not executed): DecisionTreeRegressor constructor parameters
# that can be tuned. The values shown look like the library defaults —
# TODO confirm against the PySpark documentation for the version in use.
# DecisionTreeRegressor(featuresCol="features", 
#    labelCol="label", 
#    predictionCol="prediction", 
#    maxDepth=5, 
#    maxBins=32, 
#    minInstancesPerNode=1, 
#    minInfoGain=0.0, 
#    maxMemoryInMB=256, 
#    impurity="variance")

In [45]:
# Deeper tree: depth up to 20, a minimum of 10 instances per node and up to
# 50 bins, refitted on the training partition.
rt = DTR(
    maxDepth=20,
    minInstancesPerNode=10,
    maxBins=50,
)
model = rt.fit(bd_train)

# R² is measured on bd7, which contains both the training and the test rows.
pred = model.transform(bd7)
print(RegressionEvaluator(metricName="r2").evaluate(pred))

0.783032295751161


### Validación externa

In [46]:
# Score only the held-out test partition with the current `model` and report
# its R².
pred2 = model.transform(bd_test)
print(RegressionEvaluator(metricName="r2").evaluate(pred2))

0.7221506330431235
