# Modelo de Regresión Múltiple (2)

In [1]:
# Spark bootstrap, intentionally left commented out.
# NOTE(review): the next cell uses `sqlContext`, which no visible cell defines;
# presumably the kernel (e.g. a PySpark shell/profile) provides it. Uncomment
# the lines below when running on a plain Python kernel -- TODO confirm.
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
# Read bd5.csv through the spark-csv data source, treating the first row as
# the header and with schema inference enabled.
bd5 = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load("bd5.csv", inferSchema=True)
)

# Expose the frame to SQL queries under the table name "bd5".
sqlContext.registerDataFrameAsTable(bd5, "bd5")

In [3]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# One boolean indicator column per Horario value (1, 2 and 3).
bd5 = (
    bd5
    .withColumn('Horario1', col('Horario') == 1)
    .withColumn('Horario2', col('Horario') == 2)
    .withColumn('Horario3', col('Horario') == 3)
)

# Second-order terms added manually: DepDelay squared and the
# DepDelay x Distance product.
bd5 = (
    bd5
    .withColumn('DepDelay2', col('DepDelay') ** 2)
    .withColumn('DepD_Distance', col('DepDelay') * col('Distance'))
)

# Pack every predictor into a single vector column named 'features'.
a1 = VectorAssembler(
    inputCols=[
        'DepDelay', 'Distance', 'DayOfWeek', 'CRSDepTime',
        'Horario1', 'Horario2', 'Horario3',
        'DepDelay2', 'DepD_Distance',
    ],
    outputCol='features',
)

# Keep only what the regressions consume: ArrDelay (renamed to "label")
# and the assembled feature vector.
bd6 = a1.transform(bd5).select(
    col("ArrDelay").alias("label"),
    col("features"),
)

In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Plain linear regression with the library defaults; the column names are
# spelled out to show they match the "features"/"label" columns of bd6.
lr = LinearRegression(featuresCol="features", labelCol="label")

# Fit on bd6 and score the same rows (in-sample predictions).
model = lr.fit(bd6)
pred = model.transform(bd6)



In [5]:
# Fitted intercept followed by the coefficient vector.
print(model.intercept, model.coefficients)

# R2 of the predictions held in `pred`.
print(
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    ).evaluate(pred)
)

-4.635889634541887 [1.05553557286,-0.00289488941365,0.135668773288,0.00144922041931,-0.242298226852,1.62235576514,0.159579377181,-1.79458480022e-05,-2.10428631003e-05]
0.9178062697891254


## Selección de variables y Regularización

$\sum{V(Y_i,f(X_i))} +\lambda C(f)$

En regresión lineal: 

$ f(X_i) = \beta_0 + \beta_1 X_{1i} + ... + \beta_p X_{pi}$

$\sum_i{(Y_i - f(X_i))^2} + \lambda \sum_j{|\beta_j|^\alpha}$


regParam=0, sin penalización (OLS).

elasticNetParam = 0, penalización L2 (Ridge). 

elasticNetParam = 1, penalización L1 (Lasso).

### Ridge Regression

In [6]:
# Ridge fit: elasticNetParam=0.0 selects the pure L2 penalty, weighted by
# regParam, and the "normal" solver is requested explicitly.
lr = LinearRegression(
    solver="normal",
    elasticNetParam=0.0,
    regParam=5.0,
    maxIter=5,
)

# Fit on bd6 and score the same rows (in-sample predictions).
model = lr.fit(bd6)
pred = model.transform(bd6)



In [7]:
# Fitted intercept followed by the coefficient vector.
print(model.intercept, model.coefficients)

# R2 of the predictions held in `pred`.
print(
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    ).evaluate(pred)
)


-1.6293446782818521 [0.779667508841,-0.0053345412431,0.24260292539,0.00231370712626,-1.16255761132,1.03064452132,0.0555277791927,6.76631283342e-05,0.000140493731922]
0.903221414577995


In [8]:
# Show the (name, type) pair Spark reports for the assembled feature column.
bd6[['features']].dtypes

[('features', 'vector')]

### Lasso Regression

In [9]:
# Lasso fit: elasticNetParam=1.0 selects the pure L1 penalty, weighted by
# regParam; solver choice is left to the library ("auto").
lr = LinearRegression(
    solver="auto",
    elasticNetParam=1.0,
    regParam=0.7,
    maxIter=5,
)

# Fit on bd6 and score the same rows (in-sample predictions).
model = lr.fit(bd6)
pred = model.transform(bd6)



In [10]:
# Fitted intercept followed by the coefficient vector.
print(model.intercept, model.coefficients)

# R2 of the predictions held in `pred`.
print(
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    ).evaluate(pred)
)

3.2946644725630456 [0.869003269943,-0.00710768434873,0.0,0.000116522843855,0.0,0.0,0.0,-1.67295913798e-05,9.73290986538e-05]
0.909270946443443


### ¿La estandarización de las variables es necesaria?

In [11]:
# L1-penalised fit with feature standardization switched on explicitly.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    standardization=True,
    solver="auto",
    elasticNetParam=1.0,
    regParam=1.0,
    maxIter=5,
)

# Fit on bd6 and score the same rows (in-sample predictions).
model = lr.fit(bd6)
pred = model.transform(bd6)

# Report the intercept and coefficients, then the R2 of `pred`.
print(model.intercept, model.coefficients)
print(
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    ).evaluate(pred)
)



2.8780392155255115 (9,[0,1,8],[0.873087274085,-0.00659023376409,9.56286564526e-05])
0.9103469157085913


In [12]:
# L1-penalised fit with feature standardization switched off, so the penalty
# is applied to the features as they are stored in bd6.
lr2 = LinearRegression(
    featuresCol="features",
    labelCol="label",
    standardization=False,
    solver="auto",
    elasticNetParam=1.0,
    regParam=0.7,
    maxIter=5,
)

# Fit on bd6 and score the same rows (in-sample predictions).
model = lr2.fit(bd6)
pred = model.transform(bd6)

# Report the intercept and coefficients, then the R2 of `pred`.
print(model.intercept, model.coefficients)
print(
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="r2",
    ).evaluate(pred)
)



1.8246622701084778 [0.867109334427,-0.00721589949825,0.0,0.00126658111863,0.0,0.0,0.0,-7.62140637153e-05,0.000111806161815]
0.9067726238943679


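__Estandarizar las variables__ hace que la penalización actúe sobre coeficientes de magnitudes comparables, con independencia de las unidades de cada variable. Sin la estandarización, la penalización se basa en los coeficientes brutos, por lo que su efecto depende de la escala de cada variable.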