# Reducción de la dimensionalidad

In [1]:
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
bd5 = sqlContext.read.format(
    "com.databricks.spark.csv"
).option("header", "true").load("bd5.csv", inferSchema=True)
sqlContext.registerDataFrameAsTable(bd5, "bd5")

## Reducción de dimensionalidad: PCA

In [3]:
bd5.dtypes

[('Year', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('CRSDepTime', 'int'),
 ('UniqueCarrier', 'string'),
 ('TailNum', 'string'),
 ('ArrDelay', 'double'),
 ('DepDelay', 'double'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('Distance', 'double'),
 ('Cancelled', 'double'),
 ('Diverted', 'double'),
 ('CarrierDelay', 'double'),
 ('WeatherDelay', 'double'),
 ('NASDelay', 'double'),
 ('SecurityDelay', 'double'),
 ('LateAircraftDelay', 'double'),
 ('LogD', 'double'),
 ('Retraso', 'int'),
 ('RetrasoNeto', 'double'),
 ('Horario', 'int')]

In [4]:
from pyspark.ml.feature import VectorAssembler

a1  = VectorAssembler(
    inputCols=['DepDelay','Distance','DayOfWeek',
               'CRSDepTime','Horario','LogD'],
    outputCol='features')

bd6 = a1.transform(bd5)

In [5]:
bd6

DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, CRSDepTime: int, UniqueCarrier: string, TailNum: string, ArrDelay: double, DepDelay: double, Origin: string, Dest: string, Distance: double, Cancelled: double, Diverted: double, CarrierDelay: double, WeatherDelay: double, NASDelay: double, SecurityDelay: double, LateAircraftDelay: double, LogD: double, Retraso: int, RetrasoNeto: double, Horario: int, features: vector]

## PCA sin estandarización

In [6]:
from pyspark.ml.feature import PCA

#2 componentes
pca=PCA(k=2,inputCol='features',outputCol='pca_features')

In [7]:
model=pca.fit(bd6)
bd6pca=model.transform(bd6)
bd6pca.select('features','pca_features').show()

+--------------------+--------------------+
|            features|        pca_features|
+--------------------+--------------------+
|[-5.0,1235.0,4.0,...|[778.181012013560...|
|[5.0,1235.0,5.0,8...|[778.140019681721...|
|[-3.0,1235.0,6.0,...|[778.172787156525...|
|[-7.0,1235.0,7.0,...|[778.189163563186...|
|[-6.0,1235.0,1.0,...|[778.185153765040...|
|[-1.0,1235.0,2.0,...|[778.164650268380...|
|[0.0,1235.0,3.0,8...|[778.160537839863...|
|[0.0,1235.0,4.0,8...|[778.160523178381...|
|[-1.0,1235.0,5.0,...|[778.164606283936...|
|[-1.0,1235.0,6.0,...|[778.164591622454...|
|[1.0,1235.0,7.0,8...|[778.156381426901...|
|[-2.0,1235.0,1.0,...|[778.168762696897...|
|[-4.0,1235.0,2.0,...|[778.176943569487...|
|[-1.0,1235.0,3.0,...|[778.164635606899...|
|[0.0,1235.0,4.0,8...|[778.160523178381...|
|[0.0,1235.0,5.0,8...|[778.160508516900...|
|[13.0,1235.0,6.0,...|[778.107222883954...|
|[17.0,1235.0,7.0,...|[778.090817154330...|
|[12.0,1235.0,1.0,...|[778.111393958398...|
|[19.0,1235.0,2.0,...|[778.08269

In [8]:
#Extraemos la primera y segunda componentes

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

p1=udf(lambda v:float(v[0]),FloatType())
p2=udf(lambda v:float(v[1]),FloatType())

bd6pca=bd6pca.withColumn('pca1',p1('pca_features')).withColumn('pca2',p2('pca_features'))


In [9]:
bd6pca.select('pca1','pca2').describe().show()

+-------+------------------+-------------------+
|summary|              pca1|               pca2|
+-------+------------------+-------------------+
|  count|             30466|              30466|
|   mean|300.93254488836135|-1601.4748862329209|
| stddev| 557.2501211351527|  495.7211976411699|
|    min|        -731.57623|         -2935.9211|
|    max|          1751.152|         -487.57773|
+-------+------------------+-------------------+



## PCA con estandarización

In [10]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)
scalerModel = scaler.fit(bd6)
bd6std = scalerModel.transform(bd6)

bd6std.select('features','scaledFeatures').show()

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[-5.0,1235.0,4.0,...|[-0.4459454808573...|
|[5.0,1235.0,5.0,8...|[-0.2452533483159...|
|[-3.0,1235.0,6.0,...|[-0.4058070543490...|
|[-7.0,1235.0,7.0,...|[-0.4860839073656...|
|[-6.0,1235.0,1.0,...|[-0.4660146941114...|
|[-1.0,1235.0,2.0,...|[-0.3656686278407...|
|[0.0,1235.0,3.0,8...|[-0.3455994145866...|
|[0.0,1235.0,4.0,8...|[-0.3455994145866...|
|[-1.0,1235.0,5.0,...|[-0.3656686278407...|
|[-1.0,1235.0,6.0,...|[-0.3656686278407...|
|[1.0,1235.0,7.0,8...|[-0.3255302013325...|
|[-2.0,1235.0,1.0,...|[-0.3857378410949...|
|[-4.0,1235.0,2.0,...|[-0.4258762676032...|
|[-1.0,1235.0,3.0,...|[-0.3656686278407...|
|[0.0,1235.0,4.0,8...|[-0.3455994145866...|
|[0.0,1235.0,5.0,8...|[-0.3455994145866...|
|[13.0,1235.0,6.0,...|[-0.0846996422828...|
|[17.0,1235.0,7.0,...|[-0.0044227892663...|
|[12.0,1235.0,1.0,...|[-0.1047688555370...|
|[19.0,1235.0,2.0,...|[0.0357156

In [11]:
from pyspark.ml.feature import PCA

#2 componentes
pca2=PCA(k=2,inputCol='scaledFeatures',outputCol='pca_scaledfeatures')

In [16]:
model2=pca2.fit(bd6std)
bd6pca2=model2.transform(bd6std)
bd6pca2.select('pca_scaledfeatures').show()


+--------------------+
|  pca_scaledfeatures|
+--------------------+
|[1.25667411910508...|
|[1.23672540371479...|
|[1.24059964059000...|
|[1.23917988807288...|
|[1.27813884218153...|
|[1.26480761353166...|
|[1.25677037427412...|
|[1.25005663236466...|
|[1.24466638780329...|
|[1.23795264589383...|
|[1.22859190928821...|
|[1.27284485278920...|
|[1.26877810557591...|
|[1.25809387162220...|
|[1.25005663236466...|
|[1.24334289045521...|
|[1.21942368302067...|
|[1.20741595171888...|
|[1.25431588991603...|
|[1.23833766656999...|
+--------------------+
only showing top 20 rows



In [13]:
bd6pca2=bd6pca2.withColumn('pca1',p1('pca_scaledfeatures')).withColumn('pca2',p2('pca_scaledfeatures')) 
bd6pca2.select('pca1','pca2').describe().toPandas()

Unnamed: 0,summary,pca1,pca2
0,count,30466.0,30466.0
1,mean,1.4992589980211242e-10,-2.3862638117704308e-11
2,stddev,1.4247512515634884,1.2476208654224727
3,min,-3.9466622,-9.316214
4,max,3.2699375,2.831249


Nota: Las componentes obtenidas también se pueden estandarizar de nuevo.

In [14]:
pdf6 = bd6pca2.sample(False,0.1).select('DepDelay','Distance','DayOfWeek',
               'CRSDepTime','Horario','LogD','pca1','pca2').toPandas()

pdf6.corr()

Unnamed: 0,DepDelay,Distance,DayOfWeek,CRSDepTime,Horario,LogD,pca1,pca2
DepDelay,1.0,-0.000737,0.000761,0.124187,0.120019,-0.005971,-0.094481,-0.313865
Distance,-0.000737,1.0,-0.02989,-0.078393,-0.049484,0.957456,0.926898,-0.359012
DayOfWeek,0.000761,-0.02989,1.0,-0.010417,0.016774,-0.035493,-0.040535,-0.010466
CRSDepTime,0.124187,-0.078393,-0.010417,1.0,0.579708,-0.088783,-0.385502,-0.777299
Horario,0.120019,-0.049484,0.016774,0.579708,1.0,-0.046617,-0.349004,-0.792548
LogD,-0.005971,0.957456,-0.035493,-0.088783,-0.046617,1.0,0.928783,-0.353973
pca1,-0.094481,0.926898,-0.040535,-0.385502,-0.349004,0.928783,1.0,-0.015296
pca2,-0.313865,-0.359012,-0.010466,-0.777299,-0.792548,-0.353973,-0.015296,1.0
