## Geração e Extração de características: PCA
- Alta dimensionalidade:
    - Menor capacidade de generalização

- PCA: Redução de Dimensionalidade
- Cria Atributos sintéticos, sem compreensão funcional
- Estes novos atributos buscam manter as características importantes dos dados
- Representação dos atributos originais: projeção
- Não permite avaliar a importância dos atributos originais, e os novos atributos não mais representam o negócio analisado

In [3]:
from pyspark.sql import SparkSession

from pyspark.ml.feature import PCA, VectorAssembler

# Create the Spark session for this notebook (or reuse one that is already
# running), registered under the application name 'PCA'.
spark = SparkSession.builder.appName('PCA').getOrCreate()

# Leave the session as the last expression so the notebook displays it.
spark

In [4]:
# Load the cars dataset: a ';'-delimited CSV whose first line is the header,
# letting Spark infer the column types.
# NOTE(review): the preview below shows inconsistent magnitudes within the same
# column (e.g. Peso 262 vs 2875), which suggests decimal separators may have
# been lost in the file -- confirm against the raw data.
df_cars = spark.read.load(
    'data/Carros.csv',
    format='csv',
    header=True,
    inferSchema=True,
    delimiter=';',
)

# Preview the first five rows.
df_cars.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [5]:
# Keep the column names, in DataFrame order, for building the feature list later.
cols = list(df_cars.columns)
cols

['Consumo',
 'Cilindros',
 'Cilindradas',
 'RelEixoTraseiro',
 'Peso',
 'Tempo',
 'TipoMotor',
 'Transmissao',
 'Marchas',
 'Carburadors',
 'HP']

In [7]:
# Pack every column except the last one in `cols` into a single vector column
# named 'features'; the original columns are kept alongside it.
df_cars_assembler = (
    VectorAssembler(inputCols=cols[:-1], outputCol='features')
    .transform(df_cars)
)

# Show the assembled vectors in full (no truncation) for the first five rows.
df_cars_assembler.select(['features']).show(n=5, truncate=False)

+-----------------------------------------------------+
|features                                             |
+-----------------------------------------------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |
+-----------------------------------------------------+
only showing top 5 rows



In [10]:
# Configure PCA to project the assembled 'features' vector onto 3 principal
# components, written to the 'features_pca' column.
# NOTE(review): the assembled features sit on very different scales (0/1 flags
# next to values in the thousands, per the preview above) and are not
# standardized before fitting -- confirm this is intended.
pca = PCA(k=3, inputCol='features', outputCol='features_pca')

# Fit the PCA model on the assembled DataFrame.
model = pca.fit(df_cars_assembler)

In [13]:
# Apply the fitted PCA model; this adds the 'features_pca' column to each row.
result = model.transform(df_cars_assembler)

# Compare the original vectors with their projection for the first ten rows.
result.select(['features', 'features_pca']).show(n=10, truncate=False)

+-----------------------------------------------------+-----------------------------------------------------------+
|features                                             |features_pca                                               |
+-----------------------------------------------------+-----------------------------------------------------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |[618.7707206779613,-937.712394997354,1231.963352994551]    |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |[3112.9887675342197,-161.05746385491523,1191.8619913054383]|
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |[640.4959007710695,-1120.718886511042,1320.0756315189049]  |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|[3466.0956877556673,-149.69421418298353,1401.204178036853] |
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |[661.4577445758732,-812.4592128844115,1395.2949328316356]  |
|[181.0,6.0,225.0,276.0,346.0,2022.0,1.0,0.0,3.0,1.0] |[769.234367178773