# Importação de dados

### Aula 01 Importações

In [1]:
from pyspark.ml.feature import (
    VectorAssembler, 
    PCA,
    Binarizer,
    StringIndexer,
    IndexToString,
    OneHotEncoder,
    Imputer,
    PolynomialExpansion
)

from pyspark.sql import SparkSession, functions as F

# Start a SparkSession named 'machine-learning-pyspark-mllib', or reuse the
# one already active in this process (that is what getOrCreate does).
spark = SparkSession.builder.appName('machine-learning-pyspark-mllib').getOrCreate()
# Bare last expression so the notebook renders the session summary.
spark

22/05/25 11:29:21 WARN Utils: Your hostname, andre-UBUNTU20-04 resolves to a loopback address: 127.0.1.1; using 192.168.0.136 instead (on interface wlp2s0)
22/05/25 11:29:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/25 11:29:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/25 11:29:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Enable eager evaluation so a DataFrame left as the last expression of a
# cell is rendered as a table without an explicit .show().
spark.conf.set(key='spark.sql.repl.eagerEval.enabled', value=True)

#### Importando CSV, Parquet, JSON e ORC

In [3]:
# CSV: read with an explicit DDL schema, so column names and types come from
# this string instead of being taken from the file.
schema = """
    id INT,
    nome STRING,
    status STRING,
    cidade STRING,
    vendas INT,
    data DATE 
"""
df_despachantes = spark.read.schema(schema).format('csv').load('data/despachantes.csv')
df_despachantes.show()


+---+-------------------+------+-------------+------+----------+
| id|               nome|status|       cidade|vendas|      data|
+---+-------------------+------+-------------+------+----------+
|  1|   Carminda Pestana| Ativo|  Santa Maria|    23|2020-08-11|
|  2|    Deolinda Vilela| Ativo|Novo Hamburgo|    34|2020-03-05|
|  3|   Emídio Dornelles| Ativo| Porto Alegre|    34|2020-02-05|
|  4|Felisbela Dornelles| Ativo| Porto Alegre|    36|2020-02-05|
|  5|     Graça Ornellas| Ativo| Porto Alegre|    12|2020-02-05|
|  6|   Matilde Rebouças| Ativo| Porto Alegre|    22|2019-01-05|
|  7|    Noêmia   Orriça| Ativo|  Santa Maria|    45|2019-10-05|
|  8|      Roque Vásquez| Ativo| Porto Alegre|    65|2020-03-05|
|  9|      Uriel Queiroz| Ativo| Porto Alegre|    54|2018-05-05|
| 10|   Viviana Sequeira| Ativo| Porto Alegre|     0|2020-09-05|
+---+-------------------+------+-------------+------+----------+



                                                                                

In [4]:
# Parquet: the file stores its own schema (column names and types), so no
# schema option is required. The former `inferSchema` option belongs to the
# CSV reader and had no effect here, so it was removed.
df_despachantes = (
    spark.read.format('parquet')
    .load('data/despachantes.parquet')
)
df_despachantes.show()

+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
|  6|   Matilde Rebouças|Ativo| Porto Alegre| 22|2019-01-05|
|  7|    Noêmia   Orriça|Ativo|  Santa Maria| 45|2019-10-05|
|  8|      Roque Vásquez|Ativo| Porto Alegre| 65|2020-03-05|
|  9|      Uriel Queiroz|Ativo| Porto Alegre| 54|2018-05-05|
| 10|   Viviana Sequeira|Ativo| Porto Alegre|  0|2020-09-05|
+---+-------------------+-----+-------------+---+----------+



In [5]:
# JSON: when no schema is supplied the reader always infers one from the
# data. The former `inferSchema` option belongs to the CSV reader and had no
# effect here, so it was removed.
df_despachantes = (
    spark.read.format('json')
    .load('data/despachantes.json')
)
df_despachantes.show()

+-------------+-----------+---+-------------------+------+------+
|       cidade|       data| id|               nome|status|vendas|
+-------------+-----------+---+-------------------+------+------+
|  Santa Maria| 2020-08-11|  1|   Carminda Pestana| Ativo|    23|
|Novo Hamburgo| 2020-03-05|  2|    Deolinda Vilela| Ativo|    34|
| Porto Alegre| 2020-02-05|  3|   Emídio Dornelles| Ativo|    34|
| Porto Alegre| 2020-02-05|  4|Felisbela Dornelles| Ativo|    36|
| Porto Alegre| 2020-02-05|  5|     Graça Ornellas| Ativo|    12|
| Porto Alegre| 2019-01-05|  6|   Matilde Rebouças| Ativo|    22|
|  Santa Maria| 2019-10-05|  7|    Noêmia   Orriça| Ativo|    45|
| Porto Alegre| 2020-03-05|  8|      Roque Vásquez| Ativo|    65|
| Porto Alegre| 2018-05-05|  9|      Uriel Queiroz| Ativo|    54|
| Porto Alegre| 2020-09-05| 10|   Viviana Sequeira| Ativo|     0|
+-------------+-----------+---+-------------------+------+------+



In [6]:
# ORC: like Parquet, the file is self-describing, so the schema is read from
# it. The former `inferSchema` option belongs to the CSV reader and had no
# effect here, so it was removed.
df_despachantes = (
    spark.read.format('orc')
    .load('data/despachantes.orc')
)
df_despachantes.show()

+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
|  6|   Matilde Rebouças|Ativo| Porto Alegre| 22|2019-01-05|
|  7|    Noêmia   Orriça|Ativo|  Santa Maria| 45|2019-10-05|
|  8|      Roque Vásquez|Ativo| Porto Alegre| 65|2020-03-05|
|  9|      Uriel Queiroz|Ativo| Porto Alegre| 54|2018-05-05|
| 10|   Viviana Sequeira|Ativo| Porto Alegre|  0|2020-09-05|
+---+-------------------+-----+-------------+---+----------+



# Engenharia de Atributos e Pré-Processamento

### Aula 03 - Vetorização de Atributos com VectorAssembler
- Recebe como entrada um data frame com diversos atributos
- Produz como saída um único atributo, que é um vetor dos atributos de
entrada

In [7]:
# Load the cars dataset: ';'-delimited CSV with a header row; column types
# are inferred by Spark.
df_carros = (
    spark.read
    .options(inferSchema=True, header=True, delimiter=';')
    .format('csv')
    .load('data/Carros.csv')
)
df_carros.show(n=5, truncate=False)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|HP |
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|21     |6        |160        |39             |262 |1646 |0        |1          |4      |4          |110|
|21     |6        |160        |39             |2875|1702 |0        |1          |4      |4          |110|
|228    |4        |108        |385            |232 |1861 |1        |1          |4      |1          |93 |
|214    |6        |258        |308            |3215|1944 |1        |0          |3      |1          |110|
|187    |8        |360        |315            |344 |1702 |0        |0          |3      |2          |175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [8]:
# Pack every column except the last one (HP) into a single vector column
# named 'features'.
df_carros_vector_assembler = (
    VectorAssembler(inputCols=df_carros.columns[:-1], outputCol='features')
    .transform(df_carros)
)

(
    df_carros_vector_assembler
    .select('features')
    .show(5, truncate=False)
)

+-----------------------------------------------------+
|features                                             |
+-----------------------------------------------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |
+-----------------------------------------------------+
only showing top 5 rows



### Aula 04 - Geração de Características com PCA (Principal Component Analysis)

- Alta dimensionalidade:
    - Menor capacidade de generalização
- PCA: Redução de Dimensionalidade
- Cria atributos sintéticos, sem compreensão funcional
- Estes novos atributos buscam manter as características importantes dos dados
- Representação dos atributos originais: projeção
- Não permite avaliar a importância dos atributos, pois os novos atributos não representam mais o negócio
analisado



In [9]:
# Fit a PCA on the assembled 'features' vector, keeping k=3 components, and
# write the projection to 'features_pca'.
pca_estimator_args = dict(k=3, inputCol='features', outputCol='features_pca')
pca_modelo = PCA(**pca_estimator_args).fit(df_carros_vector_assembler)

df_carros_pca = pca_modelo.transform(df_carros_vector_assembler)
(
    df_carros_pca
    .select('features', 'features_pca')
    .show(5, truncate=False)
)

22/05/25 11:29:45 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/05/25 11:29:45 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


+-----------------------------------------------------+-----------------------------------------------------------+
|features                                             |features_pca                                               |
+-----------------------------------------------------+-----------------------------------------------------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |[618.7707206779613,-937.712394997354,1231.963352994551]    |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |[3112.9887675342197,-161.05746385491523,1191.8619913054383]|
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |[640.4959007710695,-1120.718886511042,1320.0756315189049]  |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|[3466.0956877556673,-149.69421418298353,1401.204178036853] |
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |[661.4577445758732,-812.4592128844115,1395.2949328316356]  |
+-----------------------------------------------------+-----------------

### Aula 05 - Binarização de Atributos
- Parâmetro threshold
- valor \> threshold → 1
- valor ≤ threshold → 0 (um valor igual ao threshold também vira 0)

In [10]:
# Load the iris dataset: CSV with a header row; column types are inferred.
df_iris = spark.read.load(
    'data/iris.csv',
    format='csv',
    inferSchema=True,
    header=True,
)

df_iris.show(5)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [11]:
# Binarize 'sepallength' around 4.9 into a new 0.0/1.0 column. Binarizer is a
# plain transformer, so there is no fit step.
df_iris_binarizer = (
    Binarizer(
        inputCol='sepallength',
        outputCol='sepallength_binarizer',
        threshold=4.9,
    )
    .transform(df_iris)
)

df_iris_binarizer.show(5)

+-----------+----------+-----------+----------+-----------+---------------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|sepallength_binarizer|
+-----------+----------+-----------+----------+-----------+---------------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|                  1.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|                  0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|                  0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|                  0.0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|                  1.0|
+-----------+----------+-----------+----------+-----------+---------------------+
only showing top 5 rows



### Aula 06 - Indexação de Texto com StringIndexer

- Técnica de Categorical Encoding: transforma categorias em números
- Itens mais frequentes recebem os números menores
- Você cria um modelo com um conjunto de dados e pode usar este modelo para transformar outros conjuntos de dados
    - Rótulos desconhecidos encontrados são tratados pelo parâmetro handleInvalid, que pode ter os valores
        - Exceção (default): 'error'
        - Omitir: 'skip'
        - Colocar os "desconhecidos" em uma categoria especial: 'keep'

In [12]:
# Load the churn dataset: ';'-delimited CSV with a header row; column types
# are inferred.
df_churn = spark.read.load(
    'data/Churn.csv',
    format='csv',
    header=True,
    inferSchema=True,
    delimiter=';',
)

df_churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

In [13]:
# Fit a single StringIndexer over both categorical columns; each one gets a
# numeric '<name>_idx' companion column.
model_churn_string_idx = StringIndexer(
    inputCols=['Geography', 'Gender'],
    outputCols=['Geography_idx', 'Gender_idx'],
).fit(df_churn)

df_churn_string_idx = model_churn_string_idx.transform(df_churn)
df_churn_string_idx.select('Geography', 'Geography_idx', 'Gender', 'Gender_idx').show(5)

# Row counts per (Geography, Gender) pair. Left as the last expression so
# the notebook displays the resulting DataFrame.
(
    df_churn_string_idx
    .groupBy('Geography', 'Gender')
    .agg(F.count('Geography_idx'), F.count('Gender_idx'))
)

+---------+-------------+------+----------+
|Geography|Geography_idx|Gender|Gender_idx|
+---------+-------------+------+----------+
|   France|          0.0|Female|       1.0|
|    Spain|          2.0|Female|       1.0|
|   France|          0.0|Female|       1.0|
|   France|          0.0|Female|       1.0|
|    Spain|          2.0|Female|       1.0|
+---------+-------------+------+----------+
only showing top 5 rows



Geography,Gender,count(Geography_idx),count(Gender_idx)
Germany,Female,1193,1193
France,Male,2753,2753
France,Female,2261,2261
Spain,Male,1388,1388
Germany,Male,1316,1316
Spain,Female,1089,1089


### Aula 07 - Índice para Texto com IndexToString
- Às vezes precisamos converter de volta um índice para a categoria
    - Explicar o modelo
    - Mostrar valores “reais”
- IndexToString cria um novo atributo com os valores (texto) da coluna original, a partir do índice

In [14]:
# Map the numeric 'Geography_idx' back to its text label. No explicit labels
# are passed here; presumably they come from the metadata StringIndexer
# attached to the column (confirm against the IndexToString docs).
df_churn_index_to_string = (
    IndexToString(inputCol='Geography_idx', outputCol='Geography_idx_to_str')
    .transform(df_churn_string_idx)
)

(
    df_churn_index_to_string
    .select('Geography', 'Geography_idx', 'Geography_idx_to_str')
    .show()
)

+---------+-------------+--------------------+
|Geography|Geography_idx|Geography_idx_to_str|
+---------+-------------+--------------------+
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|   France|          0.0|              France|
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|    Spain|          2.0|               Spain|
|   France|          0.0|              France|
|  Germany|          1.0|             Germany|
|   France|          0.0|              France|
|   France|          0.0|              France|
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|   France|          0.0|              France|
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|  Germany|          1.0|             Germany|
|  Germany|          1.0|             Germany|
|    Spain|          2.0|               Spain|
|    Spain|  

### Aula 08 - One Hot Encoding
- OneHotEncoder produz, para cada atributo numérico de entrada, um atributo de saída com um vetor esparso (no resultado abaixo, a última categoria aparece como o vetor todo zero)
- Espera atributos numéricos: podemos usar StringIndexer para transformar texto em índices

In [15]:
# One-hot encode the two indexed columns produced by StringIndexer; each
# input column gets its own encoded output column.
model_one_hot_encoding = OneHotEncoder(
    inputCols=['Geography_idx', 'Gender_idx'],
    outputCols=['Geography_oneHotEncoder', 'Gender_oneHotEncoder'],
).fit(df_churn_string_idx)

df_one_hot_encoding = model_one_hot_encoding.transform(df_churn_string_idx)
(
    df_one_hot_encoding
    .select(
        'Geography',
        'Gender',
        'Geography_idx',
        'Gender_idx',
        'Geography_oneHotEncoder',
        'Gender_oneHotEncoder',
    )
    .show(8, truncate=False)
)

+---------+------+-------------+----------+-----------------------+--------------------+
|Geography|Gender|Geography_idx|Gender_idx|Geography_oneHotEncoder|Gender_oneHotEncoder|
+---------+------+-------------+----------+-----------------------+--------------------+
|France   |Female|0.0          |1.0       |(2,[0],[1.0])          |(1,[],[])           |
|Spain    |Female|2.0          |1.0       |(2,[],[])              |(1,[],[])           |
|France   |Female|0.0          |1.0       |(2,[0],[1.0])          |(1,[],[])           |
|France   |Female|0.0          |1.0       |(2,[0],[1.0])          |(1,[],[])           |
|Spain    |Female|2.0          |1.0       |(2,[],[])              |(1,[],[])           |
|Spain    |Male  |2.0          |0.0       |(2,[],[])              |(1,[0],[1.0])       |
|France   |Male  |0.0          |0.0       |(2,[0],[1.0])          |(1,[0],[1.0])       |
|Germany  |Female|1.0          |1.0       |(2,[1],[1.0])          |(1,[],[])           |
+---------+------+---

In [16]:
# +-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+-----------------------+--------------------+
# |CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Geography_idx|Gender_idx|Geography_oneHotEncoder|Gender_oneHotEncoder|
# +-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+-----------------------+--------------------+
# |        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        645|    Spain|  Male| 44|     8|11375578|            2|        1|             0|       14975671|     1|          2.0|       0.0|              (2,[],[])|       (1,[0],[1.0])|
# |        822|   France|  Male| 50|     7|       0|            2|        1|             1|         100628|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        376|  Germany|Female| 29|     4|11504674|            4|        1|             0|       11934688|     1|          1.0|       1.0|          (2,[1],[1.0])|           (1,[],[])|
# |        501|   France|  Male| 44|     4|14205107|            2|        0|             1|         749405|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        684|   France|  Male| 27|     2|13460388|            1|        1|             1|        7172573|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        528|   France|  Male| 31|     6|10201672|            2|        0|             0|        8018112|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        497|    Spain|  Male| 24|     3|       0|            2|        1|             0|        7639001|     0|          2.0|       0.0|              (2,[],[])|       (1,[0],[1.0])|
# |        476|   France|Female| 34|    10|       0|            2|        1|             0|        2626098|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        549|   France|Female| 25|     5|       0|            2|        0|             0|       19085779|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        635|    Spain|Female| 35|     7|       0|            2|        1|             1|        6595165|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        616|  Germany|  Male| 45|     3|14312941|            2|        0|             1|        6432726|     0|          1.0|       0.0|          (2,[1],[1.0])|       (1,[0],[1.0])|
# |        653|  Germany|  Male| 58|     1|13260288|            1|        1|             0|         509767|     1|          1.0|       0.0|          (2,[1],[1.0])|       (1,[0],[1.0])|
# |        549|    Spain|Female| 24|     9|       0|            2|        1|             1|        1440641|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        587|    Spain|  Male| 45|     6|       0|            1|        0|             0|       15868481|     0|          2.0|       0.0|              (2,[],[])|       (1,[0],[1.0])|
# |        726|   France|Female| 24|     6|       0|            2|        1|             1|        5472403|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# +-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+-----------------------+--------------------+

### Aula 09 - Imputer tratando valores ausentes

- Substitui valores ausentes
- Usa uma estratégia de substituição: média, mediana ou moda
- Pode ainda substituir qualquer outro valor (por exemplo, zero) usando o parâmetro missingValue (ou o método setMissingValue)

In [17]:
# Load the cars dataset variant that contains missing values: ';'-delimited
# CSV with a header row; column types are inferred.
df_carros_nan = (
    spark.read
    .options(header=True, inferSchema=True, delimiter=';')
    .format('csv')
    .load('data/CarrosNAN.csv')
)

df_carros_nan.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|       null|             39|2875| null|        0|          1|      4|          4|110|
|    228|        0|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        0|       null|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        0|        360|            315|null| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [21]:
# Fill the missing (null) values of 'Cilindradas' and 'Peso'. No strategy is
# passed, so Imputer's default (mean) applies. Output columns reuse the input
# names, so the original columns are overwritten in the result.
df_carros_nan_imput = (
    Imputer(
        inputCols=['Cilindradas', 'Peso'],
        outputCols=['Cilindradas', 'Peso'],
    )
    .fit(df_carros_nan)
    .transform(df_carros_nan)
)

df_carros_nan_imput.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        848|             39|2875| null|        0|          1|      4|          4|110|
|    228|        0|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        0|        848|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        0|        360|            315|1318| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [19]:
# Treat 0 as the "missing" marker in 'Cilindros' and replace it with the
# column median. The column is overwritten because input and output names
# are the same.
df_carros_0_to_median_imput = (
    Imputer(
        strategy='median',
        missingValue=0,
        inputCol='Cilindros',
        outputCol='Cilindros',
    )
    .fit(df_carros_nan_imput)
    .transform(df_carros_nan_imput)
)

df_carros_0_to_median_imput.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        848|             39|2875| null|        0|          1|      4|          4|110|
|    228|        6|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        848|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        6|        360|            315|1318| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



### Aula 10 - Polynomial Expansion / Expansão de atributos
- Expande um atributo de acordo com o grau, criando novos atributos
- Exemplo, dois atributos x e y expandidos com grau 2:
- x, x * x, y, x * y, y * y
- Entrada deve ser uma coluna com vetor de atributos

In [20]:
# Keep only the three columns that will be expanded.
df_polinomial = df_carros_0_to_median_imput.select('Consumo', 'Cilindros', 'Cilindradas')

# PolynomialExpansion takes a vector column as input, so assemble the three
# columns into 'features' first.
df_polinomial_assembler = (
    VectorAssembler(inputCols=df_polinomial.columns, outputCol='features')
    .transform(df_polinomial)
)

# Degree-2 expansion of 'features' into 'features_polynomial'.
df_polymonial_explansion = (
    PolynomialExpansion(
        inputCol='features',
        outputCol='features_polynomial',
        degree=2,
    )
    .transform(df_polinomial_assembler)
)

(
    df_polymonial_explansion
    .select('features', 'features_polynomial')
    .show(5, truncate=False)
)


+-----------------+--------------------------------------------------------------+
|features         |features_polynomial                                           |
+-----------------+--------------------------------------------------------------+
|[21.0,6.0,160.0] |[21.0,441.0,6.0,126.0,36.0,160.0,3360.0,960.0,25600.0]        |
|[21.0,6.0,848.0] |[21.0,441.0,6.0,126.0,36.0,848.0,17808.0,5088.0,719104.0]     |
|[228.0,6.0,108.0]|[228.0,51984.0,6.0,1368.0,36.0,108.0,24624.0,648.0,11664.0]   |
|[214.0,6.0,848.0]|[214.0,45796.0,6.0,1284.0,36.0,848.0,181472.0,5088.0,719104.0]|
|[187.0,6.0,360.0]|[187.0,34969.0,6.0,1122.0,36.0,360.0,67320.0,2160.0,129600.0] |
+-----------------+--------------------------------------------------------------+
only showing top 5 rows



# Aula 00 ...

# Aula 00 ...

# Aula 00 ...

# Aula 00 ...

# Aula 00 ...