# Importação de dados

### Aula 01 Importações

In [52]:
from pyspark.ml.feature import (
    VectorAssembler,
    PCA,
    Binarizer,
    StringIndexer,
    IndexToString,
    OneHotEncoder,
    Imputer,
    PolynomialExpansion,
    Normalizer,
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    QuantileDiscretizer,
    RFormula,
    VectorSlicer,
    ChiSqSelector,
    UnivariateFeatureSelector
)

from pyspark.sql import SparkSession, functions as F

spark = (
    SparkSession
    .builder
    .appName('machine-learning-pyspark-mllib')
    .getOrCreate()
)
spark


In [24]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)


#### Importando CSV, Parquet, JSON e ORC

In [25]:
# csv
schema = """
    id INT,
    nome STRING,
    status STRING,
    cidade STRING,
    vendas INT,
    data DATE 
"""
df_despachantes = (
    spark.read.format('csv')
    .schema(schema=schema)
    .load('data/despachantes.csv')
)
df_despachantes.show()


+---+-------------------+------+-------------+------+----------+
| id|               nome|status|       cidade|vendas|      data|
+---+-------------------+------+-------------+------+----------+
|  1|   Carminda Pestana| Ativo|  Santa Maria|    23|2020-08-11|
|  2|    Deolinda Vilela| Ativo|Novo Hamburgo|    34|2020-03-05|
|  3|   Emídio Dornelles| Ativo| Porto Alegre|    34|2020-02-05|
|  4|Felisbela Dornelles| Ativo| Porto Alegre|    36|2020-02-05|
|  5|     Graça Ornellas| Ativo| Porto Alegre|    12|2020-02-05|
|  6|   Matilde Rebouças| Ativo| Porto Alegre|    22|2019-01-05|
|  7|    Noêmia   Orriça| Ativo|  Santa Maria|    45|2019-10-05|
|  8|      Roque Vásquez| Ativo| Porto Alegre|    65|2020-03-05|
|  9|      Uriel Queiroz| Ativo| Porto Alegre|    54|2018-05-05|
| 10|   Viviana Sequeira| Ativo| Porto Alegre|     0|2020-09-05|
+---+-------------------+------+-------------+------+----------+



In [26]:
# Parquet
df_despachantes = (
    spark.read.format('parquet')
    .option('inferSchema', True)
    .load('data/despachantes.parquet')
)
df_despachantes.show()


+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
|  6|   Matilde Rebouças|Ativo| Porto Alegre| 22|2019-01-05|
|  7|    Noêmia   Orriça|Ativo|  Santa Maria| 45|2019-10-05|
|  8|      Roque Vásquez|Ativo| Porto Alegre| 65|2020-03-05|
|  9|      Uriel Queiroz|Ativo| Porto Alegre| 54|2018-05-05|
| 10|   Viviana Sequeira|Ativo| Porto Alegre|  0|2020-09-05|
+---+-------------------+-----+-------------+---+----------+



In [27]:
# JSON
df_despachantes = (
    spark.read.format('json')
    .option('inferSchema', True)
    .load('data/despachantes.json')
)
df_despachantes.show()


+-------------+-----------+---+-------------------+------+------+
|       cidade|       data| id|               nome|status|vendas|
+-------------+-----------+---+-------------------+------+------+
|  Santa Maria| 2020-08-11|  1|   Carminda Pestana| Ativo|    23|
|Novo Hamburgo| 2020-03-05|  2|    Deolinda Vilela| Ativo|    34|
| Porto Alegre| 2020-02-05|  3|   Emídio Dornelles| Ativo|    34|
| Porto Alegre| 2020-02-05|  4|Felisbela Dornelles| Ativo|    36|
| Porto Alegre| 2020-02-05|  5|     Graça Ornellas| Ativo|    12|
| Porto Alegre| 2019-01-05|  6|   Matilde Rebouças| Ativo|    22|
|  Santa Maria| 2019-10-05|  7|    Noêmia   Orriça| Ativo|    45|
| Porto Alegre| 2020-03-05|  8|      Roque Vásquez| Ativo|    65|
| Porto Alegre| 2018-05-05|  9|      Uriel Queiroz| Ativo|    54|
| Porto Alegre| 2020-09-05| 10|   Viviana Sequeira| Ativo|     0|
+-------------+-----------+---+-------------------+------+------+



In [28]:
# ORC
df_despachantes = (
    spark.read.format('orc')
    .option('inferSchema', True)
    .load('data/despachantes.orc')
)
df_despachantes.show()


+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
|  6|   Matilde Rebouças|Ativo| Porto Alegre| 22|2019-01-05|
|  7|    Noêmia   Orriça|Ativo|  Santa Maria| 45|2019-10-05|
|  8|      Roque Vásquez|Ativo| Porto Alegre| 65|2020-03-05|
|  9|      Uriel Queiroz|Ativo| Porto Alegre| 54|2018-05-05|
| 10|   Viviana Sequeira|Ativo| Porto Alegre|  0|2020-09-05|
+---+-------------------+-----+-------------+---+----------+



# Engenharia de Atributos e Pré-Processamento

### Aula 03 - Vetorização de Atributos com VectorAssembler
- Recebe como entrada um data frame com diversos atributos
- Produz como saída um único atributo, que é um vetor dos atributos de
entrada

In [29]:
df_carros = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .option('header', True)
    .option('delimiter', ';')
    .load('data/Carros.csv')
)
df_carros.show(5, truncate=False)


+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|HP |
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|21     |6        |160        |39             |262 |1646 |0        |1          |4      |4          |110|
|21     |6        |160        |39             |2875|1702 |0        |1          |4      |4          |110|
|228    |4        |108        |385            |232 |1861 |1        |1          |4      |1          |93 |
|214    |6        |258        |308            |3215|1944 |1        |0          |3      |1          |110|
|187    |8        |360        |315            |344 |1702 |0        |0          |3      |2          |175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [30]:
df_carros_vector_assembler = VectorAssembler(
    inputCols=df_carros.columns[:-1],
    outputCol='features'
).transform(df_carros)

df_carros_vector_assembler.select('features').show(5, truncate=False)


+-----------------------------------------------------+
|features                                             |
+-----------------------------------------------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |
+-----------------------------------------------------+
only showing top 5 rows



###  Aula 04 - Geração de Caracterśticas com PCA (Principal Component analysis)

- Alta dimensionalidade:
    - Menor capacidade de generalização
- PCA: Redução de Dimensionalidade
- Cria atributos sintéticos, sem compreensão funcional
- Estes novos atributos buscam manter as características importantes dos dados
- Representação dos atributos originais: projeção
- Não permite avaliar importância de atributos e não mais representam o negocio
analisado



In [31]:
pca_modelo = PCA(
    k=3,
    inputCol='features',
    outputCol='features_pca'
).fit(df_carros_vector_assembler)

df_carros_pca = pca_modelo.transform(df_carros_vector_assembler)
df_carros_pca.select('features', 'features_pca').show(5, truncate=False)


+-----------------------------------------------------+-----------------------------------------------------------+
|features                                             |features_pca                                               |
+-----------------------------------------------------+-----------------------------------------------------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |[618.7707206779613,-937.712394997354,1231.963352994551]    |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |[3112.9887675342197,-161.05746385491523,1191.8619913054383]|
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |[640.4959007710695,-1120.718886511042,1320.0756315189049]  |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|[3466.0956877556673,-149.69421418298353,1401.204178036853] |
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |[661.4577445758732,-812.4592128844115,1395.2949328316356]  |
+-----------------------------------------------------+-----------------

### Aula 05 - Binarização de Atributos
- Parametro threshold
- \> threshold = 1
- < threshold = 0

In [32]:
df_iris = (
    spark.read.format('csv')
    .option('inferSchema', True)
    .option('header', True)
    .load('data/iris.csv')
)

df_iris.show(5)


+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [33]:
df_iris_binarizer = Binarizer(
    threshold=4.9,
    inputCol='sepallength',
    outputCol='sepallength_binarizer'
).transform(df_iris)

df_iris_binarizer.show(5)


+-----------+----------+-----------+----------+-----------+---------------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|sepallength_binarizer|
+-----------+----------+-----------+----------+-----------+---------------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|                  1.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|                  0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|                  0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|                  0.0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|                  1.0|
+-----------+----------+-----------+----------+-----------+---------------------+
only showing top 5 rows



### Aula 06 - Indexação de Texto com StringIndexer

- Técnica de Categorical Encoding: transforma categorias em números
- Itens mais frequentes recebem os números menores
- Você cria um modelo com um conjunto de dados, e usar este modelo para transformar outros conjuntos de dados
    - Rótulos não conhecidos encontrados são tratados pelo parâmetro handleInvalid, que pode ter os valores
        - Exceção (default): ‘error’
        - Omitir: 'skip'
        - Colocar “desconhecidos” em uma categoria especial: 'keep'

In [34]:
df_churn = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .option('delimiter', ';')
    .load('data/Churn.csv')
)

df_churn.show(5)


+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

In [35]:
model_churn_string_idx = StringIndexer(
    inputCols=['Geography', 'Gender'],
    outputCols=['Geography_idx', 'Gender_idx']
).fit(df_churn)

df_churn_string_idx = model_churn_string_idx.transform(df_churn)
df_churn_string_idx.select('Geography',
                           'Geography_idx',
                           'Gender',
                           'Gender_idx'
                           ).show(5)

df_churn_string_idx.groupBy('Geography', 'Gender').agg(
    F.count('Geography_idx'), F.count('Gender_idx'))


+---------+-------------+------+----------+
|Geography|Geography_idx|Gender|Gender_idx|
+---------+-------------+------+----------+
|   France|          0.0|Female|       1.0|
|    Spain|          2.0|Female|       1.0|
|   France|          0.0|Female|       1.0|
|   France|          0.0|Female|       1.0|
|    Spain|          2.0|Female|       1.0|
+---------+-------------+------+----------+
only showing top 5 rows



Geography,Gender,count(Geography_idx),count(Gender_idx)
Germany,Female,1193,1193
France,Male,2753,2753
France,Female,2261,2261
Spain,Male,1388,1388
Germany,Male,1316,1316
Spain,Female,1089,1089


### Aula 7 - Índice para Texto com IndexToString
- As vezes precisamos converter de volta um
índice para a categoria
    - Explicar o modelo
    - Mostrar valores “reais”
- IndexToString cria um atributo com a coluna original

In [36]:
df_churn_index_to_string = IndexToString(
    inputCol='Geography_idx',
    outputCol='Geography_idx_to_str'
).transform(df_churn_string_idx)

df_churn_index_to_string.select(
    'Geography', 'Geography_idx', 'Geography_idx_to_str').show()


+---------+-------------+--------------------+
|Geography|Geography_idx|Geography_idx_to_str|
+---------+-------------+--------------------+
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|   France|          0.0|              France|
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|    Spain|          2.0|               Spain|
|   France|          0.0|              France|
|  Germany|          1.0|             Germany|
|   France|          0.0|              France|
|   France|          0.0|              France|
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|   France|          0.0|              France|
|   France|          0.0|              France|
|    Spain|          2.0|               Spain|
|  Germany|          1.0|             Germany|
|  Germany|          1.0|             Germany|
|    Spain|          2.0|               Spain|
|    Spain|  

### Aula 08 - One Hot Encoding
- OneHotEncoding produz um único atributo de saída com uma matriz densa, a partir de n atributos numéricos
-  Espera atributos numéricos: Podemos usar StringIndexer para transformar

In [37]:
model_one_hot_encoding = OneHotEncoder(
    inputCols=['Geography_idx', 'Gender_idx'],
    outputCols=['Geography_oneHotEncoder', 'Gender_oneHotEncoder']

).fit(df_churn_string_idx)

df_one_hot_encoding = model_one_hot_encoding.transform(df_churn_string_idx)
df_one_hot_encoding.select('Geography', 'Gender', 'Geography_idx', 'Gender_idx',
                           'Geography_oneHotEncoder', 'Gender_oneHotEncoder').show(8, truncate=False)


+---------+------+-------------+----------+-----------------------+--------------------+
|Geography|Gender|Geography_idx|Gender_idx|Geography_oneHotEncoder|Gender_oneHotEncoder|
+---------+------+-------------+----------+-----------------------+--------------------+
|France   |Female|0.0          |1.0       |(2,[0],[1.0])          |(1,[],[])           |
|Spain    |Female|2.0          |1.0       |(2,[],[])              |(1,[],[])           |
|France   |Female|0.0          |1.0       |(2,[0],[1.0])          |(1,[],[])           |
|France   |Female|0.0          |1.0       |(2,[0],[1.0])          |(1,[],[])           |
|Spain    |Female|2.0          |1.0       |(2,[],[])              |(1,[],[])           |
|Spain    |Male  |2.0          |0.0       |(2,[],[])              |(1,[0],[1.0])       |
|France   |Male  |0.0          |0.0       |(2,[0],[1.0])          |(1,[0],[1.0])       |
|Germany  |Female|1.0          |1.0       |(2,[1],[1.0])          |(1,[],[])           |
+---------+------+---

In [38]:
# +-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+-----------------------+--------------------+
# |CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Geography_idx|Gender_idx|Geography_oneHotEncoder|Gender_oneHotEncoder|
# +-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+-----------------------+--------------------+
# |        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        645|    Spain|  Male| 44|     8|11375578|            2|        1|             0|       14975671|     1|          2.0|       0.0|              (2,[],[])|       (1,[0],[1.0])|
# |        822|   France|  Male| 50|     7|       0|            2|        1|             1|         100628|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        376|  Germany|Female| 29|     4|11504674|            4|        1|             0|       11934688|     1|          1.0|       1.0|          (2,[1],[1.0])|           (1,[],[])|
# |        501|   France|  Male| 44|     4|14205107|            2|        0|             1|         749405|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        684|   France|  Male| 27|     2|13460388|            1|        1|             1|        7172573|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        528|   France|  Male| 31|     6|10201672|            2|        0|             0|        8018112|     0|          0.0|       0.0|          (2,[0],[1.0])|       (1,[0],[1.0])|
# |        497|    Spain|  Male| 24|     3|       0|            2|        1|             0|        7639001|     0|          2.0|       0.0|              (2,[],[])|       (1,[0],[1.0])|
# |        476|   France|Female| 34|    10|       0|            2|        1|             0|        2626098|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        549|   France|Female| 25|     5|       0|            2|        0|             0|       19085779|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# |        635|    Spain|Female| 35|     7|       0|            2|        1|             1|        6595165|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        616|  Germany|  Male| 45|     3|14312941|            2|        0|             1|        6432726|     0|          1.0|       0.0|          (2,[1],[1.0])|       (1,[0],[1.0])|
# |        653|  Germany|  Male| 58|     1|13260288|            1|        1|             0|         509767|     1|          1.0|       0.0|          (2,[1],[1.0])|       (1,[0],[1.0])|
# |        549|    Spain|Female| 24|     9|       0|            2|        1|             1|        1440641|     0|          2.0|       1.0|              (2,[],[])|           (1,[],[])|
# |        587|    Spain|  Male| 45|     6|       0|            1|        0|             0|       15868481|     0|          2.0|       0.0|              (2,[],[])|       (1,[0],[1.0])|
# |        726|   France|Female| 24|     6|       0|            2|        1|             1|        5472403|     0|          0.0|       1.0|          (2,[0],[1.0])|           (1,[],[])|
# +-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+-----------------------+--------------------+


### Aula 09 - Imputer tratando valores ausentes

- Substitui valores ausentes
- Usa uma estratégia de substituição: média, mediana ou moda
- Pode ainda substituir qualquer outro valor (por exemplo, zero) usando parâmetro setMissingValue

In [39]:
df_carros_nan = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .option('delimiter', ';')
    .load('data/CarrosNAN.csv')
)

df_carros_nan.show(5)


+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|       null|             39|2875| null|        0|          1|      4|          4|110|
|    228|        0|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        0|       null|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        0|        360|            315|null| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [40]:
# Substituindo 0 pela média(default)

df_carros_nan_imput = Imputer(
    inputCols=['Cilindradas', 'Peso'],
    outputCols=['Cilindradas', 'Peso']
).fit(df_carros_nan).transform(df_carros_nan)

df_carros_nan_imput.show(5)


+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        848|             39|2875| null|        0|          1|      4|          4|110|
|    228|        0|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        0|        848|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        0|        360|            315|1318| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



In [41]:
# Substituindo 0 pela mediana

df_carros_0_to_median_imput = Imputer(
    inputCol='Cilindros',
    outputCol='Cilindros',
    missingValue=0,
    strategy='median',

).fit(df_carros_nan_imput).transform(df_carros_nan_imput)

df_carros_0_to_median_imput.show(5)


+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        848|             39|2875| null|        0|          1|      4|          4|110|
|    228|        6|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        848|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        6|        360|            315|1318| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



### Aula 10 - Polinomial Expansion / Expansão de atributos
- Expande um atributo de acordo com o grau, criando novos atributos
- Exemplo, dois atributos x e y expandidos com grau 2:
- x, x * x, y, x * y, y * y
- Entrada deve ser uma coluna com vetor de atributos

In [42]:
df_polinomial = (
    df_carros_0_to_median_imput.select('Consumo', 'Cilindros', 'Cilindradas')
)

df_polinomial_assembler = VectorAssembler(
    inputCols=df_polinomial.columns,
    outputCol='features'
).transform(df_polinomial)

df_polymonial_explansion = PolynomialExpansion(
    degree=2,
    inputCol='features',
    outputCol='features_polynomial'
).transform(df_polinomial_assembler)


df_polymonial_explansion.select(
    'features', 'features_polynomial').show(5, truncate=False)


+-----------------+--------------------------------------------------------------+
|features         |features_polynomial                                           |
+-----------------+--------------------------------------------------------------+
|[21.0,6.0,160.0] |[21.0,441.0,6.0,126.0,36.0,160.0,3360.0,960.0,25600.0]        |
|[21.0,6.0,848.0] |[21.0,441.0,6.0,126.0,36.0,848.0,17808.0,5088.0,719104.0]     |
|[228.0,6.0,108.0]|[228.0,51984.0,6.0,1368.0,36.0,108.0,24624.0,648.0,11664.0]   |
|[214.0,6.0,848.0]|[214.0,45796.0,6.0,1284.0,36.0,848.0,181472.0,5088.0,719104.0]|
|[187.0,6.0,360.0]|[187.0,34969.0,6.0,1122.0,36.0,360.0,67320.0,2160.0,129600.0] |
+-----------------+--------------------------------------------------------------+
only showing top 5 rows



### Aula 11 - Normalização de dados com Normalizer
- "Padronizador" de dados
- Parametro p (p-norm) usando para normalização, default 2

In [43]:
df_car_assembler.columns[:3]


NameError: name 'df_car_assembler' is not defined

In [None]:
df_car_assembler = df_carros_vector_assembler.select(
    'Consumo', 'Cilindros', 'Cilindradas')

df_car_assembler = VectorAssembler(
    inputCols=df_car_assembler.columns[:3],
    outputCol='features'
).transform(df_car_assembler)

df_car_assembler.show(5, truncate=False)


+-------+---------+-----------+-----------------+
|Consumo|Cilindros|Cilindradas|features         |
+-------+---------+-----------+-----------------+
|21     |6        |160        |[21.0,6.0,160.0] |
|21     |6        |160        |[21.0,6.0,160.0] |
|228    |4        |108        |[228.0,4.0,108.0]|
|214    |6        |258        |[214.0,6.0,258.0]|
|187    |8        |360        |[187.0,8.0,360.0]|
+-------+---------+-----------+-----------------+
only showing top 5 rows



In [None]:
df_car_normalizer = Normalizer(
    inputCol='features',
    outputCol='features_normalized',
    p=1
).transform(df_car_assembler)

df_car_normalizer.show(5, truncate=False)


+-------+---------+-----------+-----------------+-------------------------------------------------------------+
|Consumo|Cilindros|Cilindradas|features         |features_normalized                                          |
+-------+---------+-----------+-----------------+-------------------------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0] |[0.11229946524064172,0.03208556149732621,0.8556149732620321] |
|21     |6        |160        |[21.0,6.0,160.0] |[0.11229946524064172,0.03208556149732621,0.8556149732620321] |
|228    |4        |108        |[228.0,4.0,108.0]|[0.6705882352941176,0.011764705882352941,0.3176470588235294] |
|214    |6        |258        |[214.0,6.0,258.0]|[0.4476987447698745,0.012552301255230125,0.5397489539748954] |
|187    |8        |360        |[187.0,8.0,360.0]|[0.33693693693693694,0.014414414414414415,0.6486486486486487]|
+-------+---------+-----------+-----------------+-------------------------------------------------------

### Aula 12 - Padronização de Dados com StandardScaler
- Normaliza os atributos para o desvio padrão ou média zero.
- withStd: transforma os dados para o desvio padrão da unidade. Padrão True.
- withMean: Antes de transformar, centraliza os dados pela média. Padrão False.

In [None]:
df_escala = StandardScaler(
    inputCol='features',
    outputCol='features_std',
    withStd=True,
    withMean=False,
).fit(df_car_assembler).transform(df_car_assembler)

df_escala.show(5, truncate=False)


+-------+---------+-----------+-----------------+-----------------------------------------------------------+
|Consumo|Cilindros|Cilindradas|features         |features_std                                               |
+-------+---------+-----------+-----------------+-----------------------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0] |[0.24996122082808128,3.359609874407659,0.20137542427273997]|
|21     |6        |160        |[21.0,6.0,160.0] |[0.24996122082808128,3.359609874407659,0.20137542427273997]|
|228    |4        |108        |[228.0,4.0,108.0]|[2.713864683276311,2.239739916271773,0.13592841138409947]  |
|214    |6        |258        |[214.0,6.0,258.0]|[2.5472238693909235,3.359609874407659,0.32471787163979315] |
|187    |8        |360        |[187.0,8.0,360.0]|[2.2258451568976763,4.479479832543546,0.4530947046136649]  |
+-------+---------+-----------+-----------------+-----------------------------------------------------------+
only showi

### Aula 13 - Padronização de dados com RobustScaler

- Faz a padronização dos dados de acordo com um Quantil
- Melhor quando os dados tem outliers se comparado do StandardScaler (Robust)
- Parâmetros:
    - lower: Quantil inferior usado no cálculo dos intervalos. Padrão 0.25
    - upper: Quantil superior usado no cálculo dos intervalos. Padrão 0.75
    - withScaling: Dimensiona os dados para o quantil. Padrão True
    - withCentering: Centraliza os dados com a mediana antes de transformar. Padrão False


In [None]:
df_escala_robust_scaler = RobustScaler(
    inputCol='features',
    outputCol='features_robust_scaler',
    withScaling=True,
    withCentering=False,
    lower=0.25,
    upper=0.75
).fit(df_car_assembler).transform(df_car_assembler)

df_escala_robust_scaler.show(5, truncate=False)


+-------+---------+-----------+-----------------+---------------------------------------------+
|Consumo|Cilindros|Cilindradas|features         |features_robust_scaler                       |
+-------+---------+-----------+-----------------+---------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0] |[0.29166666666666663,1.5,0.16967126193001061]|
|21     |6        |160        |[21.0,6.0,160.0] |[0.29166666666666663,1.5,0.16967126193001061]|
|228    |4        |108        |[228.0,4.0,108.0]|[3.1666666666666665,1.0,0.11452810180275717] |
|214    |6        |258        |[214.0,6.0,258.0]|[2.972222222222222,1.5,0.27359490986214213]  |
|187    |8        |360        |[187.0,8.0,360.0]|[2.597222222222222,2.0,0.3817603393425239]   |
+-------+---------+-----------+-----------------+---------------------------------------------+
only showing top 5 rows



### Aula 14 - Padronização de Dados com MinMaxScaler

- Transforma os dados para os limites de um intervalo. Normalmente entre zero e 1.
- Parâmetros:
    - Min: limite inferior. Padrão é zero
    - Max: limite superior: Padrão é 1

In [None]:
df_min_max_scaler = MinMaxScaler(
    inputCol='features',
    outputCol='MinMaxScaler',
    min=0.0,
    max=1.0
).fit(df_car_assembler).transform(df_car_assembler)

df_min_max_scaler.show(5, truncate=False)


+-------+---------+-----------+-----------------+-----------------------------------------------+
|Consumo|Cilindros|Cilindradas|features         |MinMaxScaler                                   |
+-------+---------+-----------+-----------------+-----------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0] |[0.018518518518518517,0.5,0.030235162374020158]|
|21     |6        |160        |[21.0,6.0,160.0] |[0.018518518518518517,0.5,0.030235162374020158]|
|228    |4        |108        |[228.0,4.0,108.0]|[0.6574074074074073,0.0,0.010824934677118328]  |
|214    |6        |258        |[214.0,6.0,258.0]|[0.6141975308641975,0.5,0.06681597611048899]   |
|187    |8        |360        |[187.0,8.0,360.0]|[0.5308641975308641,1.0,0.10488988428518103]   |
+-------+---------+-----------+-----------------+-----------------------------------------------+
only showing top 5 rows



### Aula 15 - Padronização de Dados com MaxAbsScaler

- Padroniza os dados entre -1 e 1
- Não centraliza os dados de nenhuma forma, então dados não perdem suas características de dispersão

In [None]:
df_MaxAbsScaler = MaxAbsScaler(
    inputCol='features',
    outputCol='features_MaxAbsScaler'
).fit(df_car_assembler).transform(df_car_assembler)

df_MaxAbsScaler.show(5, truncate=False)


+-------+---------+-----------+-----------------+-----------------------------------------------+
|Consumo|Cilindros|Cilindradas|features         |features_MaxAbsScaler                          |
+-------+---------+-----------+-----------------+-----------------------------------------------+
|21     |6        |160        |[21.0,6.0,160.0] |[0.061946902654867256,0.75,0.05801305293691081]|
|21     |6        |160        |[21.0,6.0,160.0] |[0.061946902654867256,0.75,0.05801305293691081]|
|228    |4        |108        |[228.0,4.0,108.0]|[0.6725663716814159,0.5,0.0391588107324148]    |
|214    |6        |258        |[214.0,6.0,258.0]|[0.6312684365781711,0.75,0.09354604786076867]  |
|187    |8        |360        |[187.0,8.0,360.0]|[0.551622418879056,1.0,0.13052936910804933]    |
+-------+---------+-----------+-----------------+-----------------------------------------------+
only showing top 5 rows



### Aula 16 - QuantileDiscretizer

- Discretrização é a transformação de dados contínuos em discretos
- Parâmetro:
    - buckets: quantos valores discretos serão usados (não necessariamente utiliza todos)

In [None]:
df_churn.show(5)


+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

In [None]:
# Discretizando a coluna Tenure

df_churn_QuantileDiscretizer = QuantileDiscretizer(
    numBuckets=4,
    inputCol='Tenure',
    outputCol='Tenure_QuantileDiscretizer'
).fit(df_churn).transform(df_churn)

(
    df_churn_QuantileDiscretizer
    .select('Tenure', 'Tenure_QuantileDiscretizer')
    .show(20, truncate=False)
)


+------+--------------------------+
|Tenure|Tenure_QuantileDiscretizer|
+------+--------------------------+
|2     |1.0                       |
|1     |0.0                       |
|8     |3.0                       |
|1     |0.0                       |
|2     |1.0                       |
|8     |3.0                       |
|7     |3.0                       |
|4     |1.0                       |
|4     |1.0                       |
|2     |1.0                       |
|6     |2.0                       |
|3     |1.0                       |
|10    |3.0                       |
|5     |2.0                       |
|7     |3.0                       |
|3     |1.0                       |
|1     |0.0                       |
|9     |3.0                       |
|6     |2.0                       |
|6     |2.0                       |
+------+--------------------------+
only showing top 20 rows



### Aula 17 - Transformação com RFormula

- Linguagem R permite definir modelo através de fórmula
- [variável dependente] ~ [variável independentes]
- Variáveis Independentes podem ser definidas através de +
- Ponto define todas os atributos – variável dependente
- Spark implemente Rformula
- Combina variáveis independentes em uma única coluna
##### Ex:
HP ~ Consumo + Cilindros+ Cilindradas<br>
HP ~ .

<br>Spark implementa RFormula
- Colunas numéricas serão transformadas em double
- Strings serão transformadas com StringIndexer, e a última categoria é excluída e então aplica One HotEncoding

In [None]:
df_carros.show(3)


+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 3 rows



In [None]:
# Consumo+Cilindros+Cilindrada

df_RFormula = RFormula(
    formula='HP ~ Consumo + Cilindros + Cilindradas',
    featuresCol='features',
    labelCol='label'
).fit(df_carros).transform(df_carros)

df_RFormula.show(5, truncate=False)


+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+-----------------+-----+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors|HP |features         |label|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+-----------------+-----+
|21     |6        |160        |39             |262 |1646 |0        |1          |4      |4          |110|[21.0,6.0,160.0] |110.0|
|21     |6        |160        |39             |2875|1702 |0        |1          |4      |4          |110|[21.0,6.0,160.0] |110.0|
|228    |4        |108        |385            |232 |1861 |1        |1          |4      |1          |93 |[228.0,4.0,108.0]|93.0 |
|214    |6        |258        |308            |3215|1944 |1        |0          |3      |1          |110|[214.0,6.0,258.0]|110.0|
|187    |8        |360        |315            |344 |1702 |0        |0          |3      |2        

### Aula 18 - Divisor de Vetores com VectorSlicer

- Recebe uma coluna com um vetor de atributos
- Cria uma nova coluna, com os atributos especificados pelo índice

In [51]:
df_vector_slice = VectorSlicer(
    inputCol ='features',
    outputCol = 'features_slice',
    indices = [1, 2, 6]
).transform(df_carros_vector_assembler)

df_vector_slice.select('features', 'features_slice').show(5, truncate = False)

+-----------------------------------------------------+---------------+
|features                                             |features_slice |
+-----------------------------------------------------+---------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |[6.0,160.0,0.0]|
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |[6.0,160.0,0.0]|
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |[4.0,108.0,1.0]|
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|[6.0,258.0,1.0]|
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |[8.0,360.0,0.0]|
+-----------------------------------------------------+---------------+
only showing top 5 rows



### Aula 19 - Seleção de atributos com  ChiSqSelector (qui-quadrado)

- Seleção de atributos: técnica de engenharia de atributos que busca melhorar a performance do modelo através da seleção de um subconjunto de atributos mais relevantes para prever a classe
- Usa o teste de independência do qui-quadrado para selecionar os atributos
- Recebe uma coluna com um vetor de atributos e produz uma coluna com um vetor de atributos mais relevantes

- selectorType: numTopFeatures (padrão), percentile, fpr, fdr, fwe. - numTopFeatures: quantos atributos devem ser selecionados. Método adrão, com valor = 50
- percentile: percentual de atributos que devem ser selecionados. Padrão 0.1
- fpr e fwe: seleciona atributos que os valores-p estejam abaixo de um parâmetro. Padrão 0.05
- fdr: aplica o critério de FDR (False Discovery Rate) . Padrão 0.05

In [56]:
df_chi_selector = ChiSqSelector(
    selectorType = 'fdr', 
    fdr = 0.01,
    featuresCol = 'features', 
    outputCol = 'features_selecionados',
    labelCol = 'HP'
).fit(df_carros_vector_assembler).transform(df_carros_vector_assembler)

df_chi_selector.select('features_selecionados').show()

+---------------------+
|features_selecionados|
+---------------------+
|         [160.0,39.0]|
|         [160.0,39.0]|
|        [108.0,385.0]|
|        [258.0,308.0]|
|        [360.0,315.0]|
|        [225.0,276.0]|
|        [360.0,321.0]|
|       [1467.0,369.0]|
|       [1408.0,392.0]|
|       [1676.0,392.0]|
|       [1676.0,392.0]|
|       [2758.0,307.0]|
|       [2758.0,307.0]|
|       [2758.0,307.0]|
|        [472.0,293.0]|
|          [460.0,3.0]|
|        [440.0,323.0]|
|        [787.0,408.0]|
|        [757.0,493.0]|
|        [711.0,422.0]|
+---------------------+
only showing top 20 rows



### Aula 16 - QuantileDiscretizer