## One Hot Encoding:
- Produz, para cada atributo de entrada (índice numérico de categoria), um atributo de saída com um vetor binário esparso
- Espera atributos numéricos: podemos usar StringIndexer para transformar atributos categóricos (strings) em índices numéricos


In [7]:
from pyspark.sql import SparkSession

from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Get the active Spark session, or create one registered under the
# application name 'OneHotEncoding'.
spark = SparkSession.builder.appName('OneHotEncoding').getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

In [5]:
# Load the churn dataset: a semicolon-delimited CSV with a header row,
# letting Spark infer the column types from the data.
df_churn = (
    spark.read
    .format('csv')
    .options(header=True, delimiter=';', inferSchema=True)
    .load('data/Churn.csv')
)

# Preview the first 5 rows.
df_churn.show(5)

                                                                                

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

In [9]:
# Configure an indexer that maps the two string columns to numeric
# indices, written to new '*_idx' columns. handleInvalid='keep' makes the
# indexer keep rows with invalid labels instead of raising an error.
indexer = (
    StringIndexer()
    .setInputCols(['Geography', 'Gender'])
    .setOutputCols(['Geography_idx', 'Gender_idx'])
    .setHandleInvalid('keep')
)

# Fit the label-to-index mapping on the churn data.
model = indexer.fit(df_churn)

# Append the index columns and preview the first 5 rows.
df_churn_idx = model.transform(df_churn)
df_churn_idx.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Geography_idx|Gender_idx|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+----------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|          0.0|       1.0|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|          2.0|       1.0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|          0.0|       1.0|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|          0.0|       1.0|
|        850|    Spain|Female| 43|

In [17]:
# Fit a one-hot encoder on the indexed columns; `onehot` holds the fitted
# model, which writes one encoded vector column per input column.
onehot = OneHotEncoder(
    inputCols=['Geography_idx', 'Gender_idx'],
    outputCols=['onehot_c1', 'onehot_c2'],
).fit(df_churn_idx)



In [21]:
# Apply the fitted encoder and show each original column next to its
# index and its one-hot vector, without truncating the cell contents.
onehot.transform(df_churn_idx).select(
    'Geography',
    'Gender',
    'Geography_idx',
    'Gender_idx',
    'onehot_c1',
    'onehot_c2',
).show(truncate=False)

+---------+------+-------------+----------+-------------+-------------+
|Geography|Gender|Geography_idx|Gender_idx|onehot_c1    |onehot_c2    |
+---------+------+-------------+----------+-------------+-------------+
|France   |Female|0.0          |1.0       |(3,[0],[1.0])|(2,[1],[1.0])|
|Spain    |Female|2.0          |1.0       |(3,[2],[1.0])|(2,[1],[1.0])|
|France   |Female|0.0          |1.0       |(3,[0],[1.0])|(2,[1],[1.0])|
|France   |Female|0.0          |1.0       |(3,[0],[1.0])|(2,[1],[1.0])|
|Spain    |Female|2.0          |1.0       |(3,[2],[1.0])|(2,[1],[1.0])|
|Spain    |Male  |2.0          |0.0       |(3,[2],[1.0])|(2,[0],[1.0])|
|France   |Male  |0.0          |0.0       |(3,[0],[1.0])|(2,[0],[1.0])|
|Germany  |Female|1.0          |1.0       |(3,[1],[1.0])|(2,[1],[1.0])|
|France   |Male  |0.0          |0.0       |(3,[0],[1.0])|(2,[0],[1.0])|
|France   |Male  |0.0          |0.0       |(3,[0],[1.0])|(2,[0],[1.0])|
|France   |Male  |0.0          |0.0       |(3,[0],[1.0])|(2,[0],