## IndexToString
- Às vezes precisamos converter um índice de volta para a categoria original
    - explicar o modelo
    - Mostrar valores 'reais'

- IndexToString cria um novo atributo (coluna) com as categorias originais

In [1]:
from pyspark.sql import SparkSession

from pyspark.ml.feature import StringIndexer, IndexToString


# Get the active Spark session, or start a new one, registered under the
# application name 'IndexToString'.
spark = SparkSession.builder.appName('IndexToString').getOrCreate()

# Leave the session as the cell's final expression so the notebook displays it.
spark

22/05/17 15:29:34 WARN Utils: Your hostname, andre-UBUNTU20-04 resolves to a loopback address: 127.0.1.1; using 192.168.0.136 instead (on interface wlp2s0)
22/05/17 15:29:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/17 15:29:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the churn dataset from a semicolon-delimited CSV file. The first line
# is treated as the header and column types are inferred from the data.
df_churn = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('delimiter', ';')
    .option('inferSchema', True)
    .load('data/Churn.csv')
)

# Preview the first five rows.
df_churn.show(n=5)

                                                                                

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+

In [3]:
# Encode the string column 'Geography' as a numeric index column
# 'Geography_idx'.
# NOTE(review): handleInvalid='keep' should route invalid/unseen labels to an
# extra index instead of raising; confirm against the StringIndexer docs.
indexer = StringIndexer(
    handleInvalid='keep',
    outputCol='Geography_idx',
    inputCol='Geography',
)

# Fit the indexer on the data to obtain the model holding the mapping.
model = indexer.fit(df_churn)

# Apply the fitted mapping; this adds 'Geography_idx' next to the original
# columns. Preview the first five rows.
df_churn_idx = model.transform(df_churn)
df_churn_idx.show(n=5)

                                                                                

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Geography_idx|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|          0.0|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|          2.0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|          0.0|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|          0.0|
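|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|          2.0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+-------------+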

In [4]:
# Reverse path: go from the StringIndexer output back to the category labels
# with IndexToString.
# NOTE(review): no labels are passed, so the transformer presumably takes them
# from the metadata StringIndexer attached to 'Geography_idx'; confirm in docs.
df_index_to_string = (
    IndexToString(outputCol='Categoria_Original', inputCol='Geography_idx')
    .transform(df_churn_idx)
)

# Show the source label, its index and the recovered label side by side.
(
    df_index_to_string
    .select('Geography', 'Geography_idx', 'Categoria_Original')
    .show(n=5)
)

+---------+-------------+------------------+
|Geography|Geography_idx|Categoria_Original|
+---------+-------------+------------------+
|   France|          0.0|            France|
|    Spain|          2.0|             Spain|
|   France|          0.0|            France|
|   France|          0.0|            France|
|    Spain|          2.0|             Spain|
+---------+-------------+------------------+
only showing top 5 rows



In [5]:
# List label/index pairs sorted by label, printing the first 50 rows.
# NOTE(review): the captured output shows only 'France' rows, so this does not
# reveal the full label-to-index mapping; consider adding .distinct() before
# the sort if the goal is to see every category once.
(
    df_churn_idx
    .select('Geography', 'Geography_idx')
    .orderBy('Geography')
    .show(n=50)
)

+---------+-------------+
|Geography|Geography_idx|
+---------+-------------+
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|
|   France|          0.0|