In [7]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [8]:
# Create the Spark session for this lesson (getOrCreate reuses an existing one).
# The 'spark.sql.repl.eagerEval.enabled' option is switched on for notebook use.
spark = (
    SparkSession.builder
    .appName('Aula 4 - WITHCOLUMN: Adicionando ou Modificando Colunas')
    .config('spark.sql.repl.eagerEval.enabled', True)
    .getOrCreate()
)

In [9]:
# Show the SparkSession object as this cell's output.
spark

In [12]:
# Print the built-in help for the pyspark.sql.functions module (imported as F).
# NOTE(review): this lists every function in the module, so the output is very
# long; consider calling help() on a single function (e.g. F.lit) instead.
help(F)

Help on module pyspark.sql.functions in pyspark.sql:

NAME
    pyspark.sql.functions - A collections of builtin functions

FUNCTIONS
    abs(col: 'ColumnOrName') -> pyspark.sql.column.Column
        Computes the absolute value.

        .. versionadded:: 1.3

    acos(col: 'ColumnOrName') -> pyspark.sql.column.Column
        Computes inverse cosine of the input column.

        .. versionadded:: 1.4.0

        Returns
        -------
        :class:`~pyspark.sql.Column`
            inverse cosine of `col`, as if computed by `java.lang.Math.acos()`

    acosh(col: 'ColumnOrName') -> pyspark.sql.column.Column
        Computes inverse hyperbolic cosine of the input column.

        .. versionadded:: 3.1.0

        Returns
        -------
        :class:`~pyspark.sql.Column`

    add_months(start: 'ColumnOrName', months: Union[ForwardRef('ColumnOrName'), int]) -> pyspark.sql.column.Column
        Returns the date that is `months` months after `start`

        .. versionadded:: 1.5.0

     

In [10]:
# Location of the LOGINS parquet dataset.
# The path can be overridden with the LOGINS_PARQUET_PATH environment variable so
# the notebook also runs on other machines; when the variable is not set, the
# original local path is used unchanged.
local_file = os.environ.get(
    'LOGINS_PARQUET_PATH',
    r'C:\Users\mluiz\Documents\05_Python\PySpark\datasets\LOGINS.parquet',
)

In [5]:
# Load the parquet dataset at `local_file` into a Spark DataFrame.
df = spark.read.format('parquet').load(local_file)

In [6]:
# Preview the first 3 rows of the loaded data.
df.show(n=3)

+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+
|           cpf|               email|     senha|data_de_nascimento|estado|data_cadastro|          ipv4|cor_favorita|           profissao|       telefone|
+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+
|981.507.362-12|pedro-lucas53@gma...|+7^7E%xFBc|        2006-12-18|    RR|   2023-02-26|99.107.250.210|        Roxo|    Jogador De Golfe|   31 7785-4046|
|493.705.168-75|rezendeisaac@hotm...|_O_2GRnGOe|        1992-06-17|    GO|   2023-02-16| 197.11.26.213|       Ciano|Atleta De Arremes...|(031) 0803-6753|
|398.471.625-73|felipepires@uol.c...|*Aw5EOAvy9|        1921-11-11|    MG|   2023-01-02|  181.90.63.58|        Azul|      Papiloscopista|   11 9674-0553|
+--------------+--------------------+----------+------------------+------+--

In [15]:
# F.lit builds a literal (constant) column, so every row receives 'Brasil' in 'pais'.
(
    df
    .withColumn('pais', F.lit('Brasil'))
    .show(3)
)

+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+------+
|           cpf|               email|     senha|data_de_nascimento|estado|data_cadastro|          ipv4|cor_favorita|           profissao|       telefone|  pais|
+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+------+
|981.507.362-12|pedro-lucas53@gma...|+7^7E%xFBc|        2006-12-18|    RR|   2023-02-26|99.107.250.210|        Roxo|    Jogador De Golfe|   31 7785-4046|Brasil|
|493.705.168-75|rezendeisaac@hotm...|_O_2GRnGOe|        1992-06-17|    GO|   2023-02-16| 197.11.26.213|       Ciano|Atleta De Arremes...|(031) 0803-6753|Brasil|
|398.471.625-73|felipepires@uol.c...|*Aw5EOAvy9|        1921-11-11|    MG|   2023-01-02|  181.90.63.58|        Azul|      Papiloscopista|   11 9674-0553|Brasil|
+--------------+------------------

In [16]:
# Adds 'sigla_estado' as a copy of the existing 'estado' column; 'estado' itself is kept.
(
    df
    .withColumn('sigla_estado', F.col('estado'))
    .show(3)
)

+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+------------+
|           cpf|               email|     senha|data_de_nascimento|estado|data_cadastro|          ipv4|cor_favorita|           profissao|       telefone|sigla_estado|
+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+------------+
|981.507.362-12|pedro-lucas53@gma...|+7^7E%xFBc|        2006-12-18|    RR|   2023-02-26|99.107.250.210|        Roxo|    Jogador De Golfe|   31 7785-4046|          RR|
|493.705.168-75|rezendeisaac@hotm...|_O_2GRnGOe|        1992-06-17|    GO|   2023-02-16| 197.11.26.213|       Ciano|Atleta De Arremes...|(031) 0803-6753|          GO|
|398.471.625-73|felipepires@uol.c...|*Aw5EOAvy9|        1921-11-11|    MG|   2023-01-02|  181.90.63.58|        Azul|      Papiloscopista|   11 9674-0553|          MG

In [25]:
# withColumn calls can be chained: add the literal 'pais' column, then the
# 'sigla_estado' copy of 'estado', and preview the first 3 rows.
(
    df
    .withColumn('pais', F.lit('Brasil'))
    .withColumn('sigla_estado', F.col('estado'))
    .show(3)
)

+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+------+------------+
|           cpf|               email|     senha|data_de_nascimento|estado|data_cadastro|          ipv4|cor_favorita|           profissao|       telefone|  pais|sigla_estado|
+--------------+--------------------+----------+------------------+------+-------------+--------------+------------+--------------------+---------------+------+------------+
|981.507.362-12|pedro-lucas53@gma...|+7^7E%xFBc|        2006-12-18|    RR|   2023-02-26|99.107.250.210|        Roxo|    Jogador De Golfe|   31 7785-4046|Brasil|          RR|
|493.705.168-75|rezendeisaac@hotm...|_O_2GRnGOe|        1992-06-17|    GO|   2023-02-16| 197.11.26.213|       Ciano|Atleta De Arremes...|(031) 0803-6753|Brasil|          GO|
|398.471.625-73|felipepires@uol.c...|*Aw5EOAvy9|        1921-11-11|    MG|   2023-01-02|  181.90.63.58|        Azul|      Papilosc

In [28]:
# Map a few state codes to their full names with a chained F.when(...).
# There is no .otherwise() here, so rows whose 'estado' matches none of the
# conditions end up with null in 'nome_estados'.
(
    df
    .withColumn(
        'nome_estados',
        F.when(df['estado'] == 'AC', 'Acre')
        .when(df['estado'] == 'SP', 'Sao Paulo')
        .when(df['estado'] == 'RJ', 'Rio de Janeiro')
        .when(df['estado'] == 'SC', 'Santa Catarina')
        .when(df['estado'] == 'MG', 'Minas Gerais')
        .when(df['estado'] == 'RS', 'Rio Grande do Sul'),
    )
    .show(10)
)

+--------------+--------------------+----------+------------------+------+-------------+---------------+------------+--------------------+-------------------+------------+
|           cpf|               email|     senha|data_de_nascimento|estado|data_cadastro|           ipv4|cor_favorita|           profissao|           telefone|nome_estados|
+--------------+--------------------+----------+------------------+------+-------------+---------------+------------+--------------------+-------------------+------------+
|981.507.362-12|pedro-lucas53@gma...|+7^7E%xFBc|        2006-12-18|    RR|   2023-02-26| 99.107.250.210|        Roxo|    Jogador De Golfe|       31 7785-4046|        null|
|493.705.168-75|rezendeisaac@hotm...|_O_2GRnGOe|        1992-06-17|    GO|   2023-02-16|  197.11.26.213|       Ciano|Atleta De Arremes...|    (031) 0803-6753|        null|
|398.471.625-73|felipepires@uol.c...|*Aw5EOAvy9|        1921-11-11|    MG|   2023-01-02|   181.90.63.58|        Azul|      Papiloscopista|  

In [30]:
# Same state-code mapping, now closed with .otherwise('Outros') so unmatched
# rows get 'Outros' instead of null. A second column, 'flag_rosa', is 1 when
# 'cor_favorita' is 'Rosa' and 0 for every other row.
(
    df
    .withColumn(
        'nome_estados',
        F.when(df['estado'] == 'AC', 'Acre')
        .when(df['estado'] == 'SP', 'Sao Paulo')
        .when(df['estado'] == 'RJ', 'Rio de Janeiro')
        .when(df['estado'] == 'SC', 'Santa Catarina')
        .when(df['estado'] == 'MG', 'Minas Gerais')
        .when(df['estado'] == 'RS', 'Rio Grande do Sul')
        .otherwise('Outros'),
    )
    .withColumn(
        'flag_rosa',
        F.when(df['cor_favorita'] == 'Rosa', 1).otherwise(0),
    )
    .show(10)
)

+--------------+--------------------+----------+------------------+------+-------------+---------------+------------+--------------------+-------------------+------------+---------+
|           cpf|               email|     senha|data_de_nascimento|estado|data_cadastro|           ipv4|cor_favorita|           profissao|           telefone|nome_estados|flag_rosa|
+--------------+--------------------+----------+------------------+------+-------------+---------------+------------+--------------------+-------------------+------------+---------+
|981.507.362-12|pedro-lucas53@gma...|+7^7E%xFBc|        2006-12-18|    RR|   2023-02-26| 99.107.250.210|        Roxo|    Jogador De Golfe|       31 7785-4046|      Outros|        0|
|493.705.168-75|rezendeisaac@hotm...|_O_2GRnGOe|        1992-06-17|    GO|   2023-02-16|  197.11.26.213|       Ciano|Atleta De Arremes...|    (031) 0803-6753|      Outros|        0|
|398.471.625-73|felipepires@uol.c...|*Aw5EOAvy9|        1921-11-11|    MG|   2023-01-02|  