<a href="https://colab.research.google.com/github/luasampaio/data-engineering/blob/main/37_FuncaoArrayProcessado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Função para processar a leitura de uma coluna do tipo array

In [8]:
import requests
import os
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by the cells below
spark = (
    SparkSession.builder
    .appName("DataIngestion")
    .getOrCreate()
)

def download_file(url, local_path, timeout=30):
    """Download ``url`` and write the response body to ``local_path``.

    Args:
        url: Address of the file to fetch with an HTTP GET.
        local_path: Destination path; the file is opened in binary write
            mode, so an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, which would let a stalled
            connection hang the notebook indefinitely.

    Returns:
        ``local_path`` when the server answers with status 200, otherwise
        ``None`` (the status code is printed).

    Raises:
        requests.exceptions.RequestException: Connection errors and timeouts
            are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and signalled with None.
    if response.status_code != 200:
        print(f"Erro ao baixar arquivo: {response.status_code}")
        return None

    with open(local_path, "wb") as f:
        f.write(response.content)
    print(f"Arquivo salvo em: {local_path}")
    return local_path

def ingest_data(source_url: str, local_path: str, format: str = "json"):
    """Download a file and load it into a Spark DataFrame.

    Args:
        source_url: URL passed to ``download_file``.
        local_path: Local path where the downloaded file is stored and from
            which Spark reads it.
        format: Spark data source format given to ``spark.read.format``
            (defaults to ``"json"``).

    Returns:
        The loaded DataFrame, or ``None`` when the download did not succeed
        or any exception was raised (the exception is printed).
    """
    try:
        downloaded = download_file(source_url, local_path)
        # No file means nothing to read.
        if not downloaded:
            return None
        reader = spark.read.format(format)
        return reader.load(downloaded)
    except Exception as e:
        print(f"Erro ao carregar dados: {e}")
        return None

# Source URL and temporary local path for the downloaded file
source_url = "https://github.com/luasampaio/datasets/raw/main/dados_locacao_imoveis.json"
local_path = "/tmp/dados_locacao_imoveis.json"

# Load the data into Spark
df = ingest_data(source_url, local_path)

# ingest_data returns None on failure, so test for None explicitly instead of
# relying on the truthiness of a DataFrame object.
if df is not None:
    df.printSchema()
    df.show(5)


Arquivo salvo em: /tmp/dados_locacao_imoveis.json
root
 |-- dados_locacao: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- apartamento: string (nullable = true)
 |    |    |-- datas_combinadas_pagamento: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- datas_de_pagamento: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- valor_aluguel: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)

+--------------------+
|       dados_locacao|
+--------------------+
|[{A101 (blocoAP),...|
+--------------------+



In [9]:
# Print the schema of the raw DataFrame again (same output as in the ingestion cell above)
df.printSchema()

root
 |-- dados_locacao: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- apartamento: string (nullable = true)
 |    |    |-- datas_combinadas_pagamento: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- datas_de_pagamento: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- valor_aluguel: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)



In [13]:
from pyspark.sql.functions import explode

# Turn the top-level "dados_locacao" array into one row per element; each row
# carries the element struct in a column named "detalhe".
df_exploded = df.select(
    explode("dados_locacao").alias("detalhe")
)
df_exploded.show()



+--------------------+
|             detalhe|
+--------------------+
|{A101 (blocoAP), ...|
|{A102 (blocoAP), ...|
|{B201 (blocoAP), ...|
|{B202 (blocoAP), ...|
|{C301 (blocoAP), ...|
|{C302 (blocoAP), ...|
|{D401 (blocoAP), ...|
|{D402 (blocoAP), ...|
|{E501 (blocoAP), ...|
|{E502 (blocoAP), ...|
|{F601 (blocoAP), ...|
|{F602 (blocoAP), ...|
|{G701 (blocoAP), ...|
|{G702 (blocoAP), ...|
|{H801 (blocoAP), ...|
+--------------------+



In [15]:
# Flatten the "detalhe" struct into top-level columns. The frame is rebuilt
# from `df` rather than from the previous value of df_exploded so this cell
# can be re-run: once "detalhe" has been expanded it no longer exists, and
# selecting "detalhe.*" from the already-flattened frame fails.
df_exploded = (
    df.select(explode("dados_locacao").alias("detalhe"))
    .select("detalhe.*")
)
df_exploded.show()

+--------------+--------------------------+--------------------+--------------------+
|   apartamento|datas_combinadas_pagamento|  datas_de_pagamento|       valor_aluguel|
+--------------+--------------------------+--------------------+--------------------+
|A101 (blocoAP)|      [01/06/2022, 01/0...|[05/06/2022, 03/0...|[$ 1000,0 reais, ...|
|A102 (blocoAP)|      [02/06/2022, 02/0...|[02/06/2022, 06/0...|[$ 1100,0 reais, ...|
|B201 (blocoAP)|      [03/06/2022, 03/0...|[07/06/2022, 03/0...|[$ 1200,0 reais, ...|
|B202 (blocoAP)|      [04/06/2022, 04/0...|[07/06/2022, 05/0...|[$ 1300,0 reais, ...|
|C301 (blocoAP)|      [05/06/2022, 05/0...|[10/06/2022, 09/0...|[$ 1400,0 reais, ...|
|C302 (blocoAP)|      [06/06/2022, 06/0...|[08/06/2022, 12/0...|[$ 1500,0 reais, ...|
|D401 (blocoAP)|      [07/06/2022, 07/0...|[07/06/2022, 09/0...|[$ 1600,0 reais, ...|
|D402 (blocoAP)|      [08/06/2022, 08/0...|[10/06/2022, 14/0...|[$ 1700,0 reais, ...|
|E501 (blocoAP)|      [09/06/2022, 09/0...|[10/06/2022

In [16]:
# NOTE(review): in the recorded output this rendered only the DataFrame's
# repr (column names and types), not any rows; use .show() to see the data.
display(df_exploded)

DataFrame[apartamento: string, datas_combinadas_pagamento: array<string>, datas_de_pagamento: array<string>, valor_aluguel: array<string>]

In [18]:
# NOTE(review): df_exploded has no "bairro" column (its columns are
# apartamento, datas_combinadas_pagamento, datas_de_pagamento, valor_aluguel),
# and withColumnRenamed does nothing when the source column is missing. The
# table shown below is therefore unchanged — confirm whether this rename is
# still needed or targets the wrong column name.
df_exploded  = df_exploded.withColumnRenamed("bairro", "bairro_locacao")
df_exploded.show()

+--------------+--------------------------+--------------------+--------------------+
|   apartamento|datas_combinadas_pagamento|  datas_de_pagamento|       valor_aluguel|
+--------------+--------------------------+--------------------+--------------------+
|A101 (blocoAP)|      [01/06/2022, 01/0...|[05/06/2022, 03/0...|[$ 1000,0 reais, ...|
|A102 (blocoAP)|      [02/06/2022, 02/0...|[02/06/2022, 06/0...|[$ 1100,0 reais, ...|
|B201 (blocoAP)|      [03/06/2022, 03/0...|[07/06/2022, 03/0...|[$ 1200,0 reais, ...|
|B202 (blocoAP)|      [04/06/2022, 04/0...|[07/06/2022, 05/0...|[$ 1300,0 reais, ...|
|C301 (blocoAP)|      [05/06/2022, 05/0...|[10/06/2022, 09/0...|[$ 1400,0 reais, ...|
|C302 (blocoAP)|      [06/06/2022, 06/0...|[08/06/2022, 12/0...|[$ 1500,0 reais, ...|
|D401 (blocoAP)|      [07/06/2022, 07/0...|[07/06/2022, 09/0...|[$ 1600,0 reais, ...|
|D402 (blocoAP)|      [08/06/2022, 08/0...|[10/06/2022, 14/0...|[$ 1700,0 reais, ...|
|E501 (blocoAP)|      [09/06/2022, 09/0...|[10/06/2022

In [19]:
# Schema after flattening: one string column plus three array<string> columns
df_exploded.printSchema()

root
 |-- apartamento: string (nullable = true)
 |-- datas_combinadas_pagamento: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- datas_de_pagamento: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- valor_aluguel: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [25]:
from pyspark.sql.functions import explode

# Replace the "valor_aluguel" array with one row per element; the remaining
# columns are repeated on every generated row.
df_exploded2 = df_exploded.withColumn(
    "valor_aluguel",
    explode("valor_aluguel"),
)
df_exploded2.show()

+--------------+--------------------------+--------------------+--------------+
|   apartamento|datas_combinadas_pagamento|  datas_de_pagamento| valor_aluguel|
+--------------+--------------------------+--------------------+--------------+
|A101 (blocoAP)|      [01/06/2022, 01/0...|[05/06/2022, 03/0...|$ 1000,0 reais|
|A101 (blocoAP)|      [01/06/2022, 01/0...|[05/06/2022, 03/0...|$ 2500,0 reais|
|A102 (blocoAP)|      [02/06/2022, 02/0...|[02/06/2022, 06/0...|$ 1100,0 reais|
|A102 (blocoAP)|      [02/06/2022, 02/0...|[02/06/2022, 06/0...|$ 2600,0 reais|
|B201 (blocoAP)|      [03/06/2022, 03/0...|[07/06/2022, 03/0...|$ 1200,0 reais|
|B201 (blocoAP)|      [03/06/2022, 03/0...|[07/06/2022, 03/0...|$ 2700,0 reais|
|B202 (blocoAP)|      [04/06/2022, 04/0...|[07/06/2022, 05/0...|$ 1300,0 reais|
|B202 (blocoAP)|      [04/06/2022, 04/0...|[07/06/2022, 05/0...|$ 2800,0 reais|
|C301 (blocoAP)|      [05/06/2022, 05/0...|[10/06/2022, 09/0...|$ 1400,0 reais|
|C301 (blocoAP)|      [05/06/2022, 05/0.

In [26]:
from pyspark.sql.functions import col, explode

def processar_coluna_array(df, coluna: str, manter_colunas: bool = False):
    """Explode an array column of a PySpark DataFrame into one row per element.

    Args:
        df (DataFrame): PySpark DataFrame to read from.
        coluna (str): Name of the column holding the array. Must be listed
            in ``df.columns``.
        manter_colunas (bool): When False (the default) the result contains
            only the exploded values, in a column named
            ``f"{coluna}_elemento"``. When True the other columns of ``df``
            are kept and repeated on every generated row, so each element
            can still be traced back to the row it came from; the original
            array column is dropped and the element column is appended last.

    Returns:
        DataFrame: A new DataFrame with the array elements on separate rows.

    Raises:
        ValueError: If ``coluna`` is not a column of ``df``.
    """
    # Fail early with a readable message instead of a Spark analysis error.
    if coluna not in df.columns:
        raise ValueError(f"A coluna '{coluna}' não existe no DataFrame.")

    nome_elemento = f"{coluna}_elemento"

    if manter_colunas:
        # withColumn keeps every existing column; the source array is then
        # removed so only its exploded form remains.
        return df.withColumn(nome_elemento, explode(col(coluna))).drop(coluna)

    # Default: only the exploded values, one per row.
    return df.select(explode(col(coluna)).alias(nome_elemento))


In [32]:

# Explode "valor_aluguel" with the helper; the result has a single column
# named "valor_aluguel_elemento".
df_array_processado = processar_coluna_array(df_exploded, "valor_aluguel")

In [33]:
# Preview the exploded values (the recorded output is limited to the top 20 rows)
df_array_processado.show()

+----------------------+
|valor_aluguel_elemento|
+----------------------+
|        $ 1000,0 reais|
|        $ 2500,0 reais|
|        $ 1100,0 reais|
|        $ 2600,0 reais|
|        $ 1200,0 reais|
|        $ 2700,0 reais|
|        $ 1300,0 reais|
|        $ 2800,0 reais|
|        $ 1400,0 reais|
|        $ 2900,0 reais|
|        $ 1500,0 reais|
|        $ 1200,0 reais|
|        $ 1600,0 reais|
|        $ 1300,0 reais|
|        $ 1700,0 reais|
|        $ 1400,0 reais|
|        $ 1800,0 reais|
|        $ 1500,0 reais|
|        $ 1900,0 reais|
|        $ 1600,0 reais|
+----------------------+
only showing top 20 rows



In [34]:
# Show the result (same call and same output as the previous cell)
df_array_processado.show()

+----------------------+
|valor_aluguel_elemento|
+----------------------+
|        $ 1000,0 reais|
|        $ 2500,0 reais|
|        $ 1100,0 reais|
|        $ 2600,0 reais|
|        $ 1200,0 reais|
|        $ 2700,0 reais|
|        $ 1300,0 reais|
|        $ 2800,0 reais|
|        $ 1400,0 reais|
|        $ 2900,0 reais|
|        $ 1500,0 reais|
|        $ 1200,0 reais|
|        $ 1600,0 reais|
|        $ 1300,0 reais|
|        $ 1700,0 reais|
|        $ 1400,0 reais|
|        $ 1800,0 reais|
|        $ 1500,0 reais|
|        $ 1900,0 reais|
|        $ 1600,0 reais|
+----------------------+
only showing top 20 rows

