In [1]:
import pandas as pd
import numpy as np

from datetime import datetime

import ast
import json
from datetime import datetime

from pyspark.sql.functions import lit
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, from_json, array_join, transform, explode_outer, to_timestamp, from_unixtime, lit
from pyspark.sql.types import ArrayType, StructType, StringType, LongType

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 3, Finished, Available, Finished)

In [2]:
def expandir_colunas(df, colunas_alvo=None, chaves=["codigo", "nome"], separador="|", ordem_final=None):
    """
    Expande colunas que são listas de dicionários em colunas separadas com os campos desejados (ex: codigo, nome).
    Suporta colunas como string JSON ou já como StructType/ArrayType(StructType).

    Parâmetros:
        df: DataFrame Spark original
        colunas_alvo: lista de colunas a expandir (default = ['classe', 'sistema', 'formato', 'orgaoJulgador', 'assuntos'])
        chaves: chaves dentro de cada dict a extrair (ex: 'codigo', 'nome')
        separador: separador entre elementos concatenados
        ordem_final: ordem final das colunas no DataFrame (lista)

    Retorna:
        DataFrame com colunas expandidas e reorganizadas.
    """
    if colunas_alvo is None:
        colunas_alvo = ['classe', 'sistema', 'formato', 'orgaoJulgador', 'assuntos']

    for coluna in colunas_alvo:
        tipo_coluna = df.schema[coluna].dataType

        # Se for string, assume que é uma lista de dicts em string JSON
        if isinstance(tipo_coluna, StringType):
            schema_array = ArrayType(StructType().add("codigo", StringType()).add("nome", StringType()))
            df = df.withColumn(f"__parsed_{coluna}", from_json(col(coluna), schema_array))
            for chave in chaves:
                nova_coluna = f"{coluna}_{chave}"
                df = df.withColumn(nova_coluna, array_join(transform(col(f"__parsed_{coluna}"), lambda x: x[chave]), separador))

        # Se já for ArrayType(StructType(...))
        elif isinstance(tipo_coluna, ArrayType):
            if isinstance(tipo_coluna.elementType, StructType):
                for chave in chaves:
                    nova_coluna = f"{coluna}_{chave}"
                    df = df.withColumn(nova_coluna, array_join(transform(col(coluna), lambda x: x[chave]), separador))

        # Se for StructType
        elif isinstance(tipo_coluna, StructType):
            for chave in chaves:
                nova_coluna = f"{coluna}_{chave}"
                df = df.withColumn(nova_coluna, col(f"{coluna}.{chave}"))

        else:
            raise ValueError(f"Tipo não esperado para coluna {coluna}: {tipo_coluna}")

    # Reordena as colunas, se desejado
    if ordem_final:
        colunas_disponiveis = [c for c in ordem_final if c in df.columns]
        df = df.select(*colunas_disponiveis)

    return df


StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 4, Finished, Available, Finished)

In [3]:
CAMINHO_PASTA_LOTES = "Files/DATAJUD/resultado-api/lotes"
TABELA_API = "api_datajud"
TABELA_MOVIMENTOS = "movimentos_datajud"

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 5, Finished, Available, Finished)

In [4]:
# Leitura de todos os arquivos Parquet na pasta
df_lotes = spark.read.format("parquet").load(CAMINHO_PASTA_LOTES)

print(f"Registros carregados dos lotes: {df_lotes.count()}")

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 6, Finished, Available, Finished)

Registros carregados dos lotes: 20743476


In [5]:
# Verifica se a tabela Delta já existe
tabelas_existentes = [t.name for t in spark.catalog.listTables()]
tabela_existe = TABELA_API in tabelas_existentes

ordem_colunas = [
    'numeroProcesso', 'classe', 'classe_codigo', 'classe_nome',  
    'sistema', 'sistema_codigo', 'sistema_nome', 'formato', 'formato_codigo',
    'formato_nome', 'tribunal', 'dataHoraUltimaAtualizacao', 'grau',
    '@timestamp', 'dataAjuizamento', 'movimentos', 'id', 'nivelSigilo',
    'orgaoJulgador', 'orgaoJulgador_codigo', 'orgaoJulgador_nome',
    'assuntos', 'assuntos_codigo', 'assuntos_nome', '_id', 'data_download'
]

df_expandido = expandir_colunas(df_lotes, ordem_final=ordem_colunas)

# Adiciona coluna de data de ingestão 
df_expandido = df_expandido.withColumn("data_ingestao", lit(datetime.now().isoformat()))

if not tabela_existe:
    print(f"Criando nova tabela Delta: {TABELA_API}")
    df_expandido.write.format("delta").mode("overwrite").saveAsTable(TABELA_API)
else:
    print(f"Tabela já existe. Iniciando merge com deduplicação por _id")

    # Lê tabela existente
    df_existente = spark.table(TABELA_API).select("_id")

    # Remove duplicados (merge incremental)
    df_novos = df_expandido.join(df_existente, on="_id", how="left_anti")

    if df_novos.count() == 0:
        print("✅ Nenhum dado novo a inserir.")
    else:
        print(f"Inserindo {df_novos.count()} novos registros...")
        df_novos.write.format("delta").mode("append").saveAsTable(TABELA_API)

print("✅ Ingestão finalizada.")

StatementMeta(, 15a409ad-16d3-4cb0-b346-0596a128e0ca, 7, Finished, Available, Finished)

Criando nova tabela Delta: api_datajud
✅ Ingestão finalizada.


### TABELA DE MOVIMENTAÇÕES

In [6]:
# from pyspark.sql import DataFrame
# from pyspark.sql.functions import col, explode_outer, to_timestamp, from_unixtime, lit
# from pyspark.sql.types import LongType

def extrair_movimentos_completos(df: DataFrame) -> DataFrame:
    """
    Extrai os movimentos e seus complementos de um DataFrame Spark que contém uma coluna 'movimentos',
    estruturada como lista de structs. Retorna um DataFrame ordenado por _id e dataHora.
    """

    df_mov = df.withColumn("movimento", explode_outer("movimentos"))

    df_mov = df_mov.withColumn("complemento", explode_outer("movimento.complementosTabelados"))

    df_resultado = df_mov.select(
        col("_id"),
        col("numeroProcesso"),
        col("movimento.codigo").alias("movimentos_codigo"),
        col("movimento.nome").alias("movimentos_nome"),
        to_timestamp(col("movimento.dataHora")).alias("movimentos_dataHora"),
        col("complemento.codigo").alias("movimentos_complementosTabelados_codigo"),
        col("complemento.descricao").alias("movimentos_complementosTabelados_descricao"),
        col("complemento.valor").alias("movimentos_complementosTabelados_valor"),
        col("complemento.nome").alias("movimentos_complementosTabelados_nome"),
        col("data_download")
        
    )

    return df_resultado.orderBy(["_id", "movimentos_dataHora"], ascending=[True, False])

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 7, Finished, Available, Finished)

In [7]:
movimentos = spark.sql("""SELECT _id, numeroProcesso, movimentos, data_download
FROM DOL_arqs_auxiliares.api_datajud""")

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 8, Finished, Available, Finished)

In [8]:
mov_datajud = extrair_movimentos_completos(movimentos)

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 9, Finished, Available, Finished)

In [24]:
display(mov_datajud.tail(100))

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 12, Finished, Available, Finished)

Py4JJavaError: An error occurred while calling o7896.tailToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 25 in stage 38.0 failed 4 times, most recent failure: Lost task 25.3 in stage 38.0 (TID 7458) (vm-4d492599 executor 43): ExecutorLostFailure (executor 43 exited caused by one of the running tasks) Reason: Container from a bad node: container_1755784154804_0001_01_000044 on host: vm-4d492599. Exit status: 137. Diagnostics: [2025-08-21 14:42:29.059]Container killed on request. Exit code is 137
[2025-08-21 14:42:29.081]Container exited with a non-zero exit code 137. 
[2025-08-21 14:42:29.084]Killed by external signal
.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:3055)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2991)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2990)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2990)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1294)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1294)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1294)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3262)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3193)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3182)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1028)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2568)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2589)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2608)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2633)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1056)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:411)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1055)
	at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:320)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:187)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:296)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:179)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:173)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture$lzycompute(ShuffleExchangeExec.scala:149)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture(ShuffleExchangeExec.scala:145)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.$anonfun$submitShuffleJob$1(ShuffleExchangeExec.scala:73)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:268)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:265)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob(ShuffleExchangeExec.scala:73)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob$(ShuffleExchangeExec.scala:72)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.submitShuffleJob(ShuffleExchangeExec.scala:120)
	at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture$lzycompute(QueryStageExec.scala:188)
	at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture(QueryStageExec.scala:188)
	at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.doMaterialize(QueryStageExec.scala:190)
	at org.apache.spark.sql.execution.adaptive.QueryStageExec.materialize(QueryStageExec.scala:61)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$6(AdaptiveSparkPlanExec.scala:297)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$6$adapted(AdaptiveSparkPlanExec.scala:295)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:295)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:961)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:266)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:412)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeTail(AdaptiveSparkPlanExec.scala:393)
	at org.apache.spark.sql.Dataset.$anonfun$tailToPython$1(Dataset.scala:4184)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4348)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:810)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4346)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:132)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:220)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:101)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:961)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4346)
	at org.apache.spark.sql.Dataset.tailToPython(Dataset.scala:4181)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [9]:
# Verifica se a tabela Delta já existe
tabelas_existentes = [t.name for t in spark.catalog.listTables()]
tabela_mov_existe = TABELA_MOVIMENTOS in tabelas_existentes

# AJUSTA PROBLEMA DE DATAS < 01/01/1900
mov_datajud = mov_datajud.filter(col("movimentos_dataHora") >= "1900-01-01") 

mov_datajud = mov_datajud.withColumn("data_ingestao", lit(datetime.now().isoformat()))

# Define política de escrita de datas antigas
# spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")

if not tabela_mov_existe:
    print(f"Criando nova tabela Delta: {TABELA_MOVIMENTOS}")
    mov_datajud.write.format("delta").mode("overwrite").saveAsTable(TABELA_MOVIMENTOS)

else:
    print(f"Tabela já existe. Iniciando merge com deduplicação por múltiplas colunas")

    # Define a chave de deduplicação
    chave_deduplicacao = [
        "_id",
        'numeroProcesso',
        "movimentos_codigo",
        "movimentos_nome",
        "movimentos_dataHora"
    ]

    # Lê dados existentes apenas com as colunas de deduplicação
    df_existente = spark.table(TABELA_MOVIMENTOS).select(*chave_deduplicacao).dropDuplicates()

    # Identifica novos registros via anti join
    df_novos = mov_datajud.join(df_existente, on=chave_deduplicacao, how="left_anti")

    # Verifica se há dados novos
    if df_novos.rdd.isEmpty():
        print("✅ Nenhum dado novo a inserir.")
    else:
        total = df_novos.count()
        print(f"Inserindo {total} novos registros...")
        df_novos.write.format("delta").mode("append").saveAsTable(TABELA_MOVIMENTOS)

print("✅ Ingestão finalizada.")

StatementMeta(, 019b0dda-afc2-4074-9241-4fe0e9137764, 11, Finished, Available, Finished)

Criando nova tabela Delta: movimentos_datajud


Py4JJavaError: An error occurred while calling o7899.saveAsTable.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 73 in stage 37.0 failed 4 times, most recent failure: Lost task 73.3 in stage 37.0 (TID 7249) (vm-c2983333 executor 19): ExecutorLostFailure (executor 19 exited caused by one of the running tasks) Reason: Container from a bad node: container_1755784154804_0001_01_000020 on host: vm-c2983333. Exit status: 137. Diagnostics: [2025-08-21 14:21:20.846]Container killed on request. Exit code is 137
[2025-08-21 14:21:20.859]Container exited with a non-zero exit code 137. 
[2025-08-21 14:21:20.861]Killed by external signal
.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:3055)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2991)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2990)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2990)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1294)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1294)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1294)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3262)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3193)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3182)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1028)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2568)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2589)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2608)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2633)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1056)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:411)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1055)
	at org.apache.spark.RangePartitioner$.sketch(Partitioner.scala:320)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:187)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:296)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:179)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:173)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture$lzycompute(ShuffleExchangeExec.scala:149)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture(ShuffleExchangeExec.scala:145)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.$anonfun$submitShuffleJob$1(ShuffleExchangeExec.scala:73)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:268)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:265)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob(ShuffleExchangeExec.scala:73)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeLike.submitShuffleJob$(ShuffleExchangeExec.scala:72)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.submitShuffleJob(ShuffleExchangeExec.scala:120)
	at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture$lzycompute(QueryStageExec.scala:188)
	at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.shuffleFuture(QueryStageExec.scala:188)
	at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.doMaterialize(QueryStageExec.scala:190)
	at org.apache.spark.sql.execution.adaptive.QueryStageExec.materialize(QueryStageExec.scala:61)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$6(AdaptiveSparkPlanExec.scala:297)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$6$adapted(AdaptiveSparkPlanExec.scala:295)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:295)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:961)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:266)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:412)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.finalPhysicalPlan(AdaptiveSparkPlanExec.scala:258)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.materializeAdaptiveSparkPlan$1(DeltaFileFormatWriter.scala:226)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$write$8(DeltaFileFormatWriter.scala:227)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.materializeAdaptiveSparkPlan$1(DeltaFileFormatWriter.scala:227)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$write$8(DeltaFileFormatWriter.scala:227)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.materializeAdaptiveSparkPlan$1(DeltaFileFormatWriter.scala:227)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$write$10(DeltaFileFormatWriter.scala:233)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.write(DeltaFileFormatWriter.scala:233)
	at org.apache.spark.sql.delta.files.TransactionalWrite.callDeltaFileFormatWriter$1(TransactionalWrite.scala:572)
	at org.apache.spark.sql.delta.files.TransactionalWrite.$anonfun$tryWriteFiles$5(TransactionalWrite.scala:583)
	at scala.Option.flatMap(Option.scala:271)
	at org.apache.spark.sql.delta.files.TransactionalWrite.$anonfun$tryWriteFiles$3(TransactionalWrite.scala:583)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:132)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:220)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:101)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:961)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
	at org.apache.spark.sql.delta.files.TransactionalWrite.tryWriteFiles(TransactionalWrite.scala:519)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:411)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:385)
	at org.apache.spark.sql.delta.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:150)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:382)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:372)
	at org.apache.spark.sql.delta.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:150)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:257)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:253)
	at org.apache.spark.sql.delta.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:150)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:242)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:239)
	at org.apache.spark.sql.delta.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:150)
	at org.apache.spark.sql.delta.commands.WriteIntoDelta.writeFiles(WriteIntoDelta.scala:362)
	at org.apache.spark.sql.delta.commands.WriteIntoDelta.writeAndReturnCommitData(WriteIntoDelta.scala:310)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.doDeltaWrite$1(CreateDeltaTableCommand.scala:250)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.handleCreateTableAsSelect(CreateDeltaTableCommand.scala:277)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.handleCommit(CreateDeltaTableCommand.scala:150)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.$anonfun$run$2(CreateDeltaTableCommand.scala:110)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:169)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:167)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.recordFrameProfile(CreateDeltaTableCommand.scala:57)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:137)
	at com.microsoft.spark.telemetry.delta.SynapseLoggingShim.recordOperation(SynapseLoggingShim.scala:111)
	at com.microsoft.spark.telemetry.delta.SynapseLoggingShim.recordOperation$(SynapseLoggingShim.scala:93)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.recordOperation(CreateDeltaTableCommand.scala:57)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:136)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:126)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:116)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.recordDeltaOperation(CreateDeltaTableCommand.scala:57)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.run(CreateDeltaTableCommand.scala:110)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.$anonfun$createDeltaTable$1(DeltaCatalog.scala:184)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:169)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:167)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.recordFrameProfile(DeltaCatalog.scala:65)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.org$apache$spark$sql$delta$catalog$DeltaCatalog$$createDeltaTable(DeltaCatalog.scala:95)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog$StagedDeltaTableV2.$anonfun$commitStagedChanges$1(DeltaCatalog.scala:545)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:169)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:167)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.recordFrameProfile(DeltaCatalog.scala:65)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog$StagedDeltaTableV2.commitStagedChanges(DeltaCatalog.scala:507)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.$anonfun$writeToTable$1(WriteToDataSourceV2Exec.scala:589)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable(WriteToDataSourceV2Exec.scala:582)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable$(WriteToDataSourceV2Exec.scala:576)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.writeToTable(WriteToDataSourceV2Exec.scala:192)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.run(WriteToDataSourceV2Exec.scala:225)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:214)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:132)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:220)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:101)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:961)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:214)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:202)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:36)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:36)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:36)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:202)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:186)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:180)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:262)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:905)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:664)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:593)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [11]:
# display(spark.sql("""
# SELECT *
# FROM DOL_arqs_auxiliares.movimentos_datajud
# limit 100"""))

StatementMeta(, f885abcb-2cc9-4f14-89ef-7565e69ca288, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f4ee6c05-4a0b-4e52-b598-a40ed010a59d)