In [0]:
%sql
-- Make redox.main the current namespace so the unqualified table names used
-- further down (e.g. ccda_bronze) are looked up there.
use redox.main;

In [0]:
%sql
-- Sanity check: total number of rows currently in ccda_bronze.
select count(*) from ccda_bronze;

In [0]:
# Directory that the extracted files are copied into; the copy and LIST cells
# at the end of the notebook both read this path.
destination_volume = "/Volumes/redox/main/extract_staging/ccda"

In [0]:
from pyspark.sql.functions import col

In [0]:
# Stream the Delta change data feed (CDF) of `ccda_bronze` and keep only the
# change rows that carry the current state of a record:
#   - "insert"           -> rows that were newly added
#   - "update_postimage" -> the row values *after* an update
# CDF never emits a plain "update" change type; updates surface as the pair
# "update_preimage" / "update_postimage". Filtering on "update" therefore
# silently dropped every updated row.
df = (
    spark.readStream.format("delta")
    .option("readChangeFeed", "true")
    .table("ccda_bronze")
    .filter(col("_change_type").isin("insert", "update_postimage"))
)


In [0]:
# Drain whatever the change feed currently has available into the in-memory
# table "in_memory_table", then block until that one-off run has finished.
streaming_query = (
    df.writeStream
    .format("memory")
    .queryName("in_memory_table")
    .trigger(availableNow=True)
    .start()
)
streaming_query.awaitTermination()

In [0]:
# Summarise the captured change rows: one row per (commit version, commit
# timestamp) with the number of change records in it, newest commit first.
change_rows = spark.sql("select * from in_memory_table")
commit_counts = change_rows.groupBy("_commit_version", "_commit_timestamp").count()
distinct_commits = commit_counts.sort("_commit_timestamp", ascending=False)
display(distinct_commits)

In [0]:
# Render the change-feed DataFrame `df` in the notebook output.
# NOTE(review): `df` is created with spark.readStream, so this displays a
# streaming DataFrame — presumably that starts a live query that keeps running
# until it is cancelled; confirm this is intended.
display(df)

In [0]:
# Narrow the change feed to the two `file_metadata` fields needed for the
# copy step: the source path and the file name.
new_files = df.select("file_metadata.file_path", "file_metadata.file_name")

In [0]:
# Materialise the (file_path, file_name) stream into an in-memory table so its
# rows can be queried as an ordinary DataFrame. A "console" sink would only
# print the rows to the notebook output, where later code cannot reach them.
(
    new_files.writeStream
    .format("memory")
    .queryName("new_files_table")
    .trigger(availableNow=True)
    .start()
    .awaitTermination()
)

# Read the collected rows back as a Spark DataFrame.
result_df = spark.sql("SELECT * FROM new_files_table")
# Uncomment to inspect the rows: display(result_df)

In [0]:
# Bring the file rows onto the driver and turn each one into a plain dict
# holding just its source path and file name.
file_list = result_df.collect()

file_dicts = [
    {key: row[key] for key in ("file_path", "file_name")}
    for row in file_list
]

# Echo the resulting list as the cell output.
file_dicts

In [0]:
# Copy every listed file into the destination volume under its own file name.
for file in file_dicts:
    source_path = file["file_path"]
    target_path = f"{destination_volume}/{file['file_name']}"
    dbutils.fs.cp(source_path, target_path)

In [0]:
# Show the contents of the destination volume path by running a LIST statement on it.
display(spark.sql(f"LIST '{destination_volume}';"))