### Init our resources and catalog

In [0]:
%run ./00-init $reset_all_data=false

### Create Volume

In [0]:
# Create the volume under the same `catalog` / `db` variables that the
# `volume_folder` path is built from, instead of hard-coded names, so the
# volume that is created is always the one that is read from.
# NOTE(review): `catalog` and `db` are presumably defined by ./00-init — confirm.
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog}.{db}.banking_unity_catalog")

In [0]:
# Root path of the Unity Catalog volume; the ingestion cells below build
# their input and checkpoint paths from it.
volume_folder = (
    f"/Volumes/{catalog}/{db}"
    "/banking_unity_catalog"
)

### Ingesting PDF files in binary format using Databricks Auto Loader (`cloudFiles`)

In [0]:
# Configure an Auto Loader ("cloudFiles") streaming reader that loads each
# file under the volume's PDFs folder using the binary-file source format.
pdf_stream_reader = spark.readStream.format("cloudFiles")
pdf_stream_reader = pdf_stream_reader.option("cloudfiles.format", "binaryfile")

# `df` is the streaming DataFrame consumed by the write cell further down.
df = pdf_stream_reader.load(volume_folder+"/PDFs")

In [0]:
from datetime import datetime

# NOTE(review): `timestamp` is no longer part of the checkpoint path. It is only
# kept defined in case a later cell still references it — remove once confirmed unused.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# The checkpoint path must be the same on every run: the stream keeps its record
# of already-ingested files there. A per-run (timestamped) path starts from an
# empty checkpoint each time, so every PDF is appended to the table again.
# NOTE(review): rows already written under the old timestamped checkpoints are
# not tracked here, so the first run with this path ingests the folder once more.
checkpoint_location = f'dbfs:{volume_folder}/checkpoints/raw_pdfs'

raw_pdfs_query = (
    df.writeStream.format("delta")
    .option("checkpointLocation", checkpoint_location)
    .outputMode("append")
    .trigger(availableNow=True)
    .toTable(f"{catalog}.{db}.documents_raw")
)

# toTable() returns as soon as the query has started. Block until the
# availableNow run has processed the pending files so that later cells
# read a fully written table.
raw_pdfs_query.awaitTermination()

<pyspark.sql.streaming.query.StreamingQuery at 0x7ab630d5ed50>

In [0]:
# Show the ingested documents. The table is resolved through the same
# `catalog` / `db` variables used when writing it, so this cell always
# inspects the table the stream actually wrote to.
display(spark.table(f"{catalog}.{db}.documents_raw"))

path,modificationTime,length,content
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/Woodgrove - Insurance_Summary Plan Description_Employee Benefits-1.pdf,2025-05-27T09:10:28Z,1822981,JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PAovVHlwZSAvUGFnZXMKL0NvdW50IDEKL0tpZHMgWyA0IDAgUiBdCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9kdWNlciAocHlwZGYpCj4+CmVuZG9iagozIDAgb2JqCjw8Ci9UeXA= (truncated)
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/DAILY NEWS 2.pdf,2025-05-27T09:10:28Z,597777,JVBERi0xLjcKCjEgMCBvYmoKICA8PCAvVHlwZSAvWE9iamVjdAogICAgIC9TdWJ0eXBlIC9JbWFnZQogICAgIC9CaXRzUGVyQ29tcG9uZW50IDgKICAgICAvTGVuZ3RoIDIgMCBSCiAgICAgL0hlaWdodCA5NTYKICAgICAvV2k= (truncated)
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/DAILY NEWS 1.pdf,2025-05-27T09:10:28Z,589116,JVBERi0xLjcKCjEgMCBvYmoKICA8PCAvVHlwZSAvWE9iamVjdAogICAgIC9TdWJ0eXBlIC9JbWFnZQogICAgIC9CaXRzUGVyQ29tcG9uZW50IDgKICAgICAvTGVuZ3RoIDIgMCBSCiAgICAgL0hlaWdodCA5OTIKICAgICAvV2k= (truncated)
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/Woodgrove - Insurance_Summary Plan Description_Employee Benefits-5.pdf,2025-05-27T09:10:28Z,386321,JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PAovVHlwZSAvUGFnZXMKL0NvdW50IDEKL0tpZHMgWyA0IDAgUiBdCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9kdWNlciAocHlwZGYpCj4+CmVuZG9iagozIDAgb2JqCjw8Ci9UeXA= (truncated)
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/Woodgrove - Insurance_Summary Plan Description_Employee Benefits-13.pdf,2025-05-27T09:10:28Z,311881,JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PAovVHlwZSAvUGFnZXMKL0NvdW50IDEKL0tpZHMgWyA0IDAgUiBdCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9kdWNlciAocHlwZGYpCj4+CmVuZG9iagozIDAgb2JqCjw8Ci9UeXA= (truncated)
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/Woodgrove - Insurance_Summary Plan Description_Employee Benefits-12.pdf,2025-05-27T09:10:28Z,181288,JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PAovVHlwZSAvUGFnZXMKL0NvdW50IDEKL0tpZHMgWyA0IDAgUiBdCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9kdWNlciAocHlwZGYpCj4+CmVuZG9iagozIDAgb2JqCjw8Ci9UeXA= (truncated)
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/Woodgrove - Insurance_Summary Plan Description_Employee Benefits-0.pdf,2025-05-27T09:10:27Z,132937,JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PAovVHlwZSAvUGFnZXMKL0NvdW50IDEKL0tpZHMgWyA0IDAgUiBdCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9kdWNlciAocHlwZGYpCj4+CmVuZG9iagozIDAgb2JqCjw8Ci9UeXA= (truncated)
dbfs:/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/PDFs/Woodgrove - Insurance_Summary Plan Description_Employee Benefits-2.pdf,2025-05-27T09:10:28Z,87406,JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PAovVHlwZSAvUGFnZXMKL0NvdW50IDEKL0tpZHMgWyA0IDAgUiBdCj4+CmVuZG9iagoyIDAgb2JqCjw8Ci9Qcm9kdWNlciAocHlwZGYpCj4+CmVuZG9iagozIDAgb2JqCjw8Ci9UeXA= (truncated)
