In [0]:
# Fallback values used when the corresponding widget is left blank.
DEFAULT_DATABASE = "ifrs17_cur"
DEFAULT_SQL_QUERY = "SELECT * FROM data_ifrs17 LIMIT 10"

# Declare the two notebook parameters (blank by default) before reading them.
for widget_name in ("sql_query", "database"):
    dbutils.widgets.text(widget_name, "")

sql_query = dbutils.widgets.get("sql_query")
database = dbutils.widgets.get("database")

# Select the working database first so that unqualified table names in the
# query resolve against it; a blank widget value falls back to the default.
database = database or DEFAULT_DATABASE
spark.sql(f"USE {database}")

# A blank query widget falls back to a small sample query.
sql_query = sql_query or DEFAULT_SQL_QUERY

In [0]:
# Build the result DataFrame from the query text held in `sql_query`.
# NOTE(review): the query string is passed to Spark verbatim, so whoever sets
# the `sql_query` widget controls exactly what SQL runs here.
df = spark.sql(sql_query)

In [0]:
import hashlib
import time
import uuid

# Build a short per-run token used to name this run's output paths.
# The timestamp alone is not unique: two runs started within the same 0.1 ms
# tick would derive the same token and, because the outputs are written with
# mode("overwrite"), would silently clobber each other. Mixing a random UUID
# into the digest input makes such a deterministic collision impossible.
timestamp = str(int(time.time() * 10000))
run_token = f"{timestamp}-{uuid.uuid4().hex}"
short_hash = hashlib.md5(run_token.encode()).hexdigest()[:6]

# Define paths based on naming convention:
#   base_folder  - location the chunked CSV result is written to
#   summary_file - location of the CSV that lists every chunk file path
base_folder = f"dbfs:/tmp/api_responses/result_{short_hash}.csv/"
summary_file = f"dbfs:/tmp/api_responses/result_{short_hash}_summary.csv"

In [0]:
# Repartition the DataFrame into chunks of approximately `chunk_size` records.
chunk_size = 1000

# Ceiling division, with a floor of one partition so an empty result is still
# written. (The previous `count // chunk_size + 1` produced one partition too
# many whenever the row count was an exact multiple of `chunk_size`.)
row_count = df.count()
num_chunks = max(1, (row_count + chunk_size - 1) // chunk_size)
df_repartitioned = df.repartition(num_chunks)

# Write the data in chunks to DBFS; each partition becomes one part file.
df_repartitioned.write.mode("overwrite").option("header", "true").csv(base_folder)

# List the CSV part files generated for each chunk.
# Spark writes the part-*.csv files directly under `base_folder`, so those
# top-level entries must be collected; sub-directories are still scanned one
# level deep in case the output layout contains them.
chunk_files = []
for entry in dbutils.fs.ls(base_folder):
    if entry.isDir():
        chunk_files.extend(
            part.path
            for part in dbutils.fs.ls(entry.path)
            if part.path.endswith(".csv")
        )
    elif entry.path.endswith(".csv"):
        # Skips marker files such as _SUCCESS that do not end in ".csv".
        chunk_files.append(entry.path)

In [None]:
# Create the summary rows: one single-column tuple per chunk file path.
summary_data = [(file,) for file in chunk_files]

# Create a DataFrame for the summary.
# The schema is given explicitly as a DDL string because Spark cannot infer
# column types from an empty list; with only a column-name list, an empty
# `chunk_files` would raise here instead of producing an empty summary.
summary_df = spark.createDataFrame(summary_data, "csv_file_path string")

# Write the summary as a single part file (coalesce(1)) with a header row.
# Note: Spark's CSV writer creates a directory at `summary_file` that holds
# the part file, not a bare file at that exact path.
summary_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(summary_file)

# Return the path to the summary output as the notebook's exit value.
dbutils.notebook.exit(summary_file)