In [0]:
# Notebook parameters. Both widgets default to an empty string, in which case
# the fallbacks below are used.
dbutils.widgets.text("sql_query", "")
dbutils.widgets.text("database", "")

# Strip surrounding whitespace so that a blank-but-not-empty value (e.g. " ")
# is treated like an unset parameter instead of being passed on to Spark as
# an invalid database name or an empty query.
sql_query = dbutils.widgets.get("sql_query").strip()
database = dbutils.widgets.get("database").strip()

if not database:
  database = "ifrs17_cur"

# NOTE(review): the database name is interpolated into the statement as-is.
# This notebook already executes caller-supplied SQL, so the caller is assumed
# to be trusted -- confirm before exposing these widgets to untrusted input.
spark.sql(f"USE {database}")

if not sql_query:
  sql_query = "SELECT * FROM data_ifrs17 LIMIT 10"

In [0]:
# Run the query (widget value or the default set above) in the database
# selected by the preceding USE statement. The resulting DataFrame is what the
# later cells count, repartition and write out as CSV chunks.
df = spark.sql(sql_query)

In [0]:
import hashlib
import time
import uuid

# Build a short identifier for this run's output folders.
#
# The clock reading alone is not a safe source of uniqueness: two runs that
# start within the same 0.1 ms tick hash to the same value, and a hash
# truncated to a few characters can also repeat across runs. Because the
# output folders are written with mode("overwrite"), a repeated identifier
# would silently replace another run's result. Mixing a random UUID into the
# hashed token makes the identifier independent of the start time.
timestamp = str(int(time.time() * 10000))
run_token = f"{timestamp}-{uuid.uuid4().hex}"

# 12 hex characters (48 bits) keep the folder name short while making an
# accidental repeat far less likely than with a shorter prefix.
short_hash = hashlib.md5(run_token.encode()).hexdigest()[:12]

# Output locations: Spark writes each of these as a *folder* of part files,
# despite the ".csv" suffix in the name.
base_folder = f"dbfs:/tmp/api_responses/result_{short_hash}.csv"
summary_folder = f"dbfs:/tmp/api_responses/result_{short_hash}_summary.csv"

In [0]:
# Count total records
# Count total records (this runs the query once; the write below runs it again).
record_count = df.count()

# Target number of rows per output file.
chunk_size = 1000

# Ceiling division via negated floor division; always keep at least one
# partition so that an empty result still goes through the normal write path.
num_partitions = max(1, -(-record_count // chunk_size))

# Spread the rows over the computed number of partitions. repartition() does
# not promise an exact row count per partition, so on its own it cannot
# guarantee that no chunk exceeds chunk_size.
df_repartitioned = df.repartition(num_partitions)

# Write the chunk files to the base folder. "maxRecordsPerFile" makes
# chunk_size a hard upper bound: a partition holding more rows than that is
# split across several files instead of producing one oversized chunk.
(
    df_repartitioned.write
    .mode("overwrite")
    .option("header", "true")
    .option("maxRecordsPerFile", chunk_size)
    .csv(base_folder)
)

# Collect the paths of the chunk files; the ".csv" filter skips the other
# entries Spark leaves in the output folder (anything not ending in ".csv").
chunk_files = [
    file.path
    for file in dbutils.fs.ls(base_folder)
    if file.path.endswith(".csv")
]

In [None]:
# Create the summary CSV with file paths only (one single-column row per chunk).
summary_data = [(chunk_file,) for chunk_file in chunk_files]

# Pass an explicit schema instead of bare column names: with only names,
# Spark has to infer the column type from the rows, which fails when
# chunk_files is empty.
summary_df = spark.createDataFrame(summary_data, "csv_file_path string")

# Write the summary to its own folder; coalesce(1) keeps it to a single
# partition so that one part file is expected.
summary_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(summary_folder)

# Collect the actual part file path of the summary.
summary_files = [
    file.path
    for file in dbutils.fs.ls(summary_folder)
    if file.path.endswith(".csv")
]

# The notebook's result is the full path of the single summary part file;
# anything other than exactly one such file means the write went wrong.
if len(summary_files) != 1:
    raise ValueError("Error generating summary file.")

dbutils.notebook.exit(summary_files[0])