# Create Dummy Data for the process

## Add the data-generator directory to the module search path

In [None]:
import sys

# Make the modules under ../generate_dummy_data importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
if '../generate_dummy_data' not in sys.path:
    sys.path.append('../generate_dummy_data')

## Import modules

In [None]:
from data_generator import generate_dummy_data, write_csv, write_json_batches

## Generate dummy data and write to a CSV file

In [None]:
# Build the dummy dataset and persist it as a CSV file under ../data.
# The argument 100 is presumably the number of records to generate —
# confirm against data_generator.generate_dummy_data.
data = generate_dummy_data(100)
write_csv(data, "../data/dummy_data.csv")

## Generate dummy data and write it to multiple JSON batch files

In [None]:
# Produce JSON batch files in ../data/json_batches using the settings passed
# below: 100 records in total, 10 records per batch.
write_json_batches(total_records=100, batch_size=10, output_folder="../data/json_batches")

# Uploading files to MinIO

## Add the file-uploader directory to the module search path

In [None]:
import sys

# Make the modules under ../file_uploader importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
if '../file_uploader' not in sys.path:
    sys.path.append('../file_uploader')

## Import modules for MinIO Upload

In [None]:
from minio import Minio
from minio.error import S3Error
import os
import time
from minio_util import get_minio_client, upload_batch_file, upload_json_files

## Initialize the MinIO client.

In [None]:
# Create the MinIO client through the project helper, targeting the
# "minio:9000" endpoint (host:port, no URL scheme).
client = get_minio_client(endpoint="minio:9000")

## Upload a single CSV file.

In [None]:
# Upload the generated CSV file to the "python-batch-bucket" bucket.
# NOTE(review): the stored object name is decided inside upload_batch_file;
# verify it against the key the Spark read cell expects.
csv_source_file = "../data/dummy_data.csv"
upload_batch_file(client, csv_source_file, "python-batch-bucket")

## Upload all JSON batch files from a directory.

In [None]:
# Upload the JSON batch files found in ../data/json_batches to the
# "python-process-bucket" bucket.
json_directory = "../data/json_batches"
upload_json_files(client, json_directory, "python-process-bucket")

# Process files with Spark into Delta Lake

## Import Modules for Delta Lake and PySpark

In [None]:
from pyspark.sql import SparkSession
from delta import *

## Set up SparkSession with Delta and MinIO

In [None]:
import os

# Build (or reuse) a SparkSession with the Delta Lake package, SQL extension
# and catalog implementation enabled.
spark = (
    SparkSession.builder
    .appName("DeltaLakeOnMinIO")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Point the S3A filesystem at MinIO by setting the options directly on the
# session's Hadoop configuration.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.endpoint", "http://minio:9000")  # use the Docker service name or IP

# Credentials are taken from MINIO_ACCESS_KEY / MINIO_SECRET_KEY when those
# environment variables are set, so real secrets need not live in the notebook.
# The previous literals remain as fallbacks so the existing local setup keeps
# working unchanged.
hadoop_conf.set("fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "ROOTNAME"))
hadoop_conf.set("fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "CHANGEME123"))
hadoop_conf.set("fs.s3a.path.style.access", "true")  # Required for MinIO


## Read CSV batch file from MinIO

In [None]:
# Load the uploaded CSV object from the batch bucket into a DataFrame.
# The first line is treated as the header, fields are comma-separated,
# leading/trailing whitespace around values is ignored, and column types
# are inferred from the data.
# NOTE(review): the object key carries a numeric prefix that looks like an
# upload timestamp — confirm it matches the object actually present in the
# bucket before running this cell.
df = (
    spark.read
    .options(
        header="true",
        delimiter=",",
        ignoreLeadingWhiteSpace="true",
        ignoreTrailingWhiteSpace="true",
        inferSchema="true",
    )
    .csv("s3a://python-batch-bucket/1741243817566_dummy_data.csv")
)

In [None]:
### Show data (Optional)

In [None]:
# Print only the first row, without truncating long column values.
df.show(1, truncate=False)

In [None]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

## Clean Data

In [None]:
# Cleaning step: keep only the rows whose "salary" value exceeds 10000.
df_transformed = df.where(df["salary"] > 10000)

### Skip cleaning (Optional)

In [None]:
# Optional alternative to the cleaning step: pass the loaded DataFrame through
# unfiltered. Running this cell rebinds df_transformed, replacing any
# previously filtered result.
df_transformed = df

## Save as Delta Table

In [None]:
# Write the DataFrame in Delta format to the bucket path below.
# mode("overwrite") replaces whatever data already exists at that path.
df_transformed.write.format("delta").mode("overwrite").save("s3a://python-batch-bucket/delta_output")

## Create Delta Table

In [None]:
# Register the table name csv_batch_table over the Delta data written to
# s3a://python-batch-bucket/delta_output. IF NOT EXISTS makes the statement
# a no-op when the table is already defined.
spark.sql("""
    CREATE TABLE IF NOT EXISTS csv_batch_table
    USING DELTA
    LOCATION 's3a://python-batch-bucket/delta_output'
""")

## Read from csv_batch_table

In [None]:
# Query the table by name through Spark SQL and print the result rows.
spark.sql("SELECT * FROM csv_batch_table").show()

## Read the Delta table directly from its storage path

In [None]:
# Load the Delta data straight from its storage path (no table name involved)
# and print the rows as a check on what was written.
df_check = spark.read.format("delta").load("s3a://python-batch-bucket/delta_output")
df_check.show()