# Create Dummy Data for the process

## Import modules for Dummy Data Creation

In [None]:
# Adds additional directories for importing custom modules
import sys
sys.path.append('../generate_dummy_data')

from data_generator import generate_dummy_data, write_csv, write_json_batches

## Generate dummy data and write to a CSV file

In [None]:
# Generate the dummy records in memory, then persist them as a single CSV file.
# 100 is the argument handed to generate_dummy_data (presumably the number of
# records — confirm in data_generator).
record_count = 100
csv_output_path = "../data/dummy_data.csv"
data = generate_dummy_data(record_count)
write_csv(data, csv_output_path)

## Generate dummy data and write it to multiple JSON files

In [None]:
# Write the dummy records as JSON batch files under ../data/json_batches.
# With total_records=100 and batch_size=10 this presumably yields ten files —
# confirm against data_generator.write_json_batches.
write_json_batches(
    total_records=100,
    batch_size=10,
    output_folder="../data/json_batches",
)

# Uploading files to MinIO

## Import modules for MinIO Upload

In [None]:
# Adds additional directories for importing custom modules
import sys
sys.path.append('../file_uploader')

from minio import Minio
from minio.error import S3Error
import os
import time
from minio_util import get_minio_client, upload_batch_file, upload_json_files

## Initialize the MinIO client.

In [None]:
# Create the MinIO client that the upload and delete cells below reuse.
# Only the endpoint is passed here; credentials are not visible in this
# notebook, so presumably get_minio_client supplies them — verify in minio_util.
client = get_minio_client(endpoint="minio:9000")

## Upload a single CSV file.

In [None]:
# Upload the CSV written by the dummy-data step into the batch bucket.
csv_source_file = "../data/dummy_data.csv"
csv_bucket_name = "python-batch-bucket"
upload_batch_file(client, csv_source_file, csv_bucket_name)

## Upload all JSON batch files from a directory.

In [None]:
# Upload the JSON batch files from the local batch directory into the
# process bucket.
json_directory = "../data/json_batches"
json_bucket_name = "python-process-bucket"
upload_json_files(client, json_directory, json_bucket_name)

# Process the batch file with Spark into Delta Lake format and save it in MinIO

## Import Modules for Delta Lake and PySpark

In [None]:
from pyspark.sql import SparkSession
from delta import *

## Set up SparkSession with Delta and MinIO

In [None]:
import os

# MinIO credentials for the S3A connector. They are read from the environment
# so that secrets do not have to be edited into (and committed with) the
# notebook; the fallbacks are the values this notebook previously hard-coded.
minio_access_key = os.environ.get("MINIO_ROOT_USER", "ROOTNAME")
minio_secret_key = os.environ.get("MINIO_ROOT_PASSWORD", "CHANGEME123")

# Build (or reuse) a SparkSession with the Delta Lake SQL extension and the
# Delta catalog registered as the session catalog.
spark = (
    SparkSession.builder
    .appName("DeltaLakeOnMinIO")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Point the Hadoop S3A filesystem at MinIO so that s3a:// paths resolve to it.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.endpoint", "http://minio:9000")  # use the Docker service name or IP
hadoop_conf.set("fs.s3a.access.key", minio_access_key)
hadoop_conf.set("fs.s3a.secret.key", minio_secret_key)
hadoop_conf.set("fs.s3a.path.style.access", "true")  # Required for MinIO


## Read CSV batch file from MinIO 
> The **minio_csv_batch_file_full_path** value may need to be updated manually to match the name of the uploaded object.

In [None]:
# Full s3a:// location of the uploaded CSV object. The object name carries a
# numeric prefix, so this value may have to be edited by hand to match what is
# actually in the bucket.
minio_csv_batch_file_full_path = "s3a://python-batch-bucket/1741409275634_dummy_data.csv"

# CSV reader settings: treat the first line as the header, split on commas,
# trim whitespace around values and let Spark infer the column types.
csv_reader_options = {
    "header": "true",
    "delimiter": ",",
    "ignoreLeadingWhiteSpace": "true",
    "ignoreTrailingWhiteSpace": "true",
    "inferSchema": "true",
}
df = spark.read.options(**csv_reader_options).csv(minio_csv_batch_file_full_path)

## Data Exploration (Optional)

In [None]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

In [None]:
# Fetch the first row; as the last expression in the cell it is displayed.
df.head()

In [None]:
# Display the first 5 rows without truncating long column values.
df.show(5, truncate=False)

## Clean Data

In [None]:
# Replace nulls in the "is_active" column with False; the mapping names the
# only column that is filled.
null_replacements = {"is_active": False}
df_clean = df.fillna(null_replacements)

In [None]:
# Discard every row that still contains a null in any column.
df_clean = df_clean.dropna(how="any")

In [None]:
# Keep only the rows whose salary is strictly greater than the threshold.
salary_floor = 10000
df_clean = df_clean.where(df_clean.salary > salary_floor)

### Overwrite `df` so the data exploration cells can be re-run on the cleaned data (Optional)

In [None]:
# Rebind df to the cleaned DataFrame so the exploration cells above, which
# reference df, operate on the cleaned data when re-run.
df = df_clean

## Write the DataFrame in Delta Lake format to MinIO

In [None]:
# Persist the cleaned DataFrame as a Delta table in MinIO; "overwrite" mode
# replaces whatever an earlier run wrote to the same location.
(
    df_clean.write
    .format("delta")
    .mode("overwrite")
    .save("s3a://python-batch-bucket/delta_output")
)

# Data Warehouse using Delta Lake files

## Create Delta Table

In [None]:
# Register the Delta files stored at the given MinIO location as the table
# csv_batch_table. IF NOT EXISTS keeps the cell safe to re-run.
spark.sql("""
    CREATE TABLE IF NOT EXISTS csv_batch_table
    USING DELTA
    LOCATION 's3a://python-batch-bucket/delta_output'
""")

## Read from csv_batch_table

In [None]:
# Query the registered table through Spark SQL and display up to 10 rows.
spark.sql("SELECT * FROM csv_batch_table limit 10").show()

## Directly read Delta Lake file by loading it into a DataFrame

In [None]:
# Load the Delta data straight from its storage path (no table lookup) and
# display it as a sanity check of what was written.
df_check = spark.read.format("delta").load("s3a://python-batch-bucket/delta_output")
df_check.show()

# Delete MinIO Object(s)

## Remove object
https://github.com/minio/minio-py/blob/88f4244fe89fb9f23de4f183bdf79524c712deaa/examples/remove_object.py#L25

### Import modules for MinIO deletion

In [None]:
from minio import Minio

In [None]:
# Delete a single object from the batch bucket.
# NOTE: "file_name" is a placeholder — replace it with the name of the object
# to delete before running this cell.
client.remove_object("python-batch-bucket","file_name")

## Remove a prefix recursively
https://github.com/minio/minio-py/blob/88f4244fe89fb9f23de4f183bdf79524c712deaa/examples/remove_objects.py#L38

### Import modules for MinIO multi-deletion

In [None]:
from minio import Minio
from minio.deleteobjects import DeleteObject

### Delete file(s) with prefix (Prefix = full directory path to delete)

In [None]:
# Wrap every object listed under the "delta_output/" prefix (recursively) in a
# DeleteObject so the whole set can be submitted for removal together.
delete_object_list = (
    DeleteObject(obj.object_name)
    for obj in client.list_objects("python-batch-bucket", "delta_output/", recursive=True)
)
errors = client.remove_objects("python-batch-bucket", delete_object_list)
# Walk the returned errors and report each object that could not be deleted.
for error in errors:
    print("error occurred when deleting object", error)