In [None]:
# spark_minio_demo.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def main():
    """Read a CSV from MinIO, derive numeric columns, and write the result back.

    Steps:
      1. Create (or reuse) a SparkSession against ``spark://spark-master:7077``.
      2. Point the S3A connector at MinIO using path-style access.
      3. Read ``s3a://demo-bucket/input/sample.csv`` with a header row.
      4. Select ``id``, ``value`` cast to double (``value_num``) and twice
         that value (``value_double``).
      5. Overwrite ``s3a://demo-bucket/output/result`` with the result as CSV.

    The MinIO access key, secret key and endpoint may be overridden with the
    ``MINIO_ACCESS_KEY``, ``MINIO_SECRET_KEY`` and ``MINIO_ENDPOINT``
    environment variables; when unset, the demo defaults are used.

    The Spark session is always stopped, even if a step fails.
    """
    # Imported locally so this function stays self-contained.
    import os

    spark = (
        SparkSession.builder
        .appName("Spark MinIO Demo")
        .master("spark://spark-master:7077")
        .getOrCreate()
    )

    try:
        # Configure MinIO (S3-compatible API) on the S3A connector.
        s3a_settings = {
            "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "admin"),
            "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "admin123"),
            "fs.s3a.endpoint": os.environ.get("MINIO_ENDPOINT", "http://minio:9000"),
            "fs.s3a.path.style.access": "true",
        }
        hadoop_conf = spark._jsc.hadoopConfiguration()
        for key, value in s3a_settings.items():
            hadoop_conf.set(key, value)

        input_path = "s3a://demo-bucket/input/sample.csv"
        output_path = "s3a://demo-bucket/output/result"

        df = spark.read.option("header", "true").csv(input_path)
        print("== Input Data ==")
        df.show()

        # Transform: build the double cast once and reuse it for both columns.
        value_num = col("value").cast("double")
        df2 = df.select(
            col("id"),
            value_num.alias("value_num"),
            (value_num * 2).alias("value_double"),
        )

        print("== Transformed Data ==")
        df2.show()

        # Write the transformed data back, replacing any previous output.
        df2.write.mode("overwrite").option("header", "true").csv(output_path)
        print(f"Output saved to {output_path}")
    finally:
        # Release the session even when reading, transforming or writing fails.
        spark.stop()

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
