In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MinIO Read Example") \
    .config("spark.jars", "jars/hadoop-aws-3.3.2.jar,jars/aws-java-sdk-bundle-1.11.1026.jar") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin123") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .getOrCreate()

In [4]:
df_transactions = spark.read.option("header", True) \
    .csv("s3a://raw-data/transactions.csv")

In [5]:
df_customers = spark.read.option("header", True) \
    .csv("s3a://raw-data/customers.csv")

In [6]:
df_transactions.printSchema()

root
 |-- Invoice ID: string (nullable = true)
 |-- Line: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Unit Price: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Line Total: string (nullable = true)
 |-- Store ID: string (nullable = true)
 |-- Employee ID: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Currency Symbol: string (nullable = true)
 |-- SKU: string (nullable = true)
 |-- Transaction Type: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Invoice Total: string (nullable = true)



In [7]:
df_customers.printSchema()

root
 |-- Customer ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Telephone: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Date Of Birth: string (nullable = true)
 |-- Job Title: string (nullable = true)



In [8]:
from pyspark.sql.functions import col, sum, when

missing_counts = df_transactions.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_transactions.columns
])

missing_counts.show()

+----------+----+-----------+----------+------+-------+----------+--------+----+--------+----------+--------+-----------+--------+---------------+---+----------------+--------------+-------------+
|Invoice ID|Line|Customer ID|Product ID|  Size|  Color|Unit Price|Quantity|Date|Discount|Line Total|Store ID|Employee ID|Currency|Currency Symbol|SKU|Transaction Type|Payment Method|Invoice Total|
+----------+----+-----------+----------+------+-------+----------+--------+----+--------+----------+--------+-----------+--------+---------------+---+----------------+--------------+-------------+
|         0|   0|          0|         0|413102|4350783|         0|       0|   0|       0|         0|       0|          0|       0|              0|  0|               0|             0|            0|
+----------+----+-----------+----------+------+-------+----------+--------+----+--------+----------+--------+-----------+--------+---------------+---+----------------+--------------+-------------+



In [9]:
from pyspark.sql.functions import col, when, count

total_rows = df_transactions.count()

missing_size = df_transactions.filter(col("Size").isNull()).count()
missing_color = df_transactions.filter(col("Color").isNull()).count()

print(f"Size missing: {missing_size} ({missing_size/total_rows:.2%})")
print(f"Color missing: {missing_color} ({missing_color/total_rows:.2%})")

Size missing: 413102 (6.44%)
Color missing: 4350783 (67.80%)


In [10]:
from pyspark.sql.functions import col, count, when

# 1. Hitung nilai yang paling sering muncul (modus) untuk kolom kategorikal
most_common_size = (
    df_transactions.filter(col("Size").isNotNull())
      .groupBy("Size")
      .count()
      .orderBy("count", ascending=False)
      .first()[0]
)

most_common_color = (
    df_transactions.filter(col("Color").isNotNull())
      .groupBy("Color")
      .count()
      .orderBy("count", ascending=False)
      .first()[0]
)

# 2. Isi missing value dengan nilai modus tersebut
df_transactions_filled = df_transactions.fillna({
    "Size": most_common_size,
    "Color": most_common_color
})

In [11]:
from pyspark.sql.functions import col, sum, when

missing_counts = df_transactions_filled.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_transactions_filled.columns
])

missing_counts.show()

+----------+----+-----------+----------+----+-----+----------+--------+----+--------+----------+--------+-----------+--------+---------------+---+----------------+--------------+-------------+
|Invoice ID|Line|Customer ID|Product ID|Size|Color|Unit Price|Quantity|Date|Discount|Line Total|Store ID|Employee ID|Currency|Currency Symbol|SKU|Transaction Type|Payment Method|Invoice Total|
+----------+----+-----------+----------+----+-----+----------+--------+----+--------+----------+--------+-----------+--------+---------------+---+----------------+--------------+-------------+
|         0|   0|          0|         0|   0|    0|         0|       0|   0|       0|         0|       0|          0|       0|              0|  0|               0|             0|            0|
+----------+----+-----------+----------+----+-----+----------+--------+----+--------+----------+--------+-----------+--------+---------------+---+----------------+--------------+-------------+



In [12]:
from pyspark.sql.functions import col, sum, when

missing_counts = df_customers.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_customers.columns
])

missing_counts.show()

+-----------+----+-----+---------+----+-------+------+-------------+---------+
|Customer ID|Name|Email|Telephone|City|Country|Gender|Date Of Birth|Job Title|
+-----------+----+-----+---------+----+-------+------+-------------+---------+
|          0|   0|    0|        0|   0|      0|     0|            0|   584185|
+-----------+----+-----+---------+----+-------+------+-------------+---------+



In [13]:
most_common_size = (
    df_customers.filter(col("Job Title").isNotNull())
      .groupBy("Job Title")
      .count()
      .orderBy("count", ascending=False)
      .first()[0]
)

# 2. Isi missing value dengan nilai modus tersebut
df_customers_filled = df_customers.fillna({
    "Job Title": most_common_size
})

In [14]:
from pyspark.sql.functions import col, sum, when

missing_counts = df_customers_filled.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_customers_filled.columns
])

missing_counts.show()

+-----------+----+-----+---------+----+-------+------+-------------+---------+
|Customer ID|Name|Email|Telephone|City|Country|Gender|Date Of Birth|Job Title|
+-----------+----+-----+---------+----+-------+------+-------------+---------+
|          0|   0|    0|        0|   0|      0|     0|            0|        0|
+-----------+----+-----+---------+----+-------+------+-------------+---------+



In [15]:
df_transactions_filled.show(2)
df_customers_filled.show(2)

+-------------------+----+-----------+----------+----+-----+----------+--------+-------------------+--------+----------+--------+-----------+--------+---------------+-----------+----------------+--------------+-------------+
|         Invoice ID|Line|Customer ID|Product ID|Size|Color|Unit Price|Quantity|               Date|Discount|Line Total|Store ID|Employee ID|Currency|Currency Symbol|        SKU|Transaction Type|Payment Method|Invoice Total|
+-------------------+----+-----------+----------+----+-----+----------+--------+-------------------+--------+----------+--------+-----------+--------+---------------+-----------+----------------+--------------+-------------+
|INV-US-001-03558761|   1|      47162|       485|   M| BLUE|      80.5|       1|2023-01-01 15:42:00|     0.0|      80.5|       1|          7|     USD|              $| MASU485-M-|            Sale|          Cash|        126.7|
|INV-US-001-03558761|   2|      47162|      2779|   G| BLUE|      31.5|       1|2023-01-01 15:42:00|

In [19]:
from pyspark.sql import functions as F

# Join kedua tabel di customer ID
df_joined = df_transactions_filled.join(df_customers_filled, on="Customer ID", how="inner")

# 1. Total penjualan dan jumlah transaksi per customer
agg_customer = df_joined.groupBy("Customer ID", "Name").agg(
    F.countDistinct("Invoice ID").alias("num_transactions"),
    F.sum("Invoice Total").alias("total_sales"),
    F.avg("Invoice Total").alias("avg_sales")
).orderBy(F.desc("total_sales"))

agg_customer.show(5, truncate=False)

+-----------+------+----------------+-----------------+------------------+
|Customer ID|Name  |num_transactions|total_sales      |avg_sales         |
+-----------+------+----------------+-----------------+------------------+
|577063     |李秀华|15              |86566.68999999999|2278.070789473684 |
|671898     |王慧  |10              |81611.0          |3022.6296296296296|
|450001     |张峰  |18              |79720.96         |1771.576888888889 |
|694236     |赵娟  |19              |77675.53         |1726.1228888888888|
|482569     |花刚  |18              |75297.15         |2596.453448275862 |
+-----------+------+----------------+-----------------+------------------+
only showing top 5 rows



In [20]:
# 2. Total penjualan per city
agg_city = df_joined.groupBy("City").agg(
    F.countDistinct("Invoice ID").alias("num_transactions"),
    F.sum("Invoice Total").alias("total_sales"),
    F.countDistinct("Customer ID").alias("num_customers")
).orderBy(F.desc("total_sales"))

agg_city.show(5, truncate=False)

+----+----------------+--------------------+-------------+
|City|num_transactions|total_sales         |num_customers|
+----+----------------+--------------------+-------------+
|深圳|207214          |2.1415250447999996E8|55941        |
|上海|203540          |2.1169211219E8      |40976        |
|广州|193603          |2.0375263251999995E8|32996        |
|北京|159529          |1.667636002999999E8 |46232        |
|重庆|107921          |1.1371054553000012E8|24003        |
+----+----------------+--------------------+-------------+
only showing top 5 rows



In [21]:
# 3. Penjualan dan jumlah customer berdasarkan gender
agg_gender = df_joined.groupBy("Gender").agg(
    F.countDistinct("Invoice ID").alias("num_transactions"),
    F.sum("Invoice Total").alias("total_sales"),
    F.countDistinct("Customer ID").alias("num_customers")
)

agg_gender.show()

+------+----------------+-------------------+-------------+
|Gender|num_transactions|        total_sales|num_customers|
+------+----------------+-------------------+-------------+
|     F|         2901929|9.343590403999864E8|       596099|
|     M|         1635652|6.273548965599934E8|       686397|
|     D|            2823|          954399.91|         1211|
+------+----------------+-------------------+-------------+



In [22]:
# 4. Top 10 customer by total sales
top_customers = agg_customer.limit(10)

top_customers.show(truncate=False)

+-----------+------+----------------+-----------------+------------------+
|Customer ID|Name  |num_transactions|total_sales      |avg_sales         |
+-----------+------+----------------+-----------------+------------------+
|577063     |李秀华|15              |86566.68999999999|2278.070789473684 |
|671898     |王慧  |10              |81611.0          |3022.6296296296296|
|450001     |张峰  |18              |79720.96         |1771.576888888889 |
|694236     |赵娟  |19              |77675.53         |1726.1228888888888|
|482569     |花刚  |18              |75297.15         |2596.453448275862 |
|363327     |陈慧  |21              |72841.45         |1348.9157407407406|
|502106     |齐秀芳|26              |72105.18000000001|1360.4750943396227|
|411743     |郑涛  |5               |72068.25         |2771.855769230769 |
|484353     |柏秀华|22              |71872.74         |1437.4548000000002|
|549169     |孙建军|16              |70282.75         |2129.780303030303 |
+-----------+------+----------------+------------

In [25]:
# Simpan hasil clean ke MinIO
df_transactions_filled.write.mode("overwrite").parquet("s3a://raw-data/raw-data-cleaned/transactions/")
df_customers_filled.write.mode("overwrite").parquet("s3a://raw-data/raw-data-cleaned/customers/")

Py4JJavaError: An error occurred while calling o582.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 79.0 failed 1 times, most recent failure: Lost task 0.0 in stage 79.0 (TID 350) (Nicholas.mshome.net executor driver): java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.util.DiskChecker.checkAccessByFileMethods(DiskChecker.java:160)
	at org.apache.hadoop.util.DiskChecker.checkDirInternal(DiskChecker.java:100)
	at org.apache.hadoop.util.DiskChecker.checkDir(DiskChecker.java:77)
	at org.apache.hadoop.util.BasicDiskValidator.checkStatus(BasicDiskValidator.java:32)
	at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.confChanged(LocalDirAllocator.java:330)
	at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:393)
	at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:165)
	at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:146)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.createTmpFileForWrite(S3AFileSystem.java:1282)
	at org.apache.hadoop.fs.s3a.S3ADataBlocks$DiskBlockFactory.create(S3ADataBlocks.java:816)
	at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.createBlockIfNeeded(S3ABlockOutputStream.java:211)
	at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.<init>(S3ABlockOutputStream.java:188)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerCreateFile(S3AFileSystem.java:1727)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$create$6(S3AFileSystem.java:1646)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.create(S3AFileSystem.java:1645)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1195)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1175)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:347)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:314)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:484)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:422)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:411)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:36)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetUtils$$anon$1.newInstance(ParquetUtils.scala:490)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:161)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:146)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:389)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:842)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:307)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:802)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.util.DiskChecker.checkAccessByFileMethods(DiskChecker.java:160)
	at org.apache.hadoop.util.DiskChecker.checkDirInternal(DiskChecker.java:100)
	at org.apache.hadoop.util.DiskChecker.checkDir(DiskChecker.java:77)
	at org.apache.hadoop.util.BasicDiskValidator.checkStatus(BasicDiskValidator.java:32)
	at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.confChanged(LocalDirAllocator.java:330)
	at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:393)
	at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:165)
	at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:146)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.createTmpFileForWrite(S3AFileSystem.java:1282)
	at org.apache.hadoop.fs.s3a.S3ADataBlocks$DiskBlockFactory.create(S3ADataBlocks.java:816)
	at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.createBlockIfNeeded(S3ABlockOutputStream.java:211)
	at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.<init>(S3ABlockOutputStream.java:188)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerCreateFile(S3AFileSystem.java:1727)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$create$6(S3AFileSystem.java:1646)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.create(S3AFileSystem.java:1645)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1195)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1175)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:347)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:314)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:484)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:422)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:411)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:36)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetUtils$$anon$1.newInstance(ParquetUtils.scala:490)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:161)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:146)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:389)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
