# Test S3A Connection with MinIO

This notebook tests reading Parquet files from MinIO using Spark with the S3A protocol.

In [1]:
from pyspark.sql import SparkSession

# Set up the Spark session.
# No master URL is set here (the `.master(...)` override is intentionally left
# out), so the session uses whatever master the environment is configured with.
spark = SparkSession.builder.appName("ReadFile").getOrCreate()

# Display the session summary as the cell output.
spark

In [2]:
# Look up which FileSystem class the JVM-side Hadoop configuration has
# registered under "fs.s3a.impl"; the value is shown as the cell output.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.get("fs.s3a.impl")

'org.apache.hadoop.fs.s3a.S3AFileSystem'

In [3]:
# Location of the silver-layer daily stock prices in the `dev` bucket.
silver_path = "s3a://dev/data/silver/vnstock3/daily_stock_prices/"

# Load the Parquet dataset through the session configured above.
df_silver = spark.read.format("parquet").load(silver_path)

# Preview the first 20 rows with long values truncated (the `show()` defaults).
df_silver.show(n=20, truncate=True)

+-------------------+-----+-----+-----+-----+------+------+-------------------+----------+-----------+--------------------+--------------------+
|               time| open| high|  low|close|volume|symbol|   loaded_timestamp|updated_at|price_range|price_percent_change|     processing_time|
+-------------------+-----+-----+-----+-----+------+------+-------------------+----------+-----------+--------------------+--------------------+
|2018-10-24 00:00:00|13.19|13.19|13.19|13.19|     0|   A32|2024-11-01 05:16:42|2024-11-01|        0.0|                 0.0|2024-12-17 14:34:...|
|2018-10-26 00:00:00|13.19|13.19|13.19|13.19|     0|   A32|2024-11-01 05:16:42|2024-11-01|        0.0|                 0.0|2024-12-17 14:34:...|
|2018-10-29 00:00:00|13.19|13.19|13.19|13.19|     0|   A32|2024-11-01 05:16:42|2024-11-01|        0.0|                 0.0|2024-12-17 14:34:...|
|2018-10-30 00:00:00|13.19|13.19|13.19|13.19|     0|   A32|2024-11-01 05:16:42|2024-11-01|        0.0|                 0.0|2024-12

In [1]:
# Spark session with the S3A JAR files and MinIO settings loaded explicitly.
import os

from pyspark.sql import SparkSession

# Stop the session left over from earlier cells, if there is one, so that
# getOrCreate() below builds a new session instead of returning the old one.
# The guard keeps this cell runnable on a fresh kernel, where `spark` is not
# defined yet and a bare `spark.stop()` would raise NameError.
if "spark" in globals():
    spark.stop()

# Credentials can be supplied through the environment instead of being edited
# into the notebook. The fallbacks are the values this cell used before, so an
# environment without these variables behaves exactly as it did.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio123")

spark = (SparkSession.builder
    .appName("ReadFile")
    .config("spark.jars", "/usr/local/spark/jars/hadoop-aws-3.3.4.jar,/usr/local/spark/jars/aws-java-sdk-bundle-1.12.262.jar")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Address buckets as http://minio:9000/<bucket>/... rather than by hostname.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # The endpoint above is plain HTTP, so SSL is switched off.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)

# Verify that the S3A settings reached the Hadoop configuration.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
print("S3A Implementation:", hadoop_conf.get("fs.s3a.impl"))
print("S3A Endpoint:", hadoop_conf.get("fs.s3a.endpoint"))

# Read the silver-layer data and preview it.
silver_path = 's3a://dev/data/silver/vnstock3/daily_stock_prices/'
df_silver = spark.read.parquet(silver_path)
df_silver.show()

S3A Implementation: org.apache.hadoop.fs.s3a.S3AFileSystem
S3A Endpoint: http://minio:9000
+-------------------+-----+-----+-----+-----+------+------+-------------------+----------+-----------+--------------------+--------------------+
|               time| open| high|  low|close|volume|symbol|   loaded_timestamp|updated_at|price_range|price_percent_change|     processing_time|
+-------------------+-----+-----+-----+-----+------+------+-------------------+----------+-----------+--------------------+--------------------+
|2018-10-24 00:00:00|13.19|13.19|13.19|13.19|     0|   A32|2024-11-01 05:16:42|2024-11-01|        0.0|                 0.0|2024-12-17 14:34:...|
|2018-10-26 00:00:00|13.19|13.19|13.19|13.19|     0|   A32|2024-11-01 05:16:42|2024-11-01|        0.0|                 0.0|2024-12-17 14:34:...|
|2018-10-29 00:00:00|13.19|13.19|13.19|13.19|     0|   A32|2024-11-01 05:16:42|2024-11-01|        0.0|                 0.0|2024-12-17 14:34:...|
|2018-10-30 00:00:00|13.19|13.19|13.19|