# Example to Read / Write to Minio / S3 bucket with Spark

NOTE: Read / Write from Minio like you would the local file system. The different is the path uses `s3a://`

In [3]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
# AMAZON S3 CONFIGURATION
s3_url = f"https://s3.amazonaws.com"
s3_key = "aws-access-key" # Access Key ID
s3_secret = "aws-secret-key" # Secret Access Key
s3_bucket = "bucket" # Bucket Name

In [5]:
# Spark init
spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
    .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:3.1.2")\
    .config("spark.hadoop.fs.s3a.endpoint", s3_url) \
    .config("spark.hadoop.fs.s3a.access.key", s3_key) \
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret) \
    .config("spark.hadoop.fs.s3a.fast.upload", True) \
    .config("spark.hadoop.fs.s3a.path.style.access", True) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4f985fe9-cb4c-4974-9627-10f022430b2c;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.1.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.271 in central
:: resolution report :: resolve 227ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.271 from central in [default]
	org.apache.hadoop#hadoop-aws;3.1.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	--------------------------------

In [6]:
# read local data
df = spark.read.option("multiline",True).json("/home/jovyan/datasets/json-samples/stocks.json")
df.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.12,AMZN
2,251.11,FB
3,1725.05,GOOG
4,128.39,IBM
5,212.55,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.8,TSLA
9,45.11,TWTR


In [8]:
# Write to minio as CSV file - MAKE SURE BUCKET EXISTS !
df.write.mode("Overwrite").csv(f"s3a://{s3_bucket}/stocks.csv",header=True)

                                                                                

In [9]:
# read back from minio
df1 = spark.read.csv(f"s3a://{s3_bucket}/stocks.csv",header=True)
df1.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.12,AMZN
2,251.11,FB
3,1725.05,GOOG
4,128.39,IBM
5,212.55,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.8,TSLA
9,45.11,TWTR


In [11]:
df3 = spark.read.csv(f"s3a://{s3_bucket}/customers.csv",header=True)
df3.show()

+------+----------+--------------------+------+---------------+-----------+-----+------------+---------------+---------------+
| First|      Last|               Email|Gender|Last IP Address|       City|State|Total Orders|Total Purchased|Months Customer|
+------+----------+--------------------+------+---------------+-----------+-----+------------+---------------+---------------+
|    Al|    Fresco|  afresco@dayrep.com|     M|  74.111.18.161|   Syracuse|   NY|           1|             45|              1|
|  Abby|      Kuss|     akuss@rhyta.com|     F|  23.80.125.101|    Phoenix|   AZ|           1|             25|              2|
| Arial|     Photo|   aphoto@dayrep.com|     F|     24.0.14.56|     Newark|   NJ|           1|            680|              1|
| Bette|     Alott|    balott@rhyta.com|     F| 56.216.127.219|    Raleigh|   NC|           6|            560|             18|
| Barb |    Barion|bbarion@superrito...|     F|   38.68.15.223|     Dallas|   TX|           4|           1590| 