In [1]:
# Locate the local Spark installation and make `pyspark` importable.
# This has to run before the `pyspark` imports in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [11]:
import pandas as pd

# 1) Test with minio and spark

### 1.1 Configure spark session

In [3]:
import os

# MinIO connection settings. Values are read from the environment so real
# credentials never have to be committed with the notebook; the fallbacks are
# the throw-away defaults of the local docker-compose MinIO setup.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio1:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio123")

# Every `spark.hadoop.<key>` entry is copied into the Hadoop configuration when
# the context starts, so the S3A connector is configured through the public
# SparkConf API instead of the private `spark._jsc` handle.
conf = (
    SparkConf()
    .setAppName("test_app")
    .set("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # MinIO serves buckets as URL paths, not as virtual-host subdomains.
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
)

# Build the session straight from the conf: the app name is applied when the
# context is created, and `getOrCreate()` makes the cell safe to re-run
# (constructing a second SparkContext by hand raises an error).
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

### 1.2 Create test pandas dataframe

In [14]:
# Tiny sample frame: integer column `a`, single-letter label column `b`.
df_local = pd.DataFrame(
    {
        'a': [1, 2, 3, 4, 21],
        'b': ['a', 'v', 'c', 'a', 'v'],
    }
)
df_local

Unnamed: 0,a,b
0,1,a
1,2,v
2,3,c
3,4,a
4,21,v


### 1.3 Create pyspark dataframe and write it to minio s3

In [17]:
# Convert the pandas frame to a Spark DataFrame and store it as CSV in the
# `test` bucket. `repartition(1)` collapses the data into one partition so a
# single part file is produced. `mode("overwrite")` keeps the cell re-runnable:
# the default save mode raises an error once `s3a://test/tmp` already exists.
(
    spark.createDataFrame(df_local)
    .repartition(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("s3a://test/tmp")
)

### 1.4 Read this dataframe from s3 and create temporary view

In [18]:
# Read the CSV part file(s) back from the bucket. `inferSchema=True` makes
# Spark detect column types from the data; without it every column is read as
# a string and the numeric aggregation on `a` would depend on an implicit cast.
df = spark.read.csv("s3a://test/tmp/*.csv", header=True, inferSchema=True)

# Expose the frame to Spark SQL under the name `tmp_table`.
df.createOrReplaceTempView('tmp_table')

### 1.5 Make new spark dataframe with aggregation

In [24]:
# Mean of `a` for each distinct value of `b`, computed with Spark SQL over the
# `tmp_table` temporary view.
agg_query = """
SELECT 
b,
avg(a) as a_mean
FROM tmp_table 
GROUP BY b
"""

df_agg = spark.sql(agg_query)

# Print at most the first five aggregated rows.
df_agg.show(5)

+---+------+
|  b|a_mean|
+---+------+
|  v|  11.5|
|  c|   3.0|
|  a|   2.5|
+---+------+



### 1.6 Write aggregated df back to s3

In [25]:
# Store the aggregated result as a single CSV part file in the `test` bucket.
# `mode("overwrite")` keeps the cell re-runnable: the default save mode raises
# an error once `s3a://test/tmp_agg` already exists.
(
    df_agg
    .repartition(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("s3a://test/tmp_agg")
)

# 2) Access minio S3 with boto3

In [26]:
import os

import boto3

# Connect to the same MinIO instance through its S3-compatible API.
# Endpoint and credentials come from the environment so real secrets are not
# stored in the notebook; the fallbacks are the local development defaults.
s3 = boto3.resource(
    's3',
    endpoint_url=os.environ.get("MINIO_ENDPOINT", "http://minio1:9000"),
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minio"),
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minio123"),
)
bucket = s3.Bucket('test')

# Materialise the listing so the notebook displays every object in the bucket.
list(bucket.objects.all())

[s3.ObjectSummary(bucket_name='test', key='tmp/_SUCCESS'),
 s3.ObjectSummary(bucket_name='test', key='tmp/part-00000-5a7caf20-4fe5-444b-ae0c-bc5d68e01aef-c000.csv'),
 s3.ObjectSummary(bucket_name='test', key='tmp_agg/_SUCCESS'),
 s3.ObjectSummary(bucket_name='test', key='tmp_agg/part-00000-f623e433-f7a8-4eaa-aec7-150257e4ca69-c000.csv')]