In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType

# Spark configuration for reaching Amazon S3 through the s3a file system.
conf = SparkConf().setAll([
    # dependencies: the hadoop-aws module plus the AWS Java SDK
    ("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.7,com.amazonaws:aws-java-sdk:1.7.4"),
    # implementation class backing the s3a file system
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    # V4 support: the same JVM flag is passed to both executors and the driver
    ("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true"),
    ("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true"),
    # endpoint the s3a connector is pointed at
    ("spark.hadoop.fs.s3a.endpoint", "s3-eu-central-1.amazonaws.com"),
])
# remaining configuration
conf.setAppName("Working with Amazon S3 on Apache Spark")

# Obtain a session built from the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# 101 rows: each integer 0..100 paired with its complement to 100.
data = [[value, 100 - value] for value in range(101)]

# Both columns are integer-typed.
schema = StructType(
    [StructField(column, IntegerType()) for column in ("value", "completion")]
)

# Location used for both the write and the read-back.
S3_PATH = "s3a://spark-with-s3-frankfurt/data/"

# Write the frame out as CSV in overwrite mode.
df_w = spark.createDataFrame(data, schema=schema)
df_w.write.mode("overwrite").csv(S3_PATH)

# Read the CSV back, applying the same schema.
df_r = spark.read.schema(schema).csv(S3_PATH)

# Round-trip check: compare the collected rows after sorting both sides by "value".
are_same = (
    df_w.orderBy("value").collect()
    == df_r.orderBy("value").collect()
)

In [2]:
# Show whether the rows read back from S3 equal the rows that were written.
are_same

True