# Spark compatibility check
In this section, we use Spark to read Parquet files that were generated by Arrow.

In [1]:
from pyspark.sql import SparkSession,DataFrame
import os
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import *
import io
import time
from pyspark.sql import Row


In [2]:
# Toggle: True -> in-process "local[4]" master, False -> Kubernetes cluster master.
local = False

# NOTE(review): spark.rpc.message.maxSize is not set by either branch below;
# add it to the builder here if large CSV writes need a bigger RPC message size.
if local:
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("SparkReadArrow")
        .getOrCreate()
    )
else:
    # The service account and namespace are read from the environment of the
    # notebook process; a missing variable raises KeyError.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("SparkReadArrow")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )

In [3]:
# Shell out to kubectl to list pods; in the captured output the Spark executors
# appear as sparkreadarrow-*-exec-N.
! kubectl get pods

I0922 07:39:05.655316    8944 request.go:655] Throttling request took 1.173939039s, request: GET:https://kubernetes.default/apis/crd.projectcalico.org/v1?timeout=32s
NAME                                     READY   STATUS    RESTARTS   AGE
flume-test-agent-df8c5b944-vtjbx         1/1     Running   0          2d18h
jupyter-12907-54b4894749-5h87t           1/1     Running   0          23h
jupyter-168161-5cbfcc8f85-5qn9h          1/1     Running   0          28m
kafka-server-0                           1/1     Running   0          2d18h
kafka-server-1                           1/1     Running   0          2d18h
kafka-server-2                           1/1     Running   0          2d19h
kafka-server-zookeeper-0                 1/1     Running   0          2d18h
sparkreadarrow-5491277c0c714586-exec-1   1/1     Running   0          24s
sparkreadarrow-5491277c0c714586-exec-2   1/1     Running   0          24s
sparkreadarrow-5491277c0c714586-exec-3   1/1     Running   0          24s
sparkreada

In [7]:
# Parquet data set (NY taxi trips) that the read benchmark below loads with Spark.
parquet_input_path = "s3a://pengfei/diffusion/data_format/ny_taxis/parquet/raw_2011_2012"

# Destination passed to the Parquet write benchmark.
output_path="s3a://pengfei/diffusion/data_format/spark_netflix/"
# Base location under which the CSV export of the taxi data is written.
csv_input_path="s3a://pengfei/diffusion/data_format/ny_taxis/csv"

In [8]:
def check_spark_parquet_read_time(path:str)->DataFrame:
    """Load a Parquet data set with the notebook's Spark session and report the time taken.

    The measured interval covers the ``spark.read.parquet`` call, the row
    count, and the printing of the shape line.

    Args:
        path: Location of the Parquet data set to read.

    Returns:
        The data frame produced by ``spark.read.parquet(path)``.
    """
    started = time.time()
    loaded = spark.read.parquet(path)
    n_rows = loaded.count()
    n_cols = len(loaded.columns)
    print(f"data frame has {n_rows} rows, {n_cols} columns")
    elapsed = time.time() - started
    print(f"Spark read above data frame in parquet format, and spents: {elapsed} s")
    return loaded

# Read the Arrow-generated Parquet data set and report rows, columns and elapsed time.
df=check_spark_parquet_read_time(parquet_input_path)

# Print the schema Spark derived from the Parquet files.
df.printSchema()

# Same check against a Parquet data set generated by Spark (kept disabled):
# check_spark_parquet_read_time("s3a://pengfei/diffusion/data_format/netflix.parquet")

data frame has 355441523 rows, 18 columns
Spark read above data frame in parquet format, and spents: 15.679622888565063 s
root
 |-- vendor_id: string (nullable = true)
 |-- pickup_at: timestamp (nullable = true)
 |-- dropoff_at: timestamp (nullable = true)
 |-- passenger_count: byte (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- rate_code_id: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- total_amount: float (nullable = true)



In [None]:
def check_spark_csv_write_time(df:DataFrame,path:str):
    """Write ``df`` to ``path`` as CSV with a header row and print the write time.

    The data frame is coalesced to a single partition before writing. Only
    the write itself is timed: the row count printed afterwards runs a
    separate job on ``df`` and is deliberately kept outside the measured
    interval so it does not inflate the reported write time.

    Args:
        df: Data frame to export.
        path: Destination of the CSV output.
    """
    t1=time.time()
    df.coalesce(1).write.option("header","true").csv(path)
    # Stop the clock right after the write action returns.
    t2=time.time()
    print(f"data frame has {df.count()} rows, {len(df.columns)} columns")
    print(f"Spark write time spents: {t2 - t1} s")

# Export the data frame held in `df` as CSV under <csv_input_path>/2011_2012.
check_spark_csv_write_time(df,f"{csv_input_path}/2011_2012")   

In [22]:
def check_spark_write_time(df,path):
    """Write ``df`` to ``path`` in Parquet format and print the elapsed wall-clock time.

    Args:
        df: Data frame to write; its ``write.parquet`` is called with ``path``.
        path: Destination of the Parquet output.
    """
    started = time.time()
    df.write.parquet(path)
    elapsed = time.time() - started
    print(f"Spark write time spents: {elapsed} s")
    
# Reload the taxi Parquet data set and time writing it back out as Parquet to output_path.
# NOTE(review): no save mode is set on the writer, so re-running this cell against an
# existing output_path presumably fails — confirm and clean the target if needed.
df=spark.read.parquet(parquet_input_path)
check_spark_write_time(df,output_path)

Spark write time spents: 26.110023498535156 s


In [26]:
# Print a tabular preview of the rows in `df`.
df.show()

+---------+-------------------+-------------------+---------------+-------------+----------------+---------------+------------+------------------+-----------------+----------------+------------+-----------+-----+-------+----------+------------+------------+
|vendor_id|          pickup_at|         dropoff_at|passenger_count|trip_distance|pickup_longitude|pickup_latitude|rate_code_id|store_and_fwd_flag|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|total_amount|
+---------+-------------------+-------------------+---------------+-------------+----------------+---------------+------------+------------------+-----------------+----------------+------------+-----------+-----+-------+----------+------------+------------+
|      VTS|2010-01-26 07:41:00|2010-01-26 07:45:00|              1|         0.75|       -73.95678|       40.76775|           1|              null|        -73.96596|       40.765232|         CAS|        4.5|  0.0|    0.5|      

root
 |-- single: long (nullable = true)
 |-- double: long (nullable = true)
 |-- key: integer (nullable = true)

+------+------+---+
|single|double|key|
+------+------+---+
|     9|  null|  2|
|    10|  null|  2|
|     4|    16|  1|
|     5|    25|  1|
|     6|  null|  2|
|     7|  null|  2|
|     8|  null|  2|
|     1|     1|  1|
|     2|     4|  1|
|     3|     9|  1|
+------+------+---+



In [None]:
def check_spark_read_csv_time(path):
    """Load a CSV data set with the notebook's Spark session and report the time taken.

    The measured interval covers the ``spark.read.csv`` call, the row count,
    and the printing of the shape line. No reader options (such as a header
    flag) are set.

    Args:
        path: Location of the CSV data to read.

    Returns:
        The data frame produced by ``spark.read.csv(path)``.
    """
    started = time.time()
    loaded = spark.read.csv(path)
    n_rows = loaded.count()
    n_cols = len(loaded.columns)
    print(f"data frame has {n_rows} rows, {n_cols} columns")
    elapsed = time.time() - started
    print(f"Spark read time spents: {elapsed} s")
    return loaded

# NOTE(review): `csv_example` is not assigned in any cell visible in this notebook;
# it must hold the CSV input location before this cell can run — confirm the intended path.
df_fire=check_spark_read_csv_time(csv_example)


In [None]:
# Output locations for the Parquet write variants being compared.
# Variant 1 (NOTE(review): its description was left blank — confirm what it represents).
fire_output_path1="s3a://pengfei/diffusion/data_format/Fire_Department1.parquet"
# Variant 2: gzip compression instead of snappy.
fire_output_path2="s3a://pengfei/diffusion/data_format/Fire_Department2.parquet"
# Variant 3: dictionary page size changed to 512 KB.
fire_output_path3="s3a://pengfei/diffusion/data_format/Fire_Department3.parquet"
# Variant 0: written with the default 8 partitions, so it consists of 8 Parquet files.
fire_output_path0="s3a://pengfei/diffusion/data_format/Fire_Department.parquet"

# NOTE(review): `fire_output_path` (no suffix) used in the disabled lines below is not defined above.
# check_spark_write_time(df_fire,fire_output_path)

# Attempt to control the output through parquet.block.size (256 * 1024 * 1024 bytes here).
# This does not change the number of Parquet files, because that is determined by the
# number of partitions of the data frame.
# df_fire.write.option("parquet.block.size",256 * 1024 * 1024).parquet(fire_output_path)


## Parquet block size

When writing a Parquet file to disk, we have to consider how the file is stored physically.
- In a block storage file system, if a Parquet partition file is bigger than the storage block size, it is split across several blocks. This means the columns of a row group can end up in different blocks or, even worse, a single column can be split across blocks. It is therefore recommended that the Parquet block size match the block size of the underlying block storage. The option **parquet.block.size** sets this size.
- In an object storage file system, a Parquet partition file is not split, because object storage stores the data as one single object. As a result, **parquet.block.size** is meaningless in this situation.

In [None]:

# Write the fire-department data as a single partition with explicit Parquet writer settings.
# parquet.block.size only has an effect on block storage such as HDFS. On object storage
# such as S3 or MinIO the file is stored as one single object, so the setting is meaningless there.
(
    df_fire.coalesce(1)
    .write
    .option("parquet.block.size", 256 * 1024 * 1024)     # 256 MiB
    .option("parquet.page.size", 3 * 1024 * 1024)        # 3 MiB
    .option("parquet.dictionary.page.size", 512 * 1024)  # 512 KiB
    .option("parquet.enable.dictionary", "true")
    .option("parquet.compression", "gzip")
    .parquet(fire_output_path0)
)


In [None]:
# Time reading each Parquet write variant back, one call per output path, so the
# printed timings can be compared.
df0=check_spark_parquet_read_time(fire_output_path0)
df1=check_spark_parquet_read_time(fire_output_path1)
df2=check_spark_parquet_read_time(fire_output_path2)
df3=check_spark_parquet_read_time(fire_output_path3)

# Parquet configuration

- parquet.block.size: It defines the default size of a Parquet partition file. If you use HDFS to store these files, the Parquet block size should be no larger than the HDFS block size, so that each Parquet block can be read from a single HDFS block (and therefore from a single datanode). It is common to set them to the same value, and indeed both default to 128 MB. Note that the default size does not mean that all Parquet partition files will have exactly the same size; their sizes will only be close to this value. For example, if the default size is 128 MB, one file may be 127.66 MB and another 126.98 MB.

- parquet.page.size: It defines the default size of a page in a column. A page is the smallest unit of storage in a Parquet file, so retrieving an arbitrary row requires that the page containing the row be decompressed and decoded. Thus, for single-row lookups, it is more efficient to have smaller pages, so there are fewer values to read through before reaching the target value.

- parquet.dictionary.page.size: The maximum allowed size, in bytes, of a dictionary before falling back to plain encoding for a page.

- parquet.enable.dictionary: Enable dictionary encoding or not.

- parquet.compression: Chooses the compression type.



# Some tips:
1. Dictionary encoding:
Smaller files mean there will be less I/O involved. Dictionary encoding improves both storage size and access time. For one column chunk there will be a single dictionary. Most types are encoded using dictionary encoding by default; however, plain encoding will be used as a fallback if the dictionary becomes too large. The threshold size at which this happens is referred to as the dictionary page size and is the same as the page size by default. Please refer to the Parquet configuration section for more information. One can validate whether a file is dictionary encoded by using parquet-tools.
In order to perform better we need to decrease the row group size and increase the dictionary page size.
2. Page Compression:
Below are the default compression schemes used with the Parquet format:
Spark uses snappy as default.
Impala uses snappy as default.
Hive uses deflate codec as default.
Using snappy compression will reduce the size of the page and improve read time.
Using Parquet format allows for better compression, as data is more homogeneous. The space savings are very noticeable at the scale of a Hadoop cluster. I/O will be reduced as we can efficiently scan only a subset of the columns while reading the data. Better compression also reduces the bandwidth required to read the input. As we store data of the same type in each column, we can use encoding better suited to the modern processors’ pipeline by making instruction branching more predictable. Parquet format is mainly used for WRITE ONCE READ MANY applications.
I hope this blog helped you in understanding the parquet format and internal functionality. Happy Learning!!!


# Spark parquet config set to false by default
spark.sql.parquet.mergeSchema
spark.sql.parquet.respectSummaryFiles
spark.sql.parquet.binaryAsString
spark.sql.parquet.int96TimestampConversion
spark.sql.parquet.int64AsTimestampMillis
spark.sql.parquet.writeLegacyFormat
spark.sql.parquet.recordLevelFilter.enabled

# Spark parquet config set to true by default

spark.sql.parquet.int96AsTimestamp
spark.sql.parquet.filterPushdown
spark.sql.parquet.filterPushdown.date
spark.sql.parquet.filterPushdown.timestamp
spark.sql.parquet.filterPushdown.decimal
spark.sql.parquet.filterPushdown.string.startsWith
spark.sql.parquet.enableVectorizedReader

# These properties require a value; they are listed here with their defaults

spark.sql.parquet.outputTimestampType = INT96
spark.sql.parquet.compression.codec = snappy
spark.sql.parquet.pushdown.inFilterThreshold = 10
spark.sql.parquet.output.committer.class = org.apache.parquet.hadoop.ParquetOutputCommitter
spark.sql.parquet.columnarReaderBatchSize = 4096

Regarding parquet.enable.dictionary: it is not supported by Spark yet, but it can be set in sqlContext as follows:

sqlContext.setConf("parquet.enable.dictionary", "false")
The default value of this property is true in Parquet. Therefore, it should be true when Parquet code is called from Spark.