# Spark Arrow compression test 
In this section, we use Spark and Arrow to write parquet files with different compression algorithms.

In [30]:
from pyspark.sql import SparkSession,DataFrame
import os
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import *
import io
import time
from pyspark.sql import Row


In [31]:
local = False
# NOTE(review): the comment that used to sit here said spark.rpc.message.maxSize
# (default 128) is raised to 1024 so that large csv files can be written, but neither
# session below sets that option -- confirm whether a
# .config("spark.rpc.message.maxSize", "1024") call is missing.
if local:
    # Session running on this machine with 4 worker threads.
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("SparkArrowCompression")
        .getOrCreate()
    )
else:
    # Session on the Kubernetes cluster: 4 executors with 8g each. The service account
    # and the namespace are read from environment variables (KeyError if they are unset).
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("SparkArrowCompression")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )

In [28]:
! kubectl get pods

I0923 14:15:29.814286    2508 request.go:655] Throttling request took 1.173250271s, request: GET:https://kubernetes.default/apis/coordination.k8s.io/v1beta1?timeout=32s
NAME                                            READY   STATUS    RESTARTS   AGE
flume-test-agent-df8c5b944-vtjbx                1/1     Running   0          4d
jupyter-324928-7b4cdf67dd-tk99l                 1/1     Running   0          5h28m
kafka-server-0                                  1/1     Running   0          4d1h
kafka-server-1                                  1/1     Running   0          4d
kafka-server-2                                  1/1     Running   0          4d2h
kafka-server-zookeeper-0                        1/1     Running   0          4d
sparkarrowcompression-03c6b67c11df10d9-exec-1   0/1     Error     0          5h18m
sparkarrowcompression-03c6b67c11df10d9-exec-2   0/1     Error     0          5h18m
sparkarrowcompression-03c6b67c11df10d9-exec-3   0/1     Error     0          5h18m
sparkarrowcomp

In [33]:
# Inputs of the Spark benchmark (NY taxis data set, s3a scheme).
csv_input_path = "s3a://pengfei/diffusion/data_format/ny_taxis/csv"
parquet_input_path = "s3a://pengfei/diffusion/data_format/ny_taxis/parquet/raw"

# Output locations: one sub-folder per codec is created under compress_output_path.
compress_output_path = "s3a://pengfei/diffusion/data_format/ny_taxis/parquet/compress"
output_path = "s3a://pengfei/diffusion/data_format/spark_netflix/"

In [34]:
def check_spark_parquet_read_time(path:str)->DataFrame:
    """Load a parquet data set with Spark and print how long that took.

    The measured window covers ``spark.read.parquet`` as well as the ``count()``
    action (and the print) used to report the size of the data frame.

    :param path: location of the parquet data set
    :return: the Spark data frame that was read
    """
    started = time.time()
    loaded = spark.read.parquet(path)
    row_total = loaded.count()
    col_total = len(loaded.columns)
    print(f"data frame has {row_total} rows, {col_total} columns")
    elapsed = time.time() - started
    print(f"Spark read above data frame in parquet format, and spents: {elapsed} s")
    return loaded

# read parquet generated by arrow    
df=check_spark_parquet_read_time(parquet_input_path)

df.printSchema()

# read parquet generated by spark
# check_spark_parquet_read_time("s3a://pengfei/diffusion/data_format/netflix.parquet")

data frame has 170896055 rows, 18 columns
Spark read above data frame in parquet format, and spents: 20.538439750671387 s
root
 |-- vendor_id: string (nullable = true)
 |-- pickup_at: timestamp (nullable = true)
 |-- dropoff_at: timestamp (nullable = true)
 |-- passenger_count: byte (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- rate_code_id: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- total_amount: float (nullable = true)



In [8]:
def check_spark_csv_write_time(df: "DataFrame", path: str):
    """Write ``df`` as csv (single partition, with a header row) and print the write time.

    Only the write itself is timed. The row count printed for information is computed
    after the clock is stopped, because ``df.count()`` triggers a separate Spark job
    that would otherwise be billed to the write.

    :param df: Spark data frame to write
    :param path: destination of the csv output
    :return: None
    """
    t1 = time.time()
    df.coalesce(1).write.option("header", "true").csv(path)
    t2 = time.time()
    print(f"data frame has {df.count()} rows, {len(df.columns)} columns")
    # This is a write benchmark: the message previously said "read".
    print(f"Spark write time spents: {t2 - t1} s")

# check_spark_csv_write_time(df,f"{csv_input_path}/2011_2012")   

def check_spark_csv_read_time(path, header=True):
    """Read a csv data set with Spark and print how long the read plus a row count took.

    :param path: location of the csv data set
    :param header: whether the first line of the files holds the column names.
        Defaults to True because check_spark_csv_write_time writes its output with a
        header row; reading such files without this option turns the header into a
        data row and names the columns _c0, _c1, ...
    :return: the Spark data frame that was read
    """
    t1 = time.time()
    df = spark.read.csv(path, header=header)
    # count() is the action that forces Spark to scan the data, so it is part of the timing.
    print(f"data frame has {df.count()} rows, {len(df.columns)} columns")
    t2 = time.time()
    print(f"Spark read time spents: {t2 - t1} s")
    return df

# df_fire=check_spark_csv_read_time(csv_example)


In [9]:
def check_spark_parquet_write_time(df,path,partition_number,compression_algo):
    """Write ``df`` as parquet with the requested codec and print how long the write took.

    :param df: Spark data frame to write
    :param path: destination of the parquet output
    :param partition_number: number of partitions passed to ``coalesce`` before writing
    :param compression_algo: value given to the ``parquet.compression`` writer option
    :return: None
    """
    started = time.time()
    writer = df.coalesce(partition_number).write
    writer = writer.option("parquet.compression", compression_algo)
    writer.parquet(path)
    elapsed = time.time() - started
    print(f"Spark write parquet with {compression_algo} compression, it spents : {elapsed} s")
 

# 1. Spark compression example

## 1.1 Spark compress with gzip

In [10]:
# Spark write parquet with gzip compression, it spents : 327.7600781917572 s

comp_algo="gzip"
check_spark_parquet_write_time(df,f"{compress_output_path}/2009_{comp_algo}",8,comp_algo)

Spark write parquet with gzip compression, it spents : 327.7600781917572 s


In [11]:
! mc ls --summarize s3/pengfei/diffusion/data_format/ny_taxis/parquet/compress/2009_gzip | grep "Total Size"

Total Size: 3.8 GiB


## 1.2 Spark compress with snappy

In [13]:
#Spark write parquet with snappy compression, it spents : 210.80829095840454 s

comp_algo="snappy"
check_spark_parquet_write_time(df,f"{compress_output_path}/2009_{comp_algo}",8,comp_algo)

Spark write parquet with snappy compression, it spents : 210.80829095840454 s


In [14]:
! mc ls --summarize s3/pengfei/diffusion/data_format/ny_taxis/parquet/compress/2009_snappy | grep "Total Size"

Total Size: 4.5 GiB


## 1.3 Spark Compress with lz4
The lz4 dependencies are missing in this environment, although the documentation says lz4 is supported by default:
https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [None]:
comp_algo="lz4"
check_spark_parquet_write_time(df,f"{compress_output_path}/2009_{comp_algo}",8,comp_algo)

## 1.4 Spark compress with lzo
The lzo dependencies are missing in this environment, although the documentation says lzo is supported by default:
https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [None]:
comp_algo="lzo"
check_spark_parquet_write_time(df,f"{compress_output_path}/2009_{comp_algo}",8,comp_algo)

## 1.5 Spark compress with brotli
The documentation says brotli is not supported by default, so the missing brotli dependencies are expected.

In [None]:
comp_algo="brotli"
check_spark_parquet_write_time(df,f"{compress_output_path}/2009_{comp_algo}",8,comp_algo)

## 1.6 Spark compress with zstd
The documentation says zstd is not supported by default, so the missing zstd dependencies are expected.

In [None]:
# zstd is not supported by default
comp_algo="zstd"
# Same "2009_<codec>" output naming as the other codec runs (this cell used "2019_"),
# so the result lands next to the gzip/snappy outputs and sizes can be compared.
check_spark_parquet_write_time(df,f"{compress_output_path}/2009_{comp_algo}",8,comp_algo)

## Pyarrow writes parquet with compression


In [1]:
import pathlib
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
import os
import time

In [8]:
def read_parquet_from_s3(endpoint: str, bucket_name, path):
    """Read a parquet data set (partitioned parquet files) from s3 into an arrow table.

    The metadata that s3fs reports for the data set location is printed before reading.

    :param endpoint: host of the s3 endpoint, without scheme ("https://" is prepended)
    :param bucket_name: name of the bucket holding the data set
    :param path: path of the data set inside the bucket
    :return: the table produced by ``ParquetDataset.read()``
    """
    s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': f"https://{endpoint}"})
    dataset_uri = f"{bucket_name}/{path}"
    print(f"input file metadata: {s3.info(dataset_uri)}")
    return pq.ParquetDataset(dataset_uri, filesystem=s3, metadata_nthreads=8).read()

def check_arrow_read_time(endpoint, bucket, path):
    """Read a parquet data set from s3 with arrow and print the elapsed time.

    The measured window covers the read itself and the ``get_shape`` call that
    follows it.

    :param endpoint: host of the s3 endpoint
    :param bucket: name of the bucket holding the data set
    :param path: path of the data set inside the bucket
    :return: the arrow table that was read
    """
    started = time.time()
    table = read_parquet_from_s3(endpoint, bucket, path)
    get_shape(table)
    elapsed = time.time() - started
    print(f"Arrow read time spents: {elapsed} s")
    return table
    
def get_shape(table):
    """Convert an arrow table to a pandas dataframe, print its shape and return it.

    The shape used to be printed only, although the description of this function
    promised a return value; existing callers that ignore the result are unaffected.

    :param table: arrow table (any object exposing ``to_pandas()``)
    :return: the ``shape`` attribute of the converted pandas dataframe
    """
    # The whole table is converted to pandas here just to read its shape.
    shape = table.to_pandas().shape
    print(f"shape of the data set: {shape}")
    return shape

In [9]:
endpoint = os.environ['AWS_S3_ENDPOINT']
bucket = "pengfei"
# don't add a trailing "/" at the end of the data set path, it will raise an error
input_path = "diffusion/data_format/Fire_Department.parquet"

In [10]:
arrow_table=check_arrow_read_time(endpoint,bucket, input_path)

input file metadata: {'name': 'pengfei/diffusion/data_format/Fire_Department.parquet', 'size': 0, 'type': 'directory'}
shape of the data set: (5500520, 35)
Arrow read time spents: 37.68216300010681 s


In [39]:
pdf=arrow_table.to_pandas()
print(pdf.head(2))

           _c0      _c1              _c2        _c3         _c4         _c5  \
0  Call Number  Unit ID  Incident Number  Call Type   Call Date  Watch Date   
1    210391607      E19         21017645     Alarms  02/08/2021  02/08/2021   

                      _c6                     _c7                     _c8  \
0           Received DtTm              Entry DtTm           Dispatch DtTm   
1  02/08/2021 01:00:14 PM  02/08/2021 01:01:36 PM  02/08/2021 01:01:40 PM   

                      _c9  ...             _c25              _c26       _c27  \
0           Response DtTm  ...  Call Type Group  Number of Alarms  Unit Type   
1  02/08/2021 01:03:21 PM  ...            Alarm                 1     ENGINE   

                             _c28                      _c29  \
0  Unit sequence in call dispatch  Fire Prevention District   
1                               1                         8   

                  _c30                                  _c31           _c32  \
0  Supervisor Distri

In [17]:
def write_parquet_as_partitioned_dataset(table, endpoint, bucket_name, path, partition_cols=None, compression="SNAPPY"):
    """Write an arrow table to s3 as a parquet data set.

    :param table: arrow table to write
    :param endpoint: host of the s3 endpoint, without scheme ("https://" is prepended)
    :param bucket_name: name of the destination bucket
    :param path: path of the data set inside the bucket
    :param partition_cols: columns to partition the data set by (None by default)
    :param compression: (str or dict) compression codec, either on a general basis or
        per-column. Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}.
        Default is snappy.
    :return: None
    """
    s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': f"https://{endpoint}"})
    pq.write_to_dataset(
        table,
        root_path=f"{bucket_name}/{path}",
        partition_cols=partition_cols,
        filesystem=s3,
        compression=compression,
    )
    
def check_write_time(table, endpoint, bucket_name, path, partition_cols=None, compression="SNAPPY"):
    """Write an arrow table to s3 as a parquet data set and print the elapsed time.

    All arguments are forwarded to ``write_parquet_as_partitioned_dataset``.

    :param table: arrow table to write
    :param endpoint: host of the s3 endpoint
    :param bucket_name: name of the destination bucket
    :param path: path of the data set inside the bucket
    :param partition_cols: columns to partition the data set by (None by default)
    :param compression: compression codec forwarded to the writer, "SNAPPY" by default
    :return: None
    """
    started = time.time()
    write_parquet_as_partitioned_dataset(
        table, endpoint, bucket_name, path,
        partition_cols=partition_cols, compression=compression,
    )
    elapsed = time.time() - started
    print(f"Arrow write time spents: {elapsed} s")

In [18]:
output_path=f"diffusion/data_format/arrow_snappy_fire_department.parquet"
check_write_time(arrow_table,endpoint,bucket,output_path)

Arrow write time spents: 40.83500552177429 s


In [20]:
! mc ls --summarize s3/pengfei/diffusion/data_format/arrow_snappy_fire_department.parquet | grep "Total Size"

Total Size: 619 MiB


In [19]:
output_path="diffusion/data_format/arrow_zstd_fire_department.parquet"
check_write_time(arrow_table,endpoint,bucket,output_path,compression="ZSTD")

Arrow write time spents: 31.99910068511963 s


In [21]:
! mc ls --summarize s3/pengfei/diffusion/data_format/arrow_zstd_fire_department.parquet | grep "Total Size"

Total Size: 393 MiB


In [22]:
output_path="diffusion/data_format/arrow_gzip_fire_department.parquet"
check_write_time(arrow_table,endpoint,bucket,output_path,compression="GZIP")

Arrow write time spents: 152.42934775352478 s


In [23]:
! mc ls --summarize s3/pengfei/diffusion/data_format/arrow_gzip_fire_department.parquet | grep "Total Size"

Total Size: 380 MiB


In [24]:
output_path="diffusion/data_format/arrow_lz4_fire_department.parquet"
check_write_time(arrow_table,endpoint,bucket,output_path,compression="LZ4")

Arrow write time spents: 41.850053787231445 s


In [25]:
! mc ls --summarize s3/pengfei/diffusion/data_format/arrow_lz4_fire_department.parquet | grep "Total Size"

Total Size: 612 MiB


In [26]:
output_path="diffusion/data_format/arrow_brotli_fire_department.parquet"
check_write_time(arrow_table,endpoint,bucket,output_path,compression="BROTLI")

Arrow write time spents: 176.08239126205444 s


In [27]:
! mc ls --summarize s3/pengfei/diffusion/data_format/arrow_brotli_fire_department.parquet | grep "Total Size"

Total Size: 340 MiB


# compression and use dictionary encoding by column
Arrow allows us to specify the compression codec and the dictionary encoding per column.
Spark does not: it can only apply one global compression codec and dictionary encoding setting.

pq.write_table(table, where, compression={'foo': 'snappy', 'bar': 'gzip'}, use_dictionary=['foo', 'bar'])

# Test whether Spark can correctly read a parquet file written with mixed (per-column) compression

Write a mixed compression

In [None]:
# Write one parquet file whose columns use different codecs, so that we can check
# afterwards whether Spark reads it correctly.
# The previous version of this cell could not run: the compression dict had no keys
# (syntax error), it used names that only exist as function parameters elsewhere in
# this notebook (bucket_name, path, table), and pq.write_table takes the destination
# as its second positional argument, not as root_path.
# "_c0" and "_c1" are the first two columns of the Fire Department table held in arrow_table.
# NOTE(review): columns not listed in the dict presumably fall back to pyarrow's
# default handling -- confirm against the pyarrow documentation.
# NOTE(review): the output name below follows the arrow_<codec>_fire_department.parquet
# pattern used by the other cells -- adjust if another location is wanted.
mixed_output_path = "diffusion/data_format/arrow_mixed_fire_department.parquet"
url = f"https://{endpoint}"
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': url})
file_uri = f"{bucket}/{mixed_output_path}"
pq.write_table(arrow_table, file_uri, filesystem=fs, compression={"_c0": "SNAPPY", "_c1": "GZIP"})

In [42]:
path1="s3a://pengfei/diffusion/data_format/arrow_snappy_fire_department.parquet"
path2="s3a://pengfei/diffusion/data_format/Fire_Department.parquet"
df=check_spark_parquet_read_time(path2)

data frame has 5500520 rows, 35 columns
Spark read above data frame in parquet format, and spents: 2.1240062713623047 s


In [43]:
print(df.columns)

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9', '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18', '_c19', '_c20', '_c21', '_c22', '_c23', '_c24', '_c25', '_c26', '_c27', '_c28', '_c29', '_c30', '_c31', '_c32', '_c33', '_c34']
