# Spark Arrow compression test 
In this section, we use Spark and Arrow to write parquet files with different compression algorithms.

In [1]:
from pyspark.sql import SparkSession,DataFrame
import os
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import *
import io
import time
from pyspark.sql import Row


In [2]:
local = False
# NOTE: spark.rpc.message.maxSize matters when writing large csv files. Its default
# value is 128 and the intent was to raise it to 1024, but neither builder below
# passes a .config(...) entry for it.
if local:
    # Local mode: run Spark inside this process with 4 worker threads.
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("SparkArrowCompression")
        .getOrCreate()
    )
else:
    # Cluster mode: ask the in-cluster Kubernetes API for 4 executors of 8g each.
    # The service account and the namespace are read from the environment.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("SparkArrowCompression")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
        .config(
            "spark.kubernetes.authenticate.driver.serviceAccountName",
            os.environ['KUBERNETES_SERVICE_ACCOUNT'],
        )
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )

In [4]:
# List the pods of the current namespace (used here to check on the Spark executor pods).
! kubectl get pods

I0923 08:58:48.699103     665 request.go:655] Throttling request took 1.172025013s, request: GET:https://kubernetes.default/apis/argoproj.io/v1alpha1?timeout=32s
NAME                                            READY   STATUS    RESTARTS   AGE
flume-test-agent-df8c5b944-vtjbx                1/1     Running   0          3d19h
jupyter-324928-7b4cdf67dd-tk99l                 1/1     Running   0          11m
kafka-server-0                                  1/1     Running   0          3d20h
kafka-server-1                                  1/1     Running   0          3d19h
kafka-server-2                                  1/1     Running   0          3d20h
kafka-server-zookeeper-0                        1/1     Running   0          3d19h
sparkarrowcompression-03c6b67c11df10d9-exec-1   1/1     Running   0          2m6s
sparkarrowcompression-03c6b67c11df10d9-exec-2   1/1     Running   0          2m6s
sparkarrowcompression-03c6b67c11df10d9-exec-3   1/1     Running   0          2m5s
sparkarrowcompr

In [6]:
# --- input locations -------------------------------------------------------
parquet_input_path = "s3a://pengfei/diffusion/data_format/ny_taxis/parquet/raw"
csv_input_path = "s3a://pengfei/diffusion/data_format/ny_taxis/csv"

# --- output locations ------------------------------------------------------
# One sub-directory per compression codec is written under compress_output_path.
compress_output_path = "s3a://pengfei/diffusion/data_format/ny_taxis/parquet/compress"
output_path = "s3a://pengfei/diffusion/data_format/spark_netflix/"

In [7]:
def check_spark_parquet_read_time(path:str)->DataFrame:
    """Read a parquet dataset with Spark and print how long it took.

    The timed region covers both ``spark.read.parquet`` and the ``count()``
    used to report the number of rows.

    Args:
        path: location of the parquet dataset to read.

    Returns:
        The data frame returned by ``spark.read.parquet(path)``.
    """
    started = time.time()
    loaded = spark.read.parquet(path)
    row_total = loaded.count()
    column_total = len(loaded.columns)
    print(f"data frame has {row_total} rows, {column_total} columns")
    elapsed = time.time() - started
    print(f"Spark read above data frame in parquet format, and spents: {elapsed} s")
    return loaded

# Read the raw parquet dataset (generated by arrow) and time the read.
# The resulting `df` is the input of the write benchmarks further down.
df=check_spark_parquet_read_time(parquet_input_path)

# Print the column names and types of the data frame that was just read.
df.printSchema()

# read parquet generated by spark
# check_spark_parquet_read_time("s3a://pengfei/diffusion/data_format/netflix.parquet")

data frame has 170896055 rows, 18 columns
Spark read above data frame in parquet format, and spents: 17.811139345169067 s
root
 |-- vendor_id: string (nullable = true)
 |-- pickup_at: timestamp (nullable = true)
 |-- dropoff_at: timestamp (nullable = true)
 |-- passenger_count: byte (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- rate_code_id: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- total_amount: float (nullable = true)



In [8]:
def check_spark_csv_write_time(df:"DataFrame",path:str):
    """Write ``df`` as a single csv file (with header) and print the write time.

    Only the write itself is timed. The row count printed afterwards is a
    separate Spark job, so it is run after the clock is stopped; otherwise it
    would be counted as write time.

    Args:
        df: the Spark data frame to write.
        path: destination directory of the csv output.
    """
    t1=time.time()
    # coalesce(1) => one partition, hence one csv part file
    df.coalesce(1).write.option("header","true").csv(path)
    t2=time.time()
    print(f"data frame has {df.count()} rows, {len(df.columns)} columns")
    print(f"Spark write time spents: {t2 - t1} s")

# check_spark_csv_write_time(df,f"{csv_input_path}/2011_2012")   

def check_spark_csv_read_time(path, header=True):
    """Read a csv dataset with Spark and print how long it took.

    The timed region covers both ``spark.read.csv`` and the ``count()`` used
    to report the number of rows.

    Args:
        path: location of the csv file(s) to read.
        header: whether the first line of each file holds the column names.
            Defaults to True because check_spark_csv_write_time writes its
            files with a header line; without it the header is read as a data
            row and the columns get generated names. Pass False for
            header-less files.

    Returns:
        The data frame returned by ``spark.read.csv``.
    """
    t1=time.time()
    df=spark.read.csv(path, header=header)
    print(f"data frame has {df.count()} rows, {len(df.columns)} columns")
    t2=time.time()
    print(f"Spark read time spents: {t2 - t1} s")
    return df

# df_fire=check_spark_csv_read_time(csv_example)


In [9]:
def check_spark_parquet_write_time(df,path,partition_number,compression_algo):
    """Write ``df`` as parquet and print the elapsed wall-clock time.

    The frame is coalesced to ``partition_number`` partitions before writing,
    and ``compression_algo`` is handed to the writer through the
    "parquet.compression" option.

    Args:
        df: the Spark data frame to write.
        path: destination of the parquet output.
        partition_number: number of partitions to coalesce to.
        compression_algo: name of the compression codec, e.g. "gzip".
    """
    started = time.time()
    writer = df.coalesce(partition_number).write
    writer = writer.option("parquet.compression", compression_algo)
    writer.parquet(path)
    elapsed = time.time() - started
    print(f"Spark write parquet with {compression_algo} compression, it spents : {elapsed} s")
 

# 1. Spark compression example

## 1.1 Spark compress with gzip

In [10]:
# Result recorded from a previous run:
# Spark write parquet with gzip compression, it spents : 327.7600781917572 s

comp_algo = "gzip"
check_spark_parquet_write_time(
    df=df,
    path=f"{compress_output_path}/2009_{comp_algo}",
    partition_number=8,
    compression_algo=comp_algo,
)

Spark write parquet with gzip compression, it spents : 327.7600781917572 s


In [11]:
# Show only the "Total Size" summary line for the gzip output directory.
! mc ls --summarize s3/pengfei/diffusion/data_format/ny_taxis/parquet/compress/2009_gzip | grep "Total Size"

Total Size: 3.8 GiB


## 1.2 Spark compress with snappy

In [13]:
# Result recorded from a previous run:
# Spark write parquet with snappy compression, it spents : 210.80829095840454 s

comp_algo = "snappy"
check_spark_parquet_write_time(
    df=df,
    path=f"{compress_output_path}/2009_{comp_algo}",
    partition_number=8,
    compression_algo=comp_algo,
)

Spark write parquet with snappy compression, it spents : 210.80829095840454 s


In [14]:
# Show only the "Total Size" summary line for the snappy output directory.
! mc ls --summarize s3/pengfei/diffusion/data_format/ny_taxis/parquet/compress/2009_snappy | grep "Total Size"

Total Size: 4.5 GiB


## 1.3 Spark Compress with lz4
Spark reports missing lz4 dependencies here, although the documentation says lz4 is supported by default:
https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [None]:
comp_algo = "lz4"
check_spark_parquet_write_time(
    df=df,
    path=f"{compress_output_path}/2009_{comp_algo}",
    partition_number=8,
    compression_algo=comp_algo,
)

## 1.4 Spark compress with lzo
Spark reports missing lzo dependencies here, although the documentation says lzo is supported by default:
https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [None]:
comp_algo = "lzo"
check_spark_parquet_write_time(
    df=df,
    path=f"{compress_output_path}/2009_{comp_algo}",
    partition_number=8,
    compression_algo=comp_algo,
)

## 1.5 Spark compress with brotli
The documentation says brotli is not supported by default, so an error about missing brotli dependencies is expected.

In [None]:
comp_algo = "brotli"
check_spark_parquet_write_time(
    df=df,
    path=f"{compress_output_path}/2009_{comp_algo}",
    partition_number=8,
    compression_algo=comp_algo,
)

## 1.6 Spark compress with zstd
The documentation says zstd is not supported by default, so an error about missing zstd dependencies is expected.

In [None]:
# zstd is not supported by default
comp_algo="zstd"
# Write next to the other codecs' outputs: every other benchmark cell uses the
# "2009_<codec>" directory name, this one used "2019_" by mistake.
check_spark_parquet_write_time(df,f"{compress_output_path}/2009_{comp_algo}",8,comp_algo)

## Pyarrow writes parquet with compression


In [29]:
import pathlib
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
import os
import time

In [30]:
# Read a parquet dataset (a directory of partitioned parquet files) from s3
# and return it as an arrow table.
def read_parquet_from_s3(endpoint: str, bucket_name, path):
    """Load the parquet dataset stored at ``bucket_name/path`` into an arrow table.

    Args:
        endpoint: host of the s3 service; "https://" is prepended to it.
        bucket_name: name of the bucket holding the dataset.
        path: key of the dataset inside the bucket.

    Returns:
        The table produced by ``pq.ParquetDataset(...).read()``.
    """
    s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': f"https://{endpoint}"})
    dataset_uri = f"{bucket_name}/{path}"
    # Print the object metadata first, to show what the path points at.
    print(f"input file metadata: {s3.info(dataset_uri)}")
    return pq.ParquetDataset(dataset_uri, filesystem=s3).read()

# check read time
def check_arrow_read_time(endpoint, bucket, path):
    """Read a parquet dataset from s3 with arrow and print how long it took.

    Args:
        endpoint: host of the s3 service, forwarded to read_parquet_from_s3.
        bucket: name of the bucket holding the dataset.
        path: key of the dataset inside the bucket.
    """
    t1 = time.time()
    arrow_table=read_parquet_from_s3(endpoint, bucket, path)
    # Report the dimensions straight from the arrow table: the `get_shape` helper
    # that was called here is not defined anywhere in this notebook, so the cell
    # would raise NameError on a fresh kernel.
    print(f"arrow table has {arrow_table.num_rows} rows, {arrow_table.num_columns} columns")
    t2 = time.time()
    print(f"Arrow read time spents: {t2 - t1} s")

In [31]:
# Host of the s3 service, taken from the environment (read_parquet_from_s3 prepends "https://").
endpoint = os.environ['AWS_S3_ENDPOINT']
# Bucket and key of the raw parquet dataset; they are joined as "<bucket>/<path>".
bucket = "pengfei"
input_path = "diffusion/data_format/ny_taxis/parquet/raw/"

In [32]:
# Time the arrow read of the raw parquet dataset configured above.
check_arrow_read_time(endpoint,bucket, input_path)

input file metadata: {'name': 'pengfei/diffusion/data_format/ny_taxis/parquet/raw', 'size': 0, 'type': 'directory'}


OSError: [Errno 22] Bad Request