# PyArrow compatibility check

In this section, we will use PyArrow to read Parquet files that were generated by Spark.



In [8]:
import pyarrow.parquet as pq
import s3fs
from pyarrow import fs
import os
import time

In [15]:
def read_parquet_from_s3(endpoint: str, bucket_name, path):
    """Load a (possibly partitioned) parquet data set from S3 as an Arrow table.

    The metadata that the file system reports for the target is printed
    before the data is read.

    Args:
        endpoint: Host of the S3 endpoint; ``https://`` is prepended to it.
        bucket_name: Name of the bucket that holds the data set.
        path: Key of the parquet file or data set directory inside the bucket.

    Returns:
        The table produced by reading the whole ``pq.ParquetDataset``.
    """
    s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': f"https://{endpoint}"})
    dataset_uri = f"{bucket_name}/{path}"

    # Show what the store reports for the target before reading it.
    str_info = s3.info(dataset_uri)
    print(f"input file metadata: {str_info}")

    return pq.ParquetDataset(dataset_uri, filesystem=s3).read()

In [16]:
def get_shape(table):
    """Convert an Arrow table to a pandas DataFrame, print its shape and return it.

    Args:
        table: Object exposing ``to_pandas()``, such as the Arrow table
            returned by ``read_parquet_from_s3``.

    Returns:
        The ``shape`` attribute of the converted DataFrame. Earlier versions
        only printed the shape and returned ``None`` even though the comment
        promised a return value.
    """
    # NOTE(review): the pandas conversion is kept on purpose; it appears to be
    # part of what check_arrow_read_time measures -- confirm before optimising.
    shape = table.to_pandas().shape
    print(f"shape of the data set: {shape}")
    return shape

In [24]:
def write_parquet_as_partitioned_dataset(table, endpoint, bucket_name, path, partition_cols=None):
    """Write an Arrow table to S3 as a parquet data set.

    Choose ``partition_cols`` with care: each distinct value of a partition
    column gets its own directory. Partitioning on a date column with 3000
    distinct values, for instance, yields 3000 directories, each holding the
    parquet data for that single date.

    Args:
        table: Arrow table to write.
        endpoint: Host of the S3 endpoint; ``https://`` is prepended to it.
        bucket_name: Name of the destination bucket.
        path: Key of the data set root inside the bucket.
        partition_cols: Optional list of column names to partition by.
    """
    s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': f"https://{endpoint}"})
    pq.write_to_dataset(
        table,
        root_path=f"{bucket_name}/{path}",
        partition_cols=partition_cols,
        filesystem=s3,
    )

In [21]:
# S3 endpoint host, read from the environment (KeyError if AWS_S3_ENDPOINT is unset).
endpoint = os.environ["AWS_S3_ENDPOINT"]
# Bucket and key of the input parquet data set.
bucket, input_path = "pengfei", "diffusion/data_format/netflix.parquet"

In [19]:
def check_arrow_read_time(endpoint, bucket, path):
    """Time reading a parquet data set from S3 and converting it to pandas.

    The measured span covers both ``read_parquet_from_s3`` and ``get_shape``;
    the elapsed time in seconds is printed.

    Args:
        endpoint: Host of the S3 endpoint.
        bucket: Name of the bucket that holds the data set.
        path: Key of the parquet data set inside the bucket.
    """
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock, which can be adjusted while the measurement is running.
    start = time.perf_counter()
    arrow_table = read_parquet_from_s3(endpoint, bucket, path)
    get_shape(arrow_table)
    elapsed = time.perf_counter() - start
    print(f"Arrow read time spents: {elapsed} s")

check_arrow_read_time(endpoint, bucket, input_path)

input file metadata: {'name': 'pengfei/diffusion/data_format/netflix.parquet', 'size': 0, 'type': 'directory'}
shape of the data set: (24058262, 3)
Arrow read time spents: 7.2179670333862305 s


In [30]:
def check_write_time(table, endpoint, bucket_name, path, partition_cols=None):
    """Time writing an Arrow table to S3 as a parquet data set.

    Delegates the write to ``write_parquet_as_partitioned_dataset`` and prints
    the elapsed time in seconds.

    Args:
        table: Arrow table to write.
        endpoint: Host of the S3 endpoint.
        bucket_name: Name of the destination bucket.
        path: Key of the data set root inside the bucket.
        partition_cols: Optional list of column names to partition by.
    """
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock, which can be adjusted while the measurement is running.
    start = time.perf_counter()
    write_parquet_as_partitioned_dataset(table, endpoint, bucket_name, path, partition_cols)
    elapsed = time.perf_counter() - start
    print(f"Arrow write time spents: {elapsed} s")

# Destination key for the data set written by Arrow, and the column(s) to partition it by.
arrow_output_path = "diffusion/data_format/arrow_netflix.parquet"
partition_cols = ["rating"]

# Load the source table once; it is the payload of the timed write below.
arrow_table = read_parquet_from_s3(endpoint, bucket, input_path)

# Unpartitioned variant, kept for comparison:
# check_write_time(arrow_table, endpoint, bucket, arrow_output_path)

# Partitioned write.
check_write_time(arrow_table, endpoint, bucket, arrow_output_path, partition_cols=partition_cols)

input file metadata: {'name': 'pengfei/diffusion/data_format/netflix.parquet', 'size': 0, 'type': 'directory'}
user_id
rating
date
Arrow write time spents: 19.849036931991577 s


In [None]:
def write_parquet_as_partitioned_dataset(table, endpoint, bucket_name, path, partition_cols=None, compression="SNAPPY"):
    """Write an Arrow table to S3 as a parquet data set with a chosen compression.

    Redefines the earlier helper of the same name in this notebook, adding the
    ``compression`` argument.

    Args:
        table: Arrow table to write.
        endpoint: Host of the S3 endpoint; ``https://`` is prepended to it.
        bucket_name: Name of the destination bucket.
        path: Key of the data set root inside the bucket.
        partition_cols: Optional list of column names to partition by.
        compression: Compression codec (str or dict), given either for the
            whole data set or per column. Valid values are 'NONE', 'SNAPPY',
            'GZIP', 'BROTLI', 'LZ4' and 'ZSTD'. Defaults to 'SNAPPY'; the
            previous default "SNAPPY}" carried a stray brace and was not a
            valid codec name.
    """
    url = f"https://{endpoint}"
    # Named s3 rather than fs so it does not shadow the `fs` imported from pyarrow.
    s3 = s3fs.S3FileSystem(client_kwargs={'endpoint_url': url})
    file_uri = f"{bucket_name}/{path}"
    pq.write_to_dataset(
        table,
        root_path=file_uri,
        partition_cols=partition_cols,
        filesystem=s3,
        compression=compression,
    )
    
