# Spark compatibility check
In this section, we use spark to read parquet file that are generated by arrow

In [10]:
from pyspark.sql import SparkSession
import os
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import *
import io
import time
from pyspark.sql import Row


In [4]:
spark = SparkSession \
    .builder.master("k8s://https://kubernetes.default.svc:443") \
    .appName("SparkReadArrow") \
    .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master") \
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT']) \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memory","8g") \
    .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE']) \
    .getOrCreate()

In [23]:
! kubectl get pods

I0915 14:22:37.084522    2169 request.go:655] Throttling request took 1.168941502s, request: GET:https://kubernetes.default/apis/rbac.authorization.k8s.io/v1beta1?timeout=32s
NAME                                                        READY   STATUS      RESTARTS   AGE
argo-workflows-315671-server-7d5cf97d7-rft67                1/1     Running     0          2d7h
argo-workflows-315671-workflow-controller-f9545ff99-nmzlj   1/1     Running     0          2d7h
flume-test-agent-df8c5b944-j47lw                            1/1     Running     0          3d17h
jupyter-107079-85887dc86c-4h46d                             1/1     Running     0          6h13m
kafka-server-0                                              1/1     Running     0          3d19h
kafka-server-1                                              1/1     Running     0          3d16h
kafka-server-2                                              1/1     Running     0          4d17h
kafka-server-zookeeper-0                             

In [21]:
parquet_input_path = "s3a://pengfei/diffusion/data_format/arrow_netflix/"
output_path="s3a://pengfei/diffusion/data_format/spark_netflix/"

In [25]:
def check_spark_read_time(path):
    t1=time.time()
    df=spark.read.parquet(path)
    print(f"data frame has {df.count()} rows, {len(df.columns)} columns")
    t2=time.time()
    print(f"Spark read time spents: {t2 - t1} s")
# read parquet generated by arrow    
check_spark_read_time(parquet_input_path)

# read parquet generated by spark
check_spark_read_time("s3a://pengfei/diffusion/data_format/netflix.parquet")

data frame has 24058262 rows, 3 columns
Spark read time spents: 0.8141007423400879 s
data frame has 24058262 rows, 3 columns
Spark read time spents: 0.8090753555297852 s


In [22]:
def check_spark_write_time(df,path):
    t1=time.time()
    df.write.parquet(path)
    t2=time.time()
    print(f"Spark write time spents: {t2 - t1} s")
    
df=spark.read.parquet(parquet_input_path)
check_spark_write_time(df,output_path)

Spark write time spents: 26.110023498535156 s


In [16]:
# merge schema example
# In this example, we create two data frame, 
# df1, we have two columns: single, double.
# df2, we have two columns: single, triple
sc = spark.sparkContext
schema_output_path1= "s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=1"
schema_output_path2= "s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=2"
df1 = spark.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2)))
df1.show()
df2 = spark.createDataFrame(sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i ** 3)))
df2.show()
# then we write the two data frame in a partition folder, here we put key=1, key=2.
df1.write.parquet(schema_output_path1)
df2.write.parquet(schema_output_path2)

+------+------+
|single|double|
+------+------+
|     1|     1|
|     2|     4|
|     3|     9|
|     4|    16|
|     5|    25|
+------+------+

+------+------+
|single|triple|
+------+------+
|     6|   216|
|     7|   343|
|     8|   512|
|     9|   729|
|    10|  1000|
+------+------+



AnalysisException: path s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=1 already exists.

In [17]:
# if we read the parent folder that contains the partitioned folder. The partition key become a column name, we call it partitioned column
parent_path="s3a://pengfei/diffusion/data_format/merge_schema/test_table"
# as the data frame in each partition folder has different schema, we need to set mergeSchema to true. Otherwise it will only use the schema
# of the first parquet file which it reads. 
# set the below value to false to check the output data frame.
mergedDF = spark.read.option("mergeSchema", "true").parquet(parent_path)
mergedDF.printSchema()

mergedDF.show()

root
 |-- single: long (nullable = true)
 |-- double: long (nullable = true)
 |-- key: integer (nullable = true)

+------+------+---+
|single|double|key|
+------+------+---+
|     9|  null|  2|
|    10|  null|  2|
|     4|    16|  1|
|     5|    25|  1|
|     6|  null|  2|
|     7|  null|  2|
|     8|  null|  2|
|     1|     1|  1|
|     2|     4|  1|
|     3|     9|  1|
+------+------+---+

