# Spark parquet basic operations

In [1]:
from pyspark.sql import SparkSession,DataFrame
import os
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import *
import io
import time
from pyspark.sql import Row

In [3]:
local=False
if local:
    spark = SparkSession \
    .builder.master("local[4]") \
    .appName("SparkParquetBasics") \
    .getOrCreate()
else: 
    spark = SparkSession \
    .builder.master("k8s://https://kubernetes.default.svc:443") \
    .appName("SparkParquetBasics") \
    .config("spark.kubernetes.container.image", os.environ["IMAGE_NAME"]) \
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT']) \
    .config("spark.executor.instances", "16") \
    .config("spark.executor.memory","8g") \
    .config("spark.kubernetes.driver.pod.name", os.environ['KUBERNETES_POD_NAME'])\
    .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE']) \
    .getOrCreate()
    
# .config("spark.hadoop.fs.s3a.committer.name", "directory") \
    # .config("spark.sql.sources.commitProtocolClass", "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol") \
    # .config("spark.sql.parquet.output.committer.class", "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter") \

2021-12-07 16:22:04,459 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# 1. Merge schema

In [3]:

# merge schema example
# In this example, we create two data frame, 
# df1, we have two columns: single, double.
# df2, we have two columns: single, triple
sc = spark.sparkContext
schema_output_path1= "s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=1"
schema_output_path2= "s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=2"
df1 = spark.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2)))
df1.show()
df2 = spark.createDataFrame(sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i ** 3)))
df2.show()
# then we write the two data frame in a partition folder, here we put key=1, key=2.
df1.write.parquet(schema_output_path1)
df2.write.parquet(schema_output_path2)

                                                                                

+------+------+
|single|double|
+------+------+
|     1|     1|
|     2|     4|
|     3|     9|
|     4|    16|
|     5|    25|
+------+------+



                                                                                

+------+------+
|single|triple|
+------+------+
|     6|   216|
|     7|   343|
|     8|   512|
|     9|   729|
|    10|  1000|
+------+------+



AnalysisException: path s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=1 already exists.

In [None]:
# if we read the parent folder that contains the partitioned folder. The partition key become a column name, we call it partitioned column
parent_path="s3a://pengfei/diffusion/data_format/merge_schema/test_table"
# as the data frame in each partition folder has different schema, we need to set mergeSchema to true. Otherwise it will only use the schema
# of the first parquet file which it reads. 
# set the below value to false to check the output data frame.
mergedDF = spark.read.option("mergeSchema", "true").parquet(parent_path)
mergedDF.printSchema()

mergedDF.show()

# 2. Read Write partitioned parquet file

# 2.1 Read partitioned file 

In [4]:
parquet_partitioned_by_R_path= "s3a://pengfei/diffusion/data_format/sf_fire/parquet/raw"
df_R=spark.read.parquet(parquet_partitioned_by_R_path)


                                                                                

In [5]:
df_R.show(2,truncate=False)

2021-12-07 16:23:52,355 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+----------+------+--------------+--------+----------+----------+----------------------+----------------------+----------------------+----------------------+----------------------+-------------+------------+--------------------+----------------------+---------------------------+-------------+-----------------+---------+-----------+----+----------------+--------+-------------+-------+-------------+--------------+--------+--------------------------+----------------------+------------------+--------------------+-------------+---------------------------------------------+
|CallNumber|UnitID|IncidentNumber|CallType|CallDate  |WatchDate |ReceivedDtTm          |EntryDtTm             |DispatchDtTm          |ResponseDtTm          |OnSceneDtTm           |TransportDtTm|HospitalDtTm|CallFinalDisposition|AvailableDtTm         |Address                    |City         |ZipcodeofIncident|Battalion|StationArea|Box |OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumberofAlarms|UnitType|U

                                                                                

In [6]:
df_R.count()

                                                                                

5500519

# 2.2 Write Partitioned file

In [None]:
spark_partitioned_path= "s3a://pengfei/diffusion/data_format/sf_fire/parquet/spark_partition"

df_R.write.partitionBy("CallType","UnitID").mode("overwrite").save(spark_partitioned_path)

In [8]:
! ls

SparkParquetBasics.ipynb  SparkReadLargeDataSet.ipynb
