# Spark parquet basic operations

In [None]:
from pyspark.sql import SparkSession,DataFrame
import os
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import *
import io
import time
from pyspark.sql import Row

In [None]:
# Toggle between a local Spark session and one running on Kubernetes.
# Set `local` to True to run everything on this machine.
local = False

if local:
    # Local mode: master "local[4]".
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("SparkParquetBasics")
        .getOrCreate()
    )
else:
    # Kubernetes mode: the service account and the namespace are read from
    # the environment; a missing variable raises KeyError here.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("SparkParquetBasics")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )

# Merge schema

In [None]:

# Merge schema example.
# We create two data frames that only share the column `single`:
#   df1 has two columns: single, double
#   df2 has two columns: single, triple
sc = spark.sparkContext
schema_output_path1= "s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=1"
schema_output_path2= "s3a://pengfei/diffusion/data_format/merge_schema/test_table/key=2"
df1 = spark.createDataFrame(sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2)))
df1.show()
df2 = spark.createDataFrame(sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i ** 3)))
df2.show()
# Write each data frame into its own partition folder (key=1 and key=2).
# The default save mode raises an error when the target path already exists,
# which makes this cell fail on a second run; "overwrite" replaces the
# content of the partition folder instead, so the cell can be re-executed.
df1.write.mode("overwrite").parquet(schema_output_path1)
df2.write.mode("overwrite").parquet(schema_output_path2)

In [None]:
# Reading the parent folder that contains the partition folders turns the
# partition key into a column of the resulting data frame (a "partitioned
# column").
parent_path = "s3a://pengfei/diffusion/data_format/merge_schema/test_table"

# The parquet files in the partition folders have different schemas, so
# mergeSchema must be "true" to obtain the union of all columns. Without it
# Spark takes the schema from a single parquet file.
# Change the option value to "false" to compare the resulting data frame.
mergedDF = (
    spark.read
    .option("mergeSchema", "true")
    .parquet(parent_path)
)

mergedDF.printSchema()
mergedDF.show()