Remove double quotes from the values of a JSON string using PySpark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import ArrayType, StructType

# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [3]:
# multiLine=True lets Spark parse JSON records that span several lines.
df = spark.read.json(
    path='/home/phillipefs/spark_dev/pyspark-real-time-scenarios/data/json_complex.json',
    multiLine=True,
)
df.printSchema()

root
 |-- batters: struct (nullable = true)
 |    |-- batter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ppu: double (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)



In [4]:
# Flatten both nested collections by hand: explode `topping`, then
# `batters.batter`, promoting each struct field to its own column.
# Exploding two arrays yields one row per (topping, batter) pair.
df_final = (
    df
    # One row per element of the `topping` array.
    .withColumn('topping_explode', explode('topping'))
    .withColumn('topping_id', col("topping_explode.id"))
    .withColumn('topping_type', col("topping_explode.type"))
    .drop("topping", "topping_explode")
    # One row per element of the `batters.batter` array.
    .withColumn("batter_explode", explode("batters.batter"))
    .withColumn("batter_id", col("batter_explode.id"))
    .withColumn("batter_type", col("batter_explode.type"))
    .drop("batter_explode", "batters")
)

df_final.show(truncate=False)

+----+----+----+-----+----------+------------------------+---------+------------+
|id  |name|ppu |type |topping_id|topping_type            |batter_id|batter_type |
+----+----+----+-----+----------+------------------------+---------+------------+
|0001|Cake|0.55|donut|5001      |None                    |1001     |Regular     |
|0001|Cake|0.55|donut|5001      |None                    |1002     |Chocolate   |
|0001|Cake|0.55|donut|5001      |None                    |1003     |Blueberry   |
|0001|Cake|0.55|donut|5001      |None                    |1004     |Devil's Food|
|0001|Cake|0.55|donut|5002      |Glazed                  |1001     |Regular     |
|0001|Cake|0.55|donut|5002      |Glazed                  |1002     |Chocolate   |
|0001|Cake|0.55|donut|5002      |Glazed                  |1003     |Blueberry   |
|0001|Cake|0.55|donut|5002      |Glazed                  |1004     |Devil's Food|
|0001|Cake|0.55|donut|5005      |Sugar                   |1001     |Regular     |
|0001|Cake|0.55|

Function to transform array-of-struct columns into flat columns

In [5]:
def transform_array_struct_in_columns(df):
    """Flatten every top-level array-of-struct column into plain columns.

    For each column whose type is ``ArrayType`` with a ``StructType`` element,
    the array is exploded (one output row per array element) and every struct
    field is promoted to a column named ``<array_column>_<field_name>``. The
    original array column is then dropped. Columns of any other type --
    including arrays nested inside a struct column -- are left untouched.

    Because ``explode`` is used, rows whose array is null or empty are
    dropped, and flattening several array columns multiplies the row count.

    Args:
        df: Input Spark DataFrame.

    Returns:
        A new DataFrame with the qualifying array columns flattened.
    """
    for column in df.columns:
        data_type = df.schema[column].dataType
        if not (isinstance(data_type, ArrayType)
                and isinstance(data_type.elementType, StructType)):
            continue

        struct_fields = data_type.elementType.fields
        flat_names = [column + '_' + field.name for field in struct_fields]

        # Choose a scratch column name that clashes with neither an existing
        # column nor a column about to be generated. A fixed name would
        # silently overwrite, and then drop, a real column of the same name.
        # Names are compared lower-cased because Spark resolves column names
        # case-insensitively by default.
        taken = {name.lower() for name in df.columns + flat_names}
        scratch = 'explode_array'
        while scratch in taken:
            scratch = '_' + scratch

        df = df.withColumn(scratch, explode(column))
        for field, flat_name in zip(struct_fields, flat_names):
            df = df.withColumn(flat_name, col(scratch)[field.name])
        df = df.drop(column, scratch)

    return df

In [6]:
# Only top-level array-of-struct columns are flattened, so `topping` is
# expanded while the struct column `batters` is shown as-is.
df_topping_flat = transform_array_struct_in_columns(df)
df_topping_flat.show(truncate=False)

+-------------------------------------------------------------------------------+----+----+----+-----+----------+------------------------+
|batters                                                                        |id  |name|ppu |type |topping_id|topping_type            |
+-------------------------------------------------------------------------------+----+----+----+-----+----------+------------------------+
|{[{1001, Regular}, {1002, Chocolate}, {1003, Blueberry}, {1004, Devil's Food}]}|0001|Cake|0.55|donut|5001      |None                    |
|{[{1001, Regular}, {1002, Chocolate}, {1003, Blueberry}, {1004, Devil's Food}]}|0001|Cake|0.55|donut|5002      |Glazed                  |
|{[{1001, Regular}, {1002, Chocolate}, {1003, Blueberry}, {1004, Devil's Food}]}|0001|Cake|0.55|donut|5005      |Sugar                   |
|{[{1001, Regular}, {1002, Chocolate}, {1003, Blueberry}, {1004, Devil's Food}]}|0001|Cake|0.55|donut|5007      |Powdered Sugar          |
|{[{1001, Regular}, {1002, 