In [1]:
# Initialize PySpark.
# findspark.init() is called before the pyspark imports below; per the
# findspark documentation it locates the Spark installation and adds pyspark
# to sys.path, so the imports that follow can succeed.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# NOTE(review): wildcard import; no cell visible in this notebook references a
# name from pyspark.sql.types, so consider importing only what is needed.
from pyspark.sql.types import *

In [2]:
# Create the SparkSession used by all later cells, or reuse one that is
# already running under the same application name.
spark = (
    SparkSession.builder
    .appName("PySpark SQL Session")
    .getOrCreate()
)

In [3]:
# Load a small, flat JSON document. The multiLine option lets the reader parse
# a record that spans several lines instead of requiring one object per line.
df = (
    spark.read
    .option("multiLine", True)
    .json("example_1.json")
)
df.show()

+-----+-----+-----+
|color|fruit| size|
+-----+-----+-----+
|  Red|Apple|Large|
+-----+-----+-----+



In [4]:
# Inspect the inferred schema as a StructType object; for this file it holds
# three nullable string fields (color, fruit, size).
df.schema

StructType(List(StructField(color,StringType,true),StructField(fruit,StringType,true),StructField(size,StringType,true)))

In [5]:
# Project a single column by name and display it.
df.select("color").show()

+-----+
|color|
+-----+
|  Red|
+-----+



In [6]:
# Print the same schema as an indented tree, which is easier to read than the
# raw StructType repr.
df.printSchema()

root
 |-- color: string (nullable = true)
 |-- fruit: string (nullable = true)
 |-- size: string (nullable = true)



In [7]:
# Print the physical plan for this DataFrame; here it is a single JSON file
# scan reading the three string columns.
df.explain()

== Physical Plan ==
*(1) FileScan json [color#0,fruit#1,size#2] Batched: false, Format: JSON, Location: InMemoryFileIndex[file:/Users/lrego/Projects/pyspark_exercises/example_1.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<color:string,fruit:string,size:string>


## Nested JSON structure

Perform data analysis on a nested JSON structure.

- https://docs.azuredatabricks.net/_static/notebooks/transform-complex-data-types-python.html
- https://sparkbyexamples.com/spark/spark-sql-structtype-on-dataframe/
- https://stackoverflow.com/questions/37471346/automatically-and-elegantly-flatten-dataframe-in-spark-sql
- https://adatis.co.uk/parsing-nested-json-lists-in-databricks-using-python/
- https://spark.apache.org/docs/2.3.0/api/python/pyspark.sql.html
- https://stackoverflow.com/questions/34271398/flatten-nested-spark-dataframe
- https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html

This is an unformatted JSON example.

In [8]:
# Load the nested example. The multiLine option lets the reader parse a JSON
# document that spans several lines rather than one object per line.
df2 = (
    spark.read
    .option("multiLine", True)
    .json("example_3.json")
)

In [9]:
# Show the inferred nested schema: "persons" is an array of structs, each with
# an age, a name and a "cars" array; each car has a name and a "models" array
# of strings.
df2.printSchema()

root
 |-- persons: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- cars: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- models: array (nullable = true)
 |    |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |-- name: string (nullable = true)



In [10]:
# Print the physical plan; the nested structure appears in the ReadSchema of
# the JSON file scan.
df2.explain()

== Physical Plan ==
*(1) FileScan json [persons#25] Batched: false, Format: JSON, Location: InMemoryFileIndex[file:/Users/lrego/Projects/pyspark_exercises/example_3.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<persons:array<struct<age:bigint,cars:array<struct<models:array<string>,name:string>>,name:...


In [11]:
# import spark functions
from pyspark.sql.functions import explode, col

In [12]:
# Turn the top-level "persons" array into one row per person; the exploded
# struct keeps the column name "persons".
persons = df2.select(
    explode(col("persons")).alias("persons")
)

In [13]:
# One row per (person, car): keep the person's scalar fields and explode the
# "cars" array into a struct column, then surface that struct's "name" field
# as its own column so the brand is available without further unpacking.
# The struct column is kept because a later cell explodes its "models" array.
persons_cars = (
    persons
    .select(
        col("persons.name").alias("persons_name"),
        col("persons.age").alias("persons_age"),
        explode(col("persons.cars")).alias("persons_cars_brands"),
    )
    .withColumn("persons_cars_brand", col("persons_cars_brands.name"))
)

In [14]:
# One row per (person, car brand, model): carry the already-flattened columns
# through unchanged and explode each brand's "models" array into one model
# string per row. The intermediate struct column is dropped here.
persons_cars_models = persons_cars.select(
    "persons_name",
    "persons_age",
    "persons_cars_brand",
    explode(col("persons_cars_brands.models")).alias("persons_cars_model"),
)

In [15]:
# Display the fully flattened result: one row per (person, car brand, model).
persons_cars_models.show()

+------------+-----------+------------------+------------------+
|persons_name|persons_age|persons_cars_brand|persons_cars_model|
+------------+-----------+------------------+------------------+
|        John|         30|              Ford|            Fiesta|
|        John|         30|              Ford|             Focus|
|        John|         30|              Ford|           Mustang|
|        John|         30|               BMW|               320|
|        John|         30|               BMW|                X3|
|        John|         30|               BMW|                X5|
|       Peter|         46|           Huyndai|               i10|
|       Peter|         46|           Huyndai|               i30|
|       Peter|         46|          Mercedes|              E320|
|       Peter|         46|          Mercedes|           E63 AMG|
+------------+-----------+------------------+------------------+

