In [12]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

Define file paths

In [13]:
current_dir = os.getcwd()
shipments_files_names = '../json_files/shipments.json'
shipments_files_path = os.path.join(current_dir, shipments_files_names)

Ingest data into Spark

In [14]:
spark = SparkSession.builder.appName("Transform entire JSON documents").getOrCreate()
shipments_df = spark.read.format("json").option("multiline", True).load(shipments_files_path)
shipments_df.printSchema()

root
 |-- books: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- qty: long (nullable = true)
 |    |    |-- title: string (nullable = true)
 |-- customer: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- date: string (nullable = true)
 |-- shipmentId: long (nullable = true)
 |-- supplier: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- state: string (nullable = true)



In [15]:
shipments_df.show()

+--------------------+--------------------+----------+----------+--------------------+
|               books|            customer|      date|shipmentId|            supplier|
+--------------------+--------------------+----------+----------+--------------------+
|[{2, Spark with J...|{Chapel Hill, USA...|2019-10-05|    458922|{Shelter Island, ...|
+--------------------+--------------------+----------+----------+--------------------+



Transformation

In [16]:
shipments_df = (shipments_df.withColumn("country", F.col('customer.country'))
                .withColumn("city", F.col("customer.city"))
                .withColumn("customer_name", F.col("customer.name"))
                .withColumn("supplier_name", F.col("supplier.name"))
                .withColumn("book_items", F.explode(F.col("books")))
                .drop("books", "customer", "supplier"))
shipments_df = (shipments_df.withColumn("book_name", F.col("book_items.title"))
                .withColumn("quantity", F.col("book_items.qty"))
                .drop("book_items"))
# show df
shipments_df.show()

+----------+----------+-------+-----------+-------------------+--------------------+--------------------+--------+
|      date|shipmentId|country|       city|      customer_name|       supplier_name|           book_name|quantity|
+----------+----------+-------+-----------+-------------------+--------------------+--------------------+--------+
|2019-10-05|    458922|    USA|Chapel Hill|Jean Georges Perrin|Manning Publications|     Spark with Java|       2|
|2019-10-05|    458922|    USA|Chapel Hill|Jean Georges Perrin|Manning Publications|Spark in Action, ...|      25|
|2019-10-05|    458922|    USA|Chapel Hill|Jean Georges Perrin|Manning Publications|Spark in Action, ...|       1|
+----------+----------+-------+-----------+-------------------+--------------------+--------------------+--------+



Do analytics

In [17]:
shipments_df.createOrReplaceTempView("shipments_df")
shipments_count = spark.sql("SELECT COUNT(*) as total_of_shipments FROM shipments_df")
shipments_count.show()

+------------------+
|total_of_shipments|
+------------------+
|                 3|
+------------------+



Save dataframe to file

In [19]:
output_path = '../output_files/shipments.csv'
shipments_df.coalesce(1).write.csv(os.path.join(current_dir, output_path), mode="overwrite", header=True)

In [None]:
spark.stop()