In [1]:
import os

from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
# The "local[*]" master runs Spark inside this process using all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

Load a JSON file containing a single array of JSON objects; each element of the array becomes one row.

In [2]:
# Read a JSON document whose top-level value is an array of objects.
# multiLine=True tells Spark to parse each file as one whole JSON document
# instead of one record per line, so the array loads correctly even when it
# is pretty-printed across several lines.
# NOTE: the variable name `json` shadows the standard-library `json` module in
# this kernel; it is kept because later cells refer to the DataFrame by this name.
json = spark.read.json(
    path=os.path.join("/opt/workspace", "json/tv-shows.json"),
    multiLine=True,
)
# Preview only the first five rows rather than the whole DataFrame.
json.show(5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt1553656, 26449...|[Drama, Science-F...|  1|{http://static.tv...| English|    Under the Dome|{

In [3]:
# Print the schema Spark inferred for the DataFrame currently bound to `json`.
json.printSchema()

root
 |-- _links: struct (nullable = true)
 |    |-- nextepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- previousepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- self: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |-- externals: struct (nullable = true)
 |    |-- imdb: string (nullable = true)
 |    |-- thetvdb: long (nullable = true)
 |    |-- tvrage: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: long (nullable = true)
 |-- image: struct (nullable = true)
 |    |-- medium: string (nullable = true)
 |    |-- original: string (nullable = true)
 |-- language: string (nullable = true)
 |-- name: string (nullable = true)
 |-- network: struct (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- code: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- timez

Load a JSON Lines (`.jsonl`) file containing one JSON object per line.

In [4]:
# Read a single JSON Lines file with the reader's default settings.
# `os` is imported once in the notebook's first cell, not repeated here.
json = spark.read.json(
    path=os.path.join("/opt/workspace", "json/tv-shows-1.jsonl"),
)
# Preview only the first five rows.
json.show(5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt1553656, 26449...|[Drama, Science-F...|  1|{http://static.tv...| English|    Under the Dome|{

Load multiple JSON Lines files at once by passing a list of paths.

In [5]:
# Read several JSON Lines files into one DataFrame by passing a list of paths.
# The paths are built in a comprehension so the base directory is written once.
# `os` is imported once in the notebook's first cell, not repeated here.
json = spark.read.json(
    path=[
        os.path.join("/opt/workspace", relative_path)
        for relative_path in ("json/tv-shows-1.jsonl", "json/tv-shows-2.jsonl")
    ],
)
# Preview only the first five rows.
json.show(5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{

Load every JSON Lines file that matches a glob pattern.

In [6]:
# Read all files matching the glob pattern "json/*.jsonl" under /opt/workspace
# into a single DataFrame.
# `os` is imported once in the notebook's first cell, not repeated here.
json = spark.read.json(
    path=os.path.join("/opt/workspace", "json/*.jsonl"),
)
# Preview only the first five rows.
json.show(5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{