In [1]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (creating it if none exists yet),
# with the master set to "local[*]" so work runs in-process on the local machine.
spark = SparkSession.builder.master("local[*]").getOrCreate()

Load the `cereal.csv` file into a DataFrame, using the first row as the header and letting Spark infer the column types.

In [2]:
import os
# Read the CSV with its first row as column names (header) and with
# column types inferred from the data (inferSchema) instead of all-strings.
csv = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(os.path.join("/opt/workspace", "cereal.csv"))
)

show the first five rows

In [3]:
# Preview only the first five rows rather than the default number of rows.
csv.show(n=5)

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
+---------------

In [4]:
# Print the DataFrame's columns and their types as a tree
# (these are the types produced by inferSchema=True when the CSV was read).
csv.printSchema()

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)


Load a JSON file containing a single array of JSON objects

In [6]:
import os
# This file is described as one top-level JSON array, not one object per line.
# By default Spark's JSON reader treats every physical line as a separate JSON
# document, which only works for an array file while the whole array happens to
# sit on a single line. multiLine=True makes Spark parse each file as one
# document, so the load also works when the array is pretty-printed across
# several lines; each element of the array still becomes one row.
json = spark.read.json(
    path=os.path.join("/opt/workspace", "json/tv-shows.json"),
    multiLine=True,
)
# Preview the first five rows.
json.show(5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt1553656, 26449...|[Drama, Science-F...|  1|{http://static.tv...| English|    Under the Dome|{

In [7]:
# Print the schema inferred from the JSON data as a tree, including the
# nested struct and array fields.
json.printSchema()

root
 |-- _links: struct (nullable = true)
 |    |-- nextepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- previousepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- self: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |-- externals: struct (nullable = true)
 |    |-- imdb: string (nullable = true)
 |    |-- thetvdb: long (nullable = true)
 |    |-- tvrage: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: long (nullable = true)
 |-- image: struct (nullable = true)
 |    |-- medium: string (nullable = true)
 |    |-- original: string (nullable = true)
 |-- language: string (nullable = true)
 |-- name: string (nullable = true)
 |-- network: struct (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- code: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- timez

Load a JSON Lines (`.jsonl`) file, i.e. one JSON object per line

In [5]:
import os
# Read a single JSON Lines file through the generic reader, naming the
# "json" format explicitly, then preview the first five rows.
json = spark.read.format("json").load(
    os.path.join("/opt/workspace", "json/tv-shows-1.jsonl")
)
json.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt1553656, 26449...|[Drama, Science-F...|  1|{http://static.tv...| English|    Under the Dome|{

load multiple jsonl files

In [6]:
import os
# Build the list of absolute paths from the relative file names and hand the
# whole list to the JSON reader, which loads all of them into one DataFrame.
json = spark.read.json(
    path=[
        os.path.join("/opt/workspace", relative_path)
        for relative_path in ("json/tv-shows-1.jsonl", "json/tv-shows-2.jsonl")
    ],
)
json.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{

load jsonl files using a glob pattern

In [21]:
import os
# The "*" wildcard in the path selects every .jsonl file in the json folder;
# all matching files are read into a single DataFrame.
json = spark.read.load(
    os.path.join("/opt/workspace", "json/*.jsonl"),
    format="json",
)
json.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{

load a single parquet file

In [25]:
import os
# Read one specific Parquet file and preview its first five rows.
parquet = spark.read.format("parquet").load(
    os.path.join("/opt/workspace", "parquet/tv-shows=animation.parquet")
)
parquet.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{{US, United Stat...|http://

load multiple parquet files by unpacking a list of file paths

In [26]:
# read.parquet takes the paths as separate positional arguments, so the list
# of absolute paths is unpacked with "*" when it is passed in.
parquet = spark.read.parquet(
    *[
        os.path.join("/opt/workspace", relative_path)
        for relative_path in (
            "parquet/tv-shows=animation.parquet",
            "parquet/tv-shows=documentary.parquet",
        )
    ]
)

parquet.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+-------------------+--------------------+--------------------+----------+------+-------+-----------------+-------+--------------------+----------+--------------------+------------------+------+
|              _links|           externals|              genres| id|               image|language|               name|             network|        officialSite| premiered|rating|runtime|         schedule| status|             summary|   updated|                 url|        webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+-------------------+--------------------+--------------------+----------+------+-------+-----------------+-------+--------------------+----------+--------------------+------------------+------+
|{NULL, {http://ap...|{tt1695360, 25108...|[Action, Adventur...|178|{http://static.tv...| English|The Legend of Korra|{{US, 

In [27]:
# Load multiple Parquet files using a glob pattern: the "*" wildcard matches
# every .parquet file directly inside the parquet folder.
parquet = spark.read.load(
    os.path.join("/opt/workspace", "parquet/*.parquet"),
    format="parquet",
)
parquet.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{{US, United Stat...|http://

In [28]:
# Load multiple Parquet files by pointing the reader at the directory itself
# instead of naming the individual files.
parquet = spark.read.format("parquet").load(
    os.path.join("/opt/workspace", "parquet")
)
parquet.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{{US, United Stat...|http://

load a text file

In [19]:
# Read the book as plain text: the result has a single string column named
# "value", with one row per line of the file.
txt = spark.read.format("text").load(
    os.path.join("/opt/workspace", "books/frankenstein.txt")
)
txt.show(n=5)

+--------------------+
|               value|
+--------------------+
|                    |
|Project Gutenberg...|
|                    |
|This eBook is for...|
|almost no restric...|
+--------------------+


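Load text files into a DataFrame using a space as the line separator (`lineSep=' '`): each space-delimited chunk becomes its own row, and newlines are no longer treated as row boundaries, so they stay inside the values.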

In [25]:
# Split rows on a single space instead of on newlines, so each
# space-delimited chunk of every matched .txt file becomes one row.
# The option is set on the generic text-format reader before loading.
txt = (
    spark.read
    .format("text")
    .option("lineSep", " ")
    .load(os.path.join("/opt/workspace", "books/*.txt"))
)
txt.show(n=5)

+-----------+
|      value|
+-----------+
|  \nProject|
|Gutenberg's|
|        The|
| Adventures|
|         of|
+-----------+


Load multiple text files using a glob pattern, reading each file in its entirety as a single row (`wholetext=True`)

In [24]:
# With wholetext enabled, each matched file is read as one row holding the
# file's full contents, rather than one row per line.
txt = (
    spark.read
    .format("text")
    .option("wholetext", True)
    .load(os.path.join("/opt/workspace", "books/*.txt"))
)
txt.show(n=5)

+--------------------+
|               value|
+--------------------+
|\nProject Gutenbe...|
|\nProject Gutenbe...|
+--------------------+
