In [1]:
from pyspark.sql import SparkSession
# Obtain (or reuse) a Spark session that runs locally on all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

load the cereal.csv file

In [2]:
import os
# Read cereal.csv with the header option enabled and column types inferred
# from the data rather than defaulting every column to string.
csv = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(os.path.join("/opt/workspace", "cereal.csv"))
)

show the first five rows

In [3]:
# Print only the first five rows of the cereal dataframe.
csv.show(n=5)

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
+---------------

In [4]:
# Print the column names, types and nullability of the cereal dataframe.
csv.printSchema()

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)


load a json file containing a single array of json objects

In [4]:
import os
# The file holds one top-level JSON array rather than one object per line.
# By default spark.read.json parses line-delimited JSON, which only copes with
# an array when the whole array sits on a single physical line. Enabling
# multiLine parses the file as one JSON document, so a pretty-printed array
# is read correctly as well; each array element becomes a row.
json = spark.read.json(
    path=os.path.join("/opt/workspace", "json/tv-shows.json"),
    multiLine=True,
)
# Preview the first five rows.
json.show(5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt1553656, 26449...|[Drama, Science-F...|  1|{http://static.tv...| English|    Under the Dome|{

load a jsonl file

In [5]:
import os
# Read a single JSON Lines file (one JSON object per line) and preview it.
json = spark.read.json(os.path.join("/opt/workspace", "json/tv-shows-1.jsonl"))
json.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt1553656, 26449...|[Drama, Science-F...|  1|{http://static.tv...| English|    Under the Dome|{

load multiple jsonl files

In [6]:
import os
# Read several JSON Lines files into one dataframe by passing a list of paths.
json = spark.read.json(
    [
        os.path.join("/opt/workspace", relative)
        for relative in ("json/tv-shows-1.jsonl", "json/tv-shows-2.jsonl")
    ]
)
json.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{

load jsonl files using a glob pattern

In [7]:
import os
# Read every .jsonl file in the json directory; the glob in the path is
# expanded by Spark when the files are listed.
json = (
    spark.read
    .format("json")
    .load(os.path.join("/opt/workspace", "json/*.jsonl"))
)
json.show(n=5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+--------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{

load a parquet file

In [15]:
import os
# Read a single Parquet file and preview the first five rows.
parquet = spark.read.parquet(os.path.join("/opt/workspace", "nba/total.parquet"))
parquet.show(n=5)

+---+------------------+---+---+---+----+---+----+-----+---+---+-----+---+----+-----+-----+---+---+-----+---+---+---+---+---+---+---+---+----+----+---------+-----------+
| Rk|            Player|Age|  G| GS|  MP| FG| FGA|  FG%| 3P|3PA|  3P%| 2P| 2PA|  2P%| eFG%| FT|FTA|  FT%|ORB|DRB|TRB|AST|STL|BLK|TOV| PF| PTS|team|   season|team_retcon|
+---+------------------+---+---+---+----+---+----+-----+---+---+-----+---+----+-----+-----+---+---+-----+---+---+---+---+---+---+---+---+----+----+---------+-----------+
|  1|      Isaiah Rider| 23| 75| 67|35.3|7.4|16.7|0.447|1.9|5.3|0.351|5.6|11.4|0.491|0.502|3.7|4.5|0.817|1.2|2.1|3.3|3.3|0.9|0.3|3.1|2.6|20.4| MIN|1995/1996|        MIN|
|  2|Christian Laettner| 25| 81| 80|34.2|5.6|11.4|0.489|0.2|0.5|0.325|5.4|10.9|0.497|0.496|5.0|6.2|0.818|2.0|5.5|7.6|2.9|1.2|1.1|2.8|3.7|16.3| MIN|1995/1996|        MIN|
|  3|         Doug West| 27| 71| 65|32.8|4.9|10.7|0.461|0.2|0.9| 0.18|4.8| 9.9|0.485|0.468|2.9|3.5|0.837|0.8|2.4|3.2|2.6|0.9|0.3|1.8|3.5|12.9| MIN|199

load multiple parquet files by unpacking a list of file paths

In [18]:
# DataFrameReader.parquet takes paths as separate positional arguments, so the
# list of paths is unpacked with `*` instead of being passed as one argument.
parquet = spark.read.parquet(
    *[
        os.path.join("/opt/workspace", relative)
        for relative in ("nba/total.parquet", "nba/total_playoffs.parquet")
    ]
)
parquet.show(n=5)

+---+------------------+----+---+----+----+---+----+-----+---+---+-----+---+---+-----+-----+---+---+-----+---+---+---+---+---+---+---+---+----+----+---------+-----------+
| Rk|            Player| Age|  G|  GS|  MP| FG| FGA|  FG%| 3P|3PA|  3P%| 2P|2PA|  2P%| eFG%| FT|FTA|  FT%|ORB|DRB|TRB|AST|STL|BLK|TOV| PF| PTS|team|   season|team_retcon|
+---+------------------+----+---+----+----+---+----+-----+---+---+-----+---+---+-----+-----+---+---+-----+---+---+---+---+---+---+---+---+----+----+---------+-----------+
|1.0|Christian Laettner|25.0| 81|80.0|2770|450| 920|0.489| 13| 40|0.325|437|880|0.497|0.496|409|500|0.818|164|449|613|234|101| 87|225|302|1322| MIN|1995/1996|        MIN|
|2.0|      Isaiah Rider|23.0| 75|67.0|2645|558|1249|0.447|139|396|0.351|419|853|0.491|0.502|277|339|0.817| 90|159|249|245| 69| 23|232|194|1532| MIN|1995/1996|        MIN|
|3.0|        Sean Rooks|25.0| 80|70.0|2405|289| 615| 0.47|  0|  5|  0.0|289|610|0.474| 0.47|290|381|0.761|165|321|486| 97| 29| 71|142|208| 868| M

load a text file

In [19]:
# Read a text file into a dataframe with a single string column named `value`
# (one row per line by default).
txt = spark.read.text(paths=os.path.join("/opt/workspace", "books/frankenstein.txt"))
txt.show(n=5)

+--------------------+
|               value|
+--------------------+
|                    |
|Project Gutenberg...|
|                    |
|This eBook is for...|
|almost no restric...|
+--------------------+


load all text files matching a glob pattern into a dataframe, using a space instead of a newline as the line separator

In [25]:
# Read every .txt file under books/, splitting records on a single space
# instead of on newlines, so each row holds one space-delimited token.
txt = (
    spark.read
    .option("lineSep", ' ')
    .text(os.path.join("/opt/workspace", "books/*.txt"))
)
txt.show(n=5)

+-----------+
|      value|
+-----------+
|  \nProject|
|Gutenberg's|
|        The|
| Adventures|
|         of|
+-----------+


load multiple text files using a glob pattern, reading each file as a single row

In [24]:
# Read every .txt file under books/ with wholetext enabled, so each file is
# loaded as a single row rather than one row per line.
txt = (
    spark.read
    .option("wholetext", True)
    .text(os.path.join("/opt/workspace", "books/*.txt"))
)
txt.show(n=5)

+--------------------+
|               value|
+--------------------+
|\nProject Gutenbe...|
|\nProject Gutenbe...|
+--------------------+
