<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark/examples/06-write_partitioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Write
- .write
- .format (parquet, csv, json)
- options
- spark.sql.sources.partitionOverwriteMode dynamic

# Write Mode
- overwrite - The overwrite mode is used to overwrite the existing file, alternatively, you can use SaveMode.Overwrite
- append - To add the data to the existing file, alternatively, you can use SaveMode.Append
- ignore - Ignores write operation when the file already exists, alternatively, you can use SaveMode.Ignore.
- errorifexists or error - This is a default option when the file already exists, it returns an error, alternatively, you can use SaveMode.ErrorIfExists.

# Partitioning
Process to organize the data into multiple chunks based on some criteria.
Partitions are organized in sub-folders.
Partitioning improves performance in Spark.

# Setting up PySpark

In [None]:
%pip install pyspark

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Spark Course').getOrCreate()

# Preparing data

In [None]:
!pip install faker

Collecting faker
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m1.4/1.9 MB[0m [31m43.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.0


In [None]:
from faker import Faker
from datetime import datetime

fake = Faker()

users = []
for _ in range(50):
    user = {
        'date': fake.date_time_between_dates(datetime(2024, 5, 1), datetime(2024, 5, 5)),
        'name': fake.name(),
        'address': fake.address(),
        'email': fake.email(),
        'dob': fake.date_of_birth(),
        'phone': fake.phone_number()
    }
    users.append(user)

df = spark.createDataFrame(users)

df.show(10, False)


+------------------------------------------------------+--------------------------+----------+------------------------+----------------+--------------------+
|address                                               |date                      |dob       |email                   |name            |phone               |
+------------------------------------------------------+--------------------------+----------+------------------------+----------------+--------------------+
|6862 Murphy Dam\nShermanview, MO 48276                |2024-05-01 20:35:15.882791|2000-06-20|josemartinez@example.org|Phillip Campos  |001-337-658-2574    |
|1263 Melissa Harbors\nLake Chloemouth, VA 00865       |2024-05-03 15:22:56.711335|1956-05-04|mariajenkins@example.net|Kevin Russell   |+1-692-349-6215x279 |
|5967 Randall Cove Suite 010\nWest Sandrastad, NC 46233|2024-05-01 03:27:43.675545|2012-01-30|michelle80@example.net  |Jamie Pierce    |(689)796-1482x335   |
|9400 King Extensions Apt. 660\nFloresfurt, CA 40312

# Writing as PARQUET



In [None]:
# Writing as PARQUET with no partitions

path = "/content/write_partitioning/parquet_no_partitions"

df.write.mode("overwrite").format("parquet").save(path)

!ls /content/write_partitioning/parquet_no_partitions

spark.read.format("parquet").load(path).count()

part-00000-b777a8a5-5d45-420b-8b3c-7eae3b3ea3e9-c000.snappy.parquet  _SUCCESS


50

In [None]:
# Writing as PARQUET with partitions
from pyspark.sql.functions import *

path = "/content/write_partitioning/parquet_with_partitions"

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic") # enable dynamic partition overwrite - only overwrites partitions that are coming in the dataframe

(df#.where("date_part = '20240503'")
 .write
 .mode("overwrite")                                               # overwrites the entire path with the new data
 .partitionBy("date_part")                                        # partition the data by column - creates sub-folders for each partition
 .format("parquet")                                               # format of output
 .save(path))                                                     # path

!ls /content/write_partitioning/parquet_with_partitions

spark.read.format("parquet").load(path).count()

'date_part=20240501'  'date_part=20240503'   _SUCCESS
'date_part=20240502'  'date_part=20240504'


50

In [None]:
# Writing as PARQUET with partitions static
from pyspark.sql.functions import *

path = "/content/write_partitioning/parquet_with_partitions"

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

spark.conf.set("spark.sql.sources.partitionOverwriteMode","static") # enable dynamic partition overwrite - only overwrites partitions that are coming in the dataframe

(df.where("date_part = '20240503'")
 .write
 .mode("overwrite")                                               # overwrites the entire path with the new data
 .partitionBy("date_part")                                        # partition the data by column - creates sub-folders for each partition
 .format("parquet")                                               # format of output
 .save(path))                                                     # path

!ls /content/write_partitioning/parquet_with_partitions

spark.read.format("parquet").load(path).count()

'date_part=20240503'   _SUCCESS


10

In [None]:
# Writing as PARQUET with partitions - para processar apenas a partição que eu quero reprocessar
from pyspark.sql.functions import *

path = "/content/write_partitioning/parquet_with_partitions"

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic") # enable dynamic partition overwrite - only overwrites partitions that are coming in the dataframe

(df.where("date_part = '20240503'")
 .write
 .mode("overwrite")                                               # overwrites the entire path with the new data
 .partitionBy("date_part")                                        # partition the data by column - creates sub-folders for each partition
 .format("parquet")                                               # format of output
 .save(path))                                                     # path

!ls /content/write_partitioning/parquet_with_partitions

spark.read.format("parquet").load(path).count()

In [None]:
# Checking single partition
spark.read.parquet("/content/write_partitioning/parquet_with_partitions/date_part=20240502").show()

+--------------------+--------------------+----------+--------------------+---------------+--------------------+
|             address|                date|       dob|               email|           name|               phone|
+--------------------+--------------------+----------+--------------------+---------------+--------------------+
|PSC 4942, Box 649...|2024-05-02 01:06:...|1956-11-01|  wclark@example.org|   Dennis Evans|        523.446.3744|
|57826 Bryant High...|2024-05-02 11:59:...|1914-10-11|tanner95@example.org|Leslie Reynolds|        359-621-6733|
|76000 Knapp Cresc...|2024-05-02 20:03:...|1914-12-11|heatherjackson@ex...|Jessica Spencer|001-475-239-8637x...|
|Unit 6414 Box 532...|2024-05-02 10:49:...|2013-01-14|hannah87@example.org| Jeffrey Medina|   (557)333-8460x328|
|698 Gwendolyn Div...|2024-05-02 07:31:...|1999-10-04| sarah42@example.com| Mallory Reilly|        878.853.2830|
|948 Jillian Green...|2024-05-02 04:10:...|1918-02-03|william49@example...|   Brandon Shaw|   67

# Writing as CSV

https://spark.apache.org/docs/3.5.1/sql-data-sources-csv.html

In [None]:
df.count()

50

In [None]:
path = "/content/write_partitioning/csv_no_partitioning/"

# write as csv
(df
  .write
  .format("csv")
  .mode("overwrite")
  .option("delimiter", "|")
  .option("header", True)
  .save(path))

# listing files in the folder
!ls /content/write_partitioning/csv_no_partitioning/

# read as csv
(spark
  .read
  .options(sep="|", multiLine=True, header=True)
  .csv(path)
  .count())

part-00000-4527c78e-2219-460d-9ff6-f90f6a01586d-c000.csv  _SUCCESS


50

In [None]:
# Checking csv
(spark
  .read
  .options(sep="|", multiLine=True, header=True)
  .csv(path)).show()

+--------------------+--------------------+----------+--------------------+------------------+--------------------+---------+
|             address|                date|       dob|               email|              name|               phone|date_part|
+--------------------+--------------------+----------+--------------------+------------------+--------------------+---------+
|6862 Murphy Dam\n...|2024-05-01T20:35:...|2000-06-20|josemartinez@exam...|    Phillip Campos|    001-337-658-2574| 20240501|
|1263 Melissa Harb...|2024-05-03T15:22:...|1956-05-04|mariajenkins@exam...|     Kevin Russell| +1-692-349-6215x279| 20240503|
|5967 Randall Cove...|2024-05-01T03:27:...|2012-01-30|michelle80@exampl...|      Jamie Pierce|   (689)796-1482x335| 20240501|
|9400 King Extensi...|2024-05-04T03:50:...|1996-01-05|pedroking@example...|      Jimmy Mullen|+1-970-742-0881x8905| 20240504|
|PSC 4942, Box 649...|2024-05-02T01:06:...|1956-11-01|  wclark@example.org|      Dennis Evans|        523.446.3744| 20

# Writing as JSON

https://spark.apache.org/docs/3.5.1/sql-data-sources-json.html

In [None]:
path = "/content/write_partitioning/json_no_partitioning/"

# write as json
(df
.write
.mode("overwrite")
.format("json")
.save(path))

# listing files in the folder
!ls /content/write_partitioning/json_no_partitioning/

# read as json
(spark
  .read
  .json(path)
  .count())

part-00000-d13517af-1445-4016-90a8-c66ee9370603-c000.json  _SUCCESS


50

In [None]:
# reading json as text
spark.read.text(path).show(10, False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"address":"6862 Murphy Dam\nShermanview, MO 48276","date":"2024-05-01T20:35:15.882Z","dob":"2000-06-20","email":"josemartinez@example.org","name":"Phillip Campos","phone":"001-337-658-2574","date_part":"20240501"}             |
|{"address":"1263 Melissa Harbors\nLake Chloemouth, VA 00865","date":"2024-05-03

In [None]:
# reading json as text
spark.read.json(path).show(10, False)

+------------------------------------------------------+------------------------+---------+----------+------------------------+----------------+--------------------+
|address                                               |date                    |date_part|dob       |email                   |name            |phone               |
+------------------------------------------------------+------------------------+---------+----------+------------------------+----------------+--------------------+
|6862 Murphy Dam\nShermanview, MO 48276                |2024-05-01T20:35:15.882Z|20240501 |2000-06-20|josemartinez@example.org|Phillip Campos  |001-337-658-2574    |
|1263 Melissa Harbors\nLake Chloemouth, VA 00865       |2024-05-03T15:22:56.711Z|20240503 |1956-05-04|mariajenkins@example.net|Kevin Russell   |+1-692-349-6215x279 |
|5967 Randall Cove Suite 010\nWest Sandrastad, NC 46233|2024-05-01T03:27:43.675Z|20240501 |2012-01-30|michelle80@example.net  |Jamie Pierce    |(689)796-1482x335   |
|940

In [None]:
# partition json data + saveAsTable

# Creating partition column
df = df.withColumn("date_part", date_format(col("date"), "yyyyMMdd"))

# write as json
(df.write
  .partitionBy("date_part")
  .mode("overwrite")
  .format("json")
  .saveAsTable("tbl_json_part"))

# read as json
spark.table("tbl_json_part").count()

# read as json
spark.sql("show partitions tbl_json_part").show()

+------------------+
|         partition|
+------------------+
|date_part=20240501|
|date_part=20240502|
|date_part=20240503|
|date_part=20240504|
+------------------+



# Append Mode

In [21]:
# Writing as PARQUET with APPEND - depende da lógica de ETL, se os dados são 100% novos, sem duplicação

path = "/content/write_partitioning/parquet_append"

df.write.mode("append").format("parquet").save(path)

!ls /content/write_partitioning/parquet_append

spark.read.format("parquet").load(path).count()

part-00000-b8cc7a6a-f65c-45f6-aa39-e6ea9b95c675-c000.snappy.parquet  _SUCCESS
part-00000-d106263b-3441-4741-b17a-40044a038867-c000.snappy.parquet


100