In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Extra JVM packages for Spark, given as Maven coordinates.
# Coordinates can be looked up on Maven Central, e.g. the PostgreSQL driver:
# https://mvnrepository.com/artifact/org.postgresql/postgresql
from spark_libs import spark_submit

packages = [
    "com.databricks:spark-csv_2.11:1.5.0",  # CSV data source
    "org.postgresql:postgresql:42.2.9",  # PostgreSQL JDBC driver
    "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1",  # MongoDB connector
    "io.delta:delta-core_2.11:0.4.0",  # Delta Lake
]

# Per the cell output, this sets PYSPARK_SUBMIT_ARGS to
# "--packages <comma-separated list> pyspark-shell".
# NOTE(review): presumably this has to run before the SparkSession is created - confirm.
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,org.postgresql:postgresql:42.2.9,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,io.delta:delta-core_2.11:0.4.0 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from spark_connections import postgresUrlProperties, createMongoURI

In [4]:
# Name the application, then reuse the active Spark session if there is one
# or start a new one otherwise.
app_name = "spark-mongo-postgres-save-delta"
session_builder = SparkSession.builder.appName(app_name)
spark = session_builder.getOrCreate()

## Load from MongoDB

In [5]:
# Where to find MongoDB and which database/collection to read.
mongo_connection = {"hostname": "host.docker.internal", "port": "27017"}
mongo_database = "users"
collection = "user_data"

# How many rows to use to determine the schema.
sample_size = 1000

# Build the connection URI and print it so the target is visible in the output.
mongoURI = createMongoURI(mongo_connection, mongo_database, collection)
print(mongoURI)

# Configure the MongoDB reader, then load the collection into a DataFrame.
mongo_reader = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongoURI)
    .option("sampleSize", sample_size)
)
df_user_data = mongo_reader.load()
df_user_data.printSchema()

mongodb://host.docker.internal:27017/users.user_data
root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- active_user: boolean (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- last_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- username: string (nullable = true)



## Save data as Delta Table

Delta Lake stores table data as Parquet files together with a transaction log. Parquet is an open source file format for Hadoop that stores nested data structures in a flat columnar format. Compared to a traditional approach where data is stored in a row-oriented format, Parquet is more efficient in terms of storage and performance.

In [6]:
# Save mode for the write. Spark accepts: error (the default), append, overwrite, ignore.
save_mode = "overwrite"

# Write the MongoDB data out in Delta format under the relative path delta/user_data.
# The former argument-less .options() call was a no-op and has been dropped.
delta_writer = df_user_data.write.format("delta").mode(save_mode)
delta_writer.save("delta/user_data")

In [14]:
# NOTE(review): this import presumably lives here, not with the other imports,
# because the delta module ships with the delta-core package and may only be
# importable once the Spark session has started - confirm before moving it.
from delta.tables import DeltaTable

# Open the Delta table written earlier so it can be modified and inspected.
delta_table_path = "delta/user_data/"
deltaTable = DeltaTable.forPath(spark, delta_table_path)

In [17]:
# Remove every row matching the predicate from the Delta table.
delete_predicate = "id = 1"
deltaTable.delete(delete_predicate)

In [18]:
# Show the table's commit history (one row per version, newest first in the output).
history_after_delete = deltaTable.history()
history_after_delete.show()

+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+
|version|          timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+
|      1|2019-12-12 20:08:43|  null|    null|   DELETE|[predicate -> ["(...|null|    null|     null|          0|          null|        false|
|      0|2019-12-12 20:02:06|  null|    null|    WRITE|[mode -> Overwrit...|null|    null|     null|       null|          null|        false|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+



In [20]:
# Look at the current state of the row with id 2.
user_2_before_update = deltaTable.toDF().filter("id = 2")
user_2_before_update.show()

+--------------------+-----------+----------+---+---------+-------+-------------------+--------+
|                 _id|active_user|first_name| id|last_name|  state|     street_address|username|
+--------------------+-----------+----------+---+---------+-------+-------------------+--------+
|[5df2872a9e748e56...|      false|       Caz|  2|   Felgat|Alabama|83 Hazelcrest Place|wwaller1|
+--------------------+-----------+----------+---+---------+-------+-------------------+--------+



In [22]:
# Set active_user to true on every row matching the condition.
deltaTable.update(
    condition="id = 2",
    set={"active_user": F.lit(True)},
)

In [23]:
# Re-read the row with id 2 to check its active_user value.
user_2_after_update = deltaTable.toDF().filter("id = 2")
user_2_after_update.show()

+--------------------+-----------+----------+---+---------+-------+-------------------+--------+
|                 _id|active_user|first_name| id|last_name|  state|     street_address|username|
+--------------------+-----------+----------+---+---------+-------+-------------------+--------+
|[5df2872a9e748e56...|       true|       Caz|  2|   Felgat|Alabama|83 Hazelcrest Place|wwaller1|
+--------------------+-----------+----------+---+---------+-------+-------------------+--------+



In [24]:
# Show the commit history again to see the version added by the update.
history_after_update = deltaTable.history()
history_after_update.show()

+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+
|version|          timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+
|      2|2019-12-12 20:14:48|  null|    null|   UPDATE|[predicate -> (id...|null|    null|     null|          1|          null|        false|
|      1|2019-12-12 20:08:43|  null|    null|   DELETE|[predicate -> ["(...|null|    null|     null|          0|          null|        false|
|      0|2019-12-12 20:02:06|  null|    null|    WRITE|[mode -> Overwrit...|null|    null|     null|       null|          null|        false|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+



In [27]:
# Print the docstring for DeltaTable.vacuum, which deletes files the table no
# longer needs for versions older than the retention threshold
# (168 hours / 7 days when retentionHours is not given).
help(deltaTable.vacuum)

Help on method vacuum in module delta.tables:

vacuum(retentionHours=None) method of delta.tables.DeltaTable instance
    Recursively delete files and directories in the table that are not needed by the table for
    maintaining older versions up to the given retention threshold. This method will return an
    empty DataFrame on successful completion.
    
    Example::
    
        deltaTable.vacuum()     # vacuum files not required by versions more than 7 days old
    
        deltaTable.vacuum(100)  # vacuum files not required by versions more than 100 hours old
    
    :param retentionHours: Optional number of hours retain history. If not specified, then the
                           default retention period of 168 hours (7 days) will be used.
    
    .. note:: Evolving
    
    .. versionadded:: 0.4

