In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Need postgres
# https://mvnrepository.com/artifact/org.postgresql/postgresql
from spark_libs import spark_submit

# Maven coordinates of the extra JVM packages this notebook asks Spark to load.
# Judging by its printed output, spark_submit() exports them through the
# PYSPARK_SUBMIT_ARGS environment variable.
packages = [
    "com.databricks:spark-csv_2.11:1.5.0",                 # CSV data source
    "org.postgresql:postgresql:42.2.9",                    # PostgreSQL JDBC driver
    "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1",  # MongoDB connector
    "io.delta:delta-core_2.11:0.4.0",                      # Delta Lake
]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,org.postgresql:postgresql:42.2.9,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,io.delta:delta-core_2.11:0.4.0 pyspark-shell


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Reuse the active Spark session if there is one, otherwise start a new one.

app_name = "spark-word-count"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [5]:
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from delta.tables import *
import random
import shutil

In [6]:
# Clear previous delta-tables

files = ["delta/delta-table1", "delta/delta-table2"]

# The streaming write further down checkpoints to /tmp/checkpoint. A checkpoint left
# over from an earlier run describes tables that are deleted here, so it is removed
# as well; the literal must stay in sync with that cell's `checkpointLocation`.
stale_paths = files + ["/tmp/checkpoint"]

for stale_path in stale_paths:
    try:
        shutil.rmtree(stale_path)
    except FileNotFoundError:
        # Nothing to clear: first run, or already cleaned up.
        pass
    except OSError as err:
        # Stay best-effort, but report the problem instead of hiding it.
        print("Could not remove", stale_path, "-", err)

In [7]:
@F.udf(returnType=StringType())
def randomWordGenerator():
    """Return one word picked at random from a fixed six-word vocabulary."""
    return random.choice(["hello", "I", "am", "happy", "to", "run"])


# Spark assumes a UDF is deterministic unless told otherwise, which allows the
# optimizer to eliminate or repeat invocations. This one returns a different word on
# every call, so it has to be declared non-deterministic.
randomWordGenerator = randomWordGenerator.asNondeterministic()

In [8]:
# Build 8 rows (ids 0-7), attach a random word to each, and store them as a Delta table

data = spark.range(8).withColumn("value", randomWordGenerator())
(
    data.write
    .format("delta")
    .save(files[0])
)

In [10]:
# Read the first Delta table back and print its rows
(
    spark.read
    .format("delta")
    .load(files[0])
    .show()
)

+---+-----+
| id|value|
+---+-----+
|  2|   to|
|  3|   to|
|  1|hello|
|  6|happy|
|  7|   to|
|  4|   to|
|  5|   am|
|  0|    I|
+---+-----+



In [11]:
# Stream writes to the table

print("####### Streaming write ######")
streamingDf = spark.readStream.format("delta").load(files[0])

# Count how often each word occurs in the source table and keep the totals in the
# second Delta table. The output mode is set once; the original chain set it twice.
stream = (
    streamingDf
    .selectExpr("value as word")
    .groupBy("word")
    .count()
    .writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", "/tmp/checkpoint")
    .start(files[1])
)

####### Streaming write ######


In [12]:
# start() returns right away and the query keeps running in the background. Wait until
# everything currently in the source table has been processed, so the sink table
# exists and holds the counts before it is read.
stream.processAllAvailable()
spark.read.format("delta").load(files[1]).show()

+-----+-----+
| word|count|
+-----+-----+
|hello|    1|
|happy|    1|
|   am|    1|
|   to|    4|
|    I|    1|
+-----+-----+

