In [1]:
# import necessary classes
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

In [2]:
# Obtain the SparkSession for this notebook: getOrCreate() returns an
# already-active session if there is one, otherwise it builds a new one
# under the application name given below.
spark = (
    SparkSession.builder
    .appName("structured_network_word_count")
    .getOrCreate()
)

# Last expression of the cell, so the notebook renders the session object.
spark

In [3]:
# Input table: a streaming DataFrame fed by the "socket" source, reading
# from a connection to localhost:9999. Nothing is read until a streaming
# query is started on top of it.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

In [4]:
# Query on the input table 'lines': split each line's `value` on a single
# space and explode the resulting array so every word gets its own row in a
# column named "word".
words = lines.select(explode(split(lines.value, " ")).alias("word"))

# Running result table: one row per distinct word with its count so far.
word_counts = words.groupBy("word").count()

In [None]:
# OUTPUT: start running the query that prints the running counts to the console.
#
# The chain is wrapped in parentheses rather than joined with backslashes:
# a trailing comment after a backslash continuation is a SyntaxError
# ("unexpected character after line continuation character"), whereas
# comments are allowed inside a parenthesised expression.
query = (
    word_counts
    .writeStream
    .outputMode("complete")  # Output Modes: Complete, Append & Update
    .format("console")
    .start()
)

# Block this cell until the streaming query terminates (it is stopped or fails).
query.awaitTermination()