In [1]:
# Evaluate the `spark` object so the cell displays it.
# NOTE(review): `spark` is not created in the visible cells; presumably the notebook runtime provides it - confirm.
spark

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

inputPath = "dbfs:/mnt/yelp-mount"

# Declare the schema explicitly so Spark does not have to infer it from the JSON input.
# Every field is declared nullable (third argument True).
# NOTE(review): assumes stars/useful/funny/cool are integer-valued and `date` is
# timestamp-parsable in the source JSON - confirm against the data.
jsonSchema = (
  StructType()
    .add("review_id", StringType(), True)
    .add("user_id", StringType(), True)
    .add("business_id", StringType(), True)
    .add("stars", IntegerType(), True)
    .add("date", TimestampType(), True)
    .add("text", StringType(), True)
    .add("useful", IntegerType(), True)
    .add("funny", IntegerType(), True)
    .add("cool", IntegerType(), True)
)

# Streaming reader for the JSON files under inputPath, using the explicit schema.
# maxFilesPerTrigger=1 caps each trigger at one newly picked-up file.
jsonStreamReader = spark.readStream.schema(jsonSchema).option("maxFilesPerTrigger", 1)

# Streaming DataFrame over the JSON input
streamInputDF = jsonStreamReader.json(inputPath)

# Streaming aggregation: number of reviews per star rating within each 1-hour window of `date`
hourlyWindow = window(streamInputDF.date, "1 hour")
streamingCountsDF = streamInputDF.groupBy(streamInputDF.stars, hourlyWindow).count()

# Last expression of the cell: displays whether the aggregated DataFrame is a streaming one
streamingCountsDF.isStreaming

In [3]:
spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small

# Make the cell safe to re-run: Spark refuses to start a query whose name matches
# an already-active query, so stop any leftover "counts" query from a previous run first.
for activeQuery in spark.streams.active:
  if activeQuery.name == "counts":
    activeQuery.stop()

# Start the streaming aggregation and keep the handle in `query`.
query = (
  streamingCountsDF
    .writeStream
    .format("memory")        # memory = store in-memory table (for testing only in Spark 2.0)
    .queryName("counts")     # counts = name of the in-memory table
    .outputMode("complete")  # complete = all the counts should be in the table
    .start()
)

In [4]:
from time import sleep

# The streaming query has only just been started; pause so it can process some input
# before the `counts` table is queried for the first time.
sleep(5)

In [5]:
%sql select stars, date_format(window.end, "MMM-dd HH:mm") as time, count from counts order by window.end, stars /* per-star counts for each window; sorted on the real window end timestamp because the formatted label would sort alphabetically by month name */

In [6]:
sleep(5)  # pause for 5 seconds so the running stream can process more input before the table is queried again

In [7]:
%sql select stars, date_format(window.end, "MMM-dd HH:mm") as time, count from counts order by window.end, stars /* same per-window query re-run after the wait; sorted on the real window end timestamp because the formatted label would sort alphabetically by month name */

In [8]:
%sql SELECT stars, SUM(count) AS total_count FROM counts GROUP BY stars ORDER BY stars /* total reviews per star rating, summed across all windows */