# Tweet Consumer

@author: @raymondmarfurt

Streaming with Spark<br>
ZHAW CAS Machine Intelligence<br>
Big Data Project<br>

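Consumes the JSON files written by the Twitter Producer as a stream, groups the tweets into one-minute windows, and aggregates them per hashtag as a weighted sum, where each tweet is weighted by its author's follower count.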

In [2]:
# Company hashtags this project is interested in (with the leading '#').
HASHTAGS = [
    '#tesla',
    '#apple',
    '#Microsoft',
    '#mcdonalds',
    '#nike',
    '#pfizer',
    '#facebook',
    '#alphabet',
    '#goldmansachs',
    '#lockheadmartin',
]

# The same tags without the leading '#', each wrapped in single quotes and
# joined with ", " into one string.
HASHTAG_STR = ", ".join("'{}'".format(tag[1:]) for tag in HASHTAGS)


In [3]:
def printDF(streamingDF, max_prints=50, poll_seconds=60, csv_path='/tmp/statistics'):
  """Poll a DataFrame and dump/show its contents each time it has rows.

  On every pass the row count is computed once. If it is positive, the
  "hashtag", "end" and "weight" columns are written as CSV to ``csv_path``
  (overwriting the previous dump) and the first 20 rows are shown untruncated.
  The function then sleeps ``poll_seconds`` before the next pass.

  Only passes that found rows count towards ``max_prints``; while the
  DataFrame stays empty the loop keeps polling.

  Args:
    streamingDF: DataFrame to poll; must expose ``count``, ``select``,
      ``toPandas`` (via the selection) and ``show``.
    max_prints: number of non-empty snapshots to emit before returning.
    poll_seconds: pause between two polls, in seconds.
    csv_path: destination of the CSV dump.
  """
  # Imported locally so the function does not depend on another notebook
  # cell having imported `time` before it is called.
  import time

  printed = 0
  while printed < max_prints:
    print("testing for streamingDF.count()...")
    # Count once per pass: each count() is a separate evaluation, and a
    # second call could report a different number than the one just tested.
    row_count = streamingDF.count()
    if row_count > 0:
      print("Number of entries in dataframe: " + str(row_count))
      streamingDF.select("hashtag", "end", "weight").toPandas().to_csv(csv_path)
      streamingDF.show(20, False)  # False prevents Spark from truncating the output
      printed += 1
    time.sleep(poll_seconds)

In [4]:
# Read one static sample file of tweets, print the schema Spark infers for
# it, and keep that schema in `json_schema` for later use.
df = spark.read.json("/tmp/tweets/data_twitter.json")
df.printSchema()
json_schema = df.schema

In [5]:
from pyspark.sql.functions import *
import time

# Directory from which the tweet JSON files are read as a stream.
TWEET_DIR = "/tmp/tweet_small/"

# Streaming JSON reader over TWEET_DIR, using the schema held in `json_schema`.
streamingInputDF = spark.readStream.schema(json_schema).json(TWEET_DIR)

# Sanity check: evaluates to True when the DataFrame is a streaming one.
streamingInputDF.isStreaming

In [6]:
# List the contents of the tweet input directory (TWEET_DIR) using the
# Databricks `dbutils` file-system helper.
dbutils.fs.ls(TWEET_DIR)

In [7]:
# Group the tweets by hashtag and one-minute window, and sum the authors'
# follower counts per group ("weight").
# The aggregation covers every hashtag found in the stream; it is not
# restricted to the HASHTAGS list.
import math
import pyspark.sql.functions as sf

# Parse the `created_at` text into a proper timestamp column.
# NOTE(review): Spark 3's datetime-pattern documentation lists the letter 'E'
# as formatting-only; confirm this pattern still parses on the target runtime.
w1 = streamingInputDF.withColumn(
  "timestamp",
  sf.to_timestamp("created_at", "EEE MMM dd HH:mm:ss '+0000' yyyy"),
)

# Declare `timestamp` as event time with a one-minute lateness threshold.
w3 = w1.withWatermark("timestamp", "1 minute")

# One output row per (tweet, hashtag). The hashtag column is resolved by name
# against `w3` itself rather than through a column object bound to the
# upstream `streamingInputDF`.
windowedCounts = w3.select(
  "id",
  "user.followers_count",
  sf.explode(sf.col("entities.hashtags.text")).alias("hashtag"),
  "timestamp",
)

groupedCounts = windowedCounts.groupBy(
  windowedCounts.hashtag,
  sf.window(windowedCounts.timestamp, "1 minute"),
).agg(sf.sum("followers_count").alias("weight"))

# Write the aggregates to an in-memory table named "tweetstream".
# In append mode a window's row should only be emitted once the watermark has
# passed the end of that window, so results appear with some delay.
streamingETLQuery = (
  groupedCounts
    .writeStream
    .format("memory")
    .queryName("tweetstream")
    .outputMode("append")
    .start()
)

# Query the in-memory table and poll/print it.
streamingDF = spark.sql("select hashtag, window.end, weight  from tweetstream ")

printDF(streamingDF)


In [8]:
# Finally, stop the running streaming query (the one writing to the
# in-memory "tweetstream" table).
streamingETLQuery.stop()