In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import desc
from collections import namedtuple

from __future__ import print_function
import sys
from pyspark.sql import Row, SparkSession
import io
import requests

In [None]:
def getSparkSessionInstance(sparkConf):
    """Return the lazily created SparkSession cached in this module's globals.

    The first call builds a session from ``sparkConf`` and stores it under the
    ``sparkSessionSingletonInstance`` global; every later call returns that
    same stored object and ignores ``sparkConf``.
    """
    module_state = globals()
    cache_key = 'sparkSessionSingletonInstance'

    # Fast path: a session was already created by an earlier call.
    if cache_key in module_state:
        return module_state[cache_key]

    session_builder = SparkSession.builder.config(conf=sparkConf)
    module_state[cache_key] = session_builder.getOrCreate()
    return module_state[cache_key]

if __name__ == "__main__":
    # Attach to the already-running SparkContext if there is one, otherwise
    # create a new one.
    sc = SparkContext.getOrCreate()
    # Streaming context on top of it, with a batch interval of 5.
    ssc = StreamingContext(sparkContext=sc, batchDuration=5)

In [None]:
# Text stream read from a TCP socket.
# NOTE(review): 192.0.0.1 is an unusual host address - confirm it is intended
# (and not, e.g., 127.0.0.1).
socket_stream = ssc.socketTextStream(hostname="192.0.0.1", port=7799)

In [None]:
# Windowed view over the socket stream with a window duration of 10.
lines = socket_stream.window(windowDuration=10)

In [None]:
# Field names of the Tweet record: a hashtag and how often it occurred.
fields = ("tag", "count")
# Lightweight immutable record type built from those field names.
Tweet = namedtuple(typename='Tweet', field_names=fields)

In [None]:
# Split every line on single spaces, then keep only the tokens that start
# with '#'.
hashtags = (
    lines
    .flatMap(lambda line: line.split(" "))
    .filter(lambda token: token.lower().startswith("#"))
)

In [None]:
def send_df_to_dashboard(df, url='http://192.0.0.1:5001/updateData', timeout=10):
    """POST the hashtags and their counts from ``df`` to the dashboard.

    Parameters
    ----------
    df : DataFrame-like
        Must expose ``topic_tweet`` and ``Count`` columns via
        ``select(...).collect()``.
    url : str, optional
        Endpoint that receives the form-encoded payload. Defaults to the
        address originally hard-coded in this function.
    timeout : float, optional
        Seconds to wait for the dashboard before ``requests`` raises, so an
        unreachable dashboard cannot block the caller indefinitely.

    The payload has two form fields: ``label`` (string form of the list of
    tags) and ``data`` (string form of the list of counts), in matching order.
    """
    # Collect once: two separate collect() calls would evaluate the DataFrame
    # twice, and the labels and counts could then come from different result
    # sets and no longer line up.
    rows = df.select("topic_tweet", "Count").collect()
    top_tags = [str(row.topic_tweet) for row in rows]
    tags_count = [row.Count for row in rows]

    request_data = {'label': str(top_tags), 'data': str(tags_count)}
    requests.post(url, data=request_data, timeout=timeout)

In [None]:
def get_tweets(time, rdd):
    """Rank the hashtags in one batch and push the top 10 to the dashboard.

    Parameters
    ----------
    time :
        Batch timestamp passed by ``foreachRDD``; only used for logging.
    rdd :
        RDD of hashtag strings for this batch.

    The hashtags are loaded into a one-column DataFrame (``topic_tweet``),
    registered as the temp view ``tweets``, aggregated with Spark SQL, shown,
    and handed to ``send_df_to_dashboard``. Failures are reported on stderr
    instead of being raised, so one bad batch does not stop the stream.
    """
    print("========= %s =========" % str(time))

    # Nothing to rank in an empty batch; createDataFrame also cannot infer a
    # schema from an empty RDD, so skip it explicitly rather than relying on
    # an exception being swallowed.
    if rdd.isEmpty():
        return

    try:
        header = ["topic_tweet"]
        spark = getSparkSessionInstance(rdd.context.getConf())
        # Wrap each hashtag in a one-element tuple so it becomes a single-column row.
        row_rdd = rdd.map(lambda tag: (tag,))
        tweets_df = spark.createDataFrame(row_rdd, header)
        tweets_df.createOrReplaceTempView("tweets")
        trend_df = spark.sql(
            "select topic_tweet,count(*) as Count from tweets group by topic_tweet order by count(*) desc limit 10"
        )
        trend_df.show()
        send_df_to_dashboard(trend_df)
    except Exception as exc:
        # Best-effort per batch: keep the stream alive but make the failure
        # visible. KeyboardInterrupt/SystemExit are deliberately not caught.
        print("get_tweets: batch %s failed: %r" % (str(time), exc), file=sys.stderr)

In [None]:
# Register get_tweets as the callback applied to each RDD of the hashtags
# stream; it is invoked with the arguments (time, rdd).
hashtags.foreachRDD(get_tweets)

In [None]:
# Start the streaming computation defined by the cells above.
# NOTE(review): no awaitTermination()/stop() call is visible in this notebook
# chunk - confirm how the stream is meant to be shut down.
ssc.start()