# In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep
from textblob import TextBlob

# Connection and resource settings for the standalone Spark cluster.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("SparkStreamer")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The Spark session is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Subscribe to the "tweet" Kafka topic as a streaming source, starting from
# the earliest available offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("subscribe", "tweet")
    .option("startingOffsets", "earliest")
    .load()
)

# Kafka delivers the payload as bytes; expose it as a string column.
lines = df.selectExpr("CAST(value AS STRING)")

def sentiment_analysis(df, batch_id):
    """Label every tweet in a micro-batch and append the result to BigQuery.

    Intended to be used as a ``foreachBatch`` callback. Each row's ``tweet``
    text is scored with TextBlob and classified as ``positive``,
    ``negative`` or ``neutral`` from the sign of its polarity. The batch is
    then written to the ``tweetSentiments`` table with the columns
    ``author``, ``time``, ``message`` and ``sentiment``.

    Args:
        df: Batch DataFrame for the current micro-batch. It must contain the
            columns ``tweet``, ``user``, ``race`` and ``text``; a missing
            column makes the select fail.
        batch_id: Micro-batch identifier supplied by ``foreachBatch``
            (unused).
    """
    # Local import: ``udf`` is not part of the file-level import list.
    from pyspark.sql.functions import udf

    def _label(text):
        """Return the sentiment label for one tweet, or None for null text."""
        if text is None:
            # TextBlob only accepts strings; keep null rows as null labels.
            return None
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0:
            return 'positive'
        if polarity < 0:
            return 'negative'
        return 'neutral'

    # TextBlob works on plain Python strings, so it has to run per row inside
    # a UDF rather than on a DataFrame/Column object.
    # NOTE(review): this requires textblob to be importable on the executors.
    label_udf = udf(_label, StringType())

    # NOTE(review): the source column "race" is exported as "time", as in the
    # original field mapping -- confirm this is the intended column.
    scored = df.select(
        col("user").alias("author"),
        col("race").alias("time"),
        col("text").alias("message"),
        label_udf(col("tweet")).alias("sentiment"),
    )

    # Append, not overwrite: this runs once per micro-batch, and overwriting
    # would leave only the most recent batch in the table.
    # NOTE(review): no temporary GCS bucket / write method is configured here;
    # confirm the BigQuery connector is configured for writes elsewhere.
    scored \
      .write.format('bigquery') \
      .option('table', 'de2021-324520.labdataset.tweetSentiments') \
      .mode("append") \
      .save()

# Start the streaming query: every micro-batch is handed to
# sentiment_analysis via foreachBatch.
# - "append" is used because "complete" output mode is only accepted for
#   queries that contain a streaming aggregation, which this one does not.
# - start() is called exactly once; it returns a StreamingQuery, which has
#   no start() method of its own.
# NOTE(review): df here is the raw Kafka frame (key/value/topic/...), while
# sentiment_analysis reads tweet-level columns -- confirm the Kafka value is
# parsed into those columns before it reaches this query.
query = (
    df.writeStream
    .outputMode("append")
    .option("checkpointLocation", "/home/jovyan/checkpoint")
    .foreachBatch(sentiment_analysis)
    .start()
)

try:
    # Block until the query terminates or fails.
    query.awaitTermination()
except KeyboardInterrupt:
    # Manual interruption: shut down cleanly.
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")
except Exception as exc:
    # Any other failure: shut down, and report what went wrong instead of
    # discarding the error.
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Unexpected error")
    print(repr(exc))
    print("Stoped the streaming query and the spark context")