In [None]:
!pip install textblob

from time import sleep

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, from_json, udf
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, DateType, BooleanType
from textblob import TextBlob

# Schema used to parse the JSON payload of each tweet message.
# Every field is declared nullable, so a missing key yields a null column value.
dataSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user_name", StringType()),
        ("user_location", StringType()),
        ("user_description", StringType()),
        ("user_created", DateType()),
        ("user_followers", DoubleType()),
        ("user_friends", DoubleType()),
        ("user_favourites", DoubleType()),
        ("user_verified", BooleanType()),
        ("date", DateType()),
        ("text", StringType()),
        ("hashtags", StringType()),
        ("source", StringType()),
        ("is_retweet", BooleanType()),
        ("sentiment", StringType()),
    )
])

In [None]:
def _label_sentiment(text):
    """Classify one tweet text by the sign of its TextBlob polarity.

    Args:
        text: the tweet text, or None when the row has no text.

    Returns:
        'positive', 'negative' or 'neutral'; None when ``text`` is None so the
        row keeps a null sentiment instead of crashing the batch.
    """
    if text is None:
        return None
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'


def sentiment_analysis(df, batch_id):
    """Label every tweet of a micro-batch with its sentiment and append it to BigQuery.

    Written to be used as a ``foreachBatch`` callback.

    Args:
        df: micro-batch DataFrame; must contain a string ``text`` column.
        batch_id: micro-batch identifier passed by ``foreachBatch`` (unused).

    Returns:
        None. An empty micro-batch is skipped without writing anything.

    Note:
        The scoring UDF runs on the Spark executors, so ``textblob`` has to be
        installed on the workers as well as on the driver.
    """
    # Nothing to score or write for an empty micro-batch (first() returns None).
    if df.first() is None:
        return

    # Score each row from its own text; a single label computed from the first
    # row would be wrong for every other tweet in the batch.
    label_sentiment = udf(_label_sentiment, StringType())
    scored = df.withColumn("sentiment", label_sentiment(col("text")))

    # Append: this runs once per micro-batch, and "overwrite" would replace the
    # table each time, leaving only the most recent batch in BigQuery.
    scored \
      .write.format('bigquery') \
      .option('table', 'de2022-362622.assignmentDatasets.tweetSentiments') \
      .mode("append") \
      .save()


In [None]:
# Cluster address, application name and resource settings for this job.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("SparkStreamer")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Build (or reuse) the SparkSession, the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Cloud Storage bucket the BigQuery connector uses for its temporary export data.
bucket = "de_jads_temp-401"
spark.conf.set('temporaryGcsBucket', bucket)

# Subscribe to the "tweet" Kafka topic as a streaming source
# (startingOffsets is "earliest").
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("subscribe", "tweet")
    .option("startingOffsets", "earliest")
    .load()
)

# Parse each message value as JSON into the dataSchema columns and pass every
# micro-batch to sentiment_analysis. foreachBatch is the sink, so no output
# format or Kafka topic is configured on the writer; only the checkpoint
# location is needed.
query = (
    df
    .select(from_json(col("value").cast("string"), dataSchema).alias("parsed_value"))
    .select(col("parsed_value.*"))
    .writeStream
    .option("checkpointLocation", "/home/jovyan/checkpoint")
    .foreachBatch(sentiment_analysis)
    .start()
)

# Block until the stream ends, then always release the query and the Spark
# session, whether it ended normally, by interrupt, or with an error.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end the stream.
    pass
except Exception as e:
    print(e)
    print("Unexpected error")
finally:
    try:
        query.stop()
    finally:
        # Stop the spark context even if stopping the query failed.
        spark.stop()
        print("Stoped the streaming query and the spark context")