In [9]:
from pyspark.sql import SparkSession
from operator import add

# New API: build (or reuse) the SparkSession for the standalone cluster.
# Dynamic allocation and the external shuffle service are switched off, each
# executor gets 2 cores, and the driver / block manager ports are pinned.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.207:7077")
    .appName("sentiment_analysis_group19")
    .config("spark.executor.cores", 2)
    .config("spark.dynamicAllocation.enabled", False)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", False)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Old API (RDD): the SparkContext that backs the session.
spark_context = spark_session.sparkContext

# Only log messages at ERROR level.
spark_context.setLogLevel("ERROR")

In [10]:
# necessary imports
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F

# sentiment analysis libraries
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the 'vader_lexicon' NLTK resource up front so the VADER analyzer
# can be constructed later in the notebook.
nltk.download('vader_lexicon')

In [11]:
# data from http://files.pushshift.io/reddit/comments/
# Read the JSON dump from HDFS and mark the resulting DataFrame for caching.
df = (
    spark_session.read
    .json("hdfs://192.168.2.207:9000/input/RC_2008-07")
    .cache()
)

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 0) / 2]
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ubuntu/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [20]:
# Shared VADER analyzer, created lazily on first use so the lexicon is loaded
# once per Python process instead of once per analysed text.
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared VADER analyzer, downloading the lexicon if it is missing."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        try:
            _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
        except LookupError:
            # NLTK signals a missing resource with LookupError; fetch the
            # lexicon and retry. Any other error is a real problem and is
            # allowed to propagate instead of being swallowed.
            nltk.download('vader_lexicon')
            _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def analyze_sentiment(text):
    """Determine the sentiment of a given text.

    Parameters
    ----------
    text : str or None
        The text to classify. Anything that is not a non-blank string
        (e.g. ``None`` for a null column value) is treated as neutral.

    Returns
    -------
    int
        0 if the text sentiment is neither positive nor negative
        (positive and negative scores are equal, or there is no text),
        1 if the text sentiment is mainly positive,
        -1 if the text sentiment is mainly negative.
    """
    # Guard first: missing or blank input has no sentiment, and handing a
    # non-string to the analyzer would fail.
    if not isinstance(text, str) or not text.strip():
        return 0

    scores = _get_sentiment_analyzer().polarity_scores(text)
    pos = scores['pos']
    neg = scores['neg']
    if pos == neg:
        return 0
    elif pos > neg:
        return 1
    else:
        return -1

In [14]:
# check function on two sample sentences, printing one result per line
for sample_text in ('I love you!', 'I hate you..'):
    print(analyze_sentiment(sample_text))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...


True

In [22]:
def _average_abs_score(total, count):
    """Return ``abs(total) / count``, or None when there is nothing to average."""
    if not count or total is None:
        return None
    return abs(total) / count


def apply_sentiment_analysis(path, min_score=None):
    """Compute the average score per post for negative and positive comments.

    Reads the JSON data at ``path``, labels each ``body`` with
    ``analyze_sentiment`` (-1, 0 or 1) and aggregates ``score`` per label.
    Counts and sums are printed as a side effect.

    Parameters
    ----------
    path : str
        Location of the JSON input passed to ``spark_session.read.json``.
    min_score : int or None, optional
        When given, only posts with ``score > min_score`` are considered.
        The default (None) keeps every post.

    Returns
    -------
    tuple
        ``(avg_neg, avg_pos)``: absolute value of the summed score divided by
        the number of posts, for negative and positive posts respectively.
        An entry is None when that class has no posts (or no non-null score),
        instead of raising on a division by zero.
    """
    # instantiate user defined function
    analyze_sentiment_fct = udf(analyze_sentiment, IntegerType())

    # read in data from path and keep only the columns that are needed
    df = spark_session.read.json(path).select('body', 'score')

    # optional score threshold, applied before the UDF so fewer rows are scored
    if min_score is not None:
        df = df.filter(df.score > min_score)

    # apply 'analyze_sentiment' on 'body' and save result in column 'sentiment'
    df = df.withColumn('sentiment', analyze_sentiment_fct(df.body))

    # One grouped aggregation yields the count and the total score for every
    # sentiment label in a single job, so the UDF is evaluated once per row
    # and nothing has to be cached (and later unpersisted).
    rows = (
        df.groupBy('sentiment')
        .agg(F.count('*').alias('num_posts'), F.sum('score').alias('sum_score'))
        .collect()
    )
    stats = {row['sentiment']: (row['num_posts'], row['sum_score']) for row in rows}

    # posts with negative sentiment (-1) / positive sentiment (1);
    # a label that never occurs produces no row, hence the defaults
    num_neg, sum_score_neg = stats.get(-1, (0, None))
    num_pos, sum_score_pos = stats.get(1, (0, None))

    print('num_neg ', num_neg)
    print('num_pos ', num_pos)

    print('sum_score_neg ', sum_score_neg)
    print('sum_score_pos ', sum_score_pos)

    # average score per post for negative / positive sentiment
    # NOTE(review): abs() is kept from the original implementation; it reports
    # a negative total as a positive magnitude. Confirm this is intended.
    avg_neg = _average_abs_score(sum_score_neg, num_neg)
    avg_pos = _average_abs_score(sum_score_pos, num_pos)

    return avg_neg, avg_pos

In [26]:
import time

# perf_counter() is a monotonic high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments during the job.
start_time = time.perf_counter()

path = "hdfs://192.168.2.207:9000/input/RC_2008-07"
avg_neg, avg_pos = apply_sentiment_analysis(path)

print("--- %s seconds ---" % (time.perf_counter() - start_time))


print('average score negative sentiment: ', avg_neg)
print('average score pos sentiment: ', avg_pos)

0

In [None]:
# Paths of the additional input files; still placeholders to be filled in.
file_1_path = ""
file_2_path = ""
file_3_path = ""
file_4_path = ""

files = [file_1_path, file_2_path, file_3_path, file_4_path]

# Maps each processed path to its (avg_neg, avg_pos) result.
results = {}
for file in files:
    if not file:
        # Placeholder path that has not been filled in yet: nothing to read.
        continue
    results[file] = apply_sentiment_analysis(file)

In [8]:
# Stop via the session rather than the bare context: this stops the underlying
# SparkContext and also clears the registered session, so a later
# getOrCreate() starts from a clean state.
spark_session.stop()