This notebook applies VADER sentiment analysis to each submission in the already-processed Reddit dataset.

In [1]:
%%configure -f
{
  "conf": {
    "spark.pyspark.python": "python3",
    "spark.pyspark.virtualenv.enabled": "true",
    "spark.pyspark.virtualenv.type": "native",
    "spark.pyspark.virtualenv.bin.path": "/usr/bin/virtualenv"
  }
}

In [2]:
#Since I needed a nuanced score of sentiment and not just labels, I'm using
#VADER instead of the Spark NLP built-in functions
#https://github.com/cjhutto/vaderSentiment?tab=readme-ov-file#code-examples
#The version is pinned (3.3.2 is the one this notebook was run with) so that
#re-running the notebook installs the same analyzer, like the other two packages
sc.install_pypi_package("vaderSentiment==3.3.2", "https://pypi.org/simple")

#Pandas and Pyarrow packages will be used for creating the Pandas UDF
sc.install_pypi_package("pandas==1.0.5")
sc.install_pypi_package("pyarrow==0.15.1", "https://pypi.org/simple")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
8,application_1716476646286_0010,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting vaderSentiment
  Using cached https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl
Collecting requests (from vaderSentiment)
  Using cached https://files.pythonhosted.org/packages/70/8e/0e2d847013cb52cd35b38c009bb167a1a26b2ce6cd6965bf26b47bc0bf44/requests-2.31.0-py3-none-any.whl
Collecting charset-normalizer<4,>=2 (from requests->vaderSentiment)
  Using cached https://files.pythonhosted.org/packages/28/76/e6222113b83e3622caa4bb41032d0b1bf785250607392e1b778aca0b8a7d/charset_normalizer-3.3.2-py3-none-any.whl
Collecting certifi>=2017.4.17 (from requests->vaderSentiment)
  Using cached https://files.pythonhosted.org/packages/ba/06/a07f096c664aeb9f01624f858c3add0a4e913d6c96257acb4fce61e7de14/certifi-2024.2.2-py3-none-any.whl
Collecting idna<4,>=2.5 (from requests->vaderSentiment)
  Using cached https://files.pythonhosted.org/packages/e5/3e/741d8c82801c347547f8a2a06aa57dbb1992be9e948df2

In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyspark.sql.functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
#Load the already processed submissions (with topic labels) from S3
df = spark.read.format("parquet").load("s3://finalproject-nat-s3/data_withtopic_labels/*.parquet")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
#Count the rows whose text is missing
df.where(F.col("entire_text").isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

157

In [6]:
#Some rows were not correctly cleaned up in the earlier processing, so those rows are removed here
#(I would clean up the original file again, but the topic modeling was too computationally expensive to re-run)
has_text = F.col("entire_text").isNotNull()
df_filtered = df.where(has_text)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
#Confirm that no rows with missing text remain (expected count: 0)
df_filtered.where(F.col("entire_text").isNull()).count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [13]:
#Create a user defined function that gets the level of negativity of each text.

#Since the subreddit is mainly for discussing problems, I do not expect
#positive texts. Instead, I want to capture the nuances of negativity:
#which topics are discussed with more or less intensity of negativity?

analyzer = SentimentIntensityAnalyzer()

#Use a pandas udf to optimize applying the function to all rows
#https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.functions.pandas_udf.html

@pandas_udf('float', PandasUDFType.SCALAR)
def get_negativity_score_udf(text: pd.Series) -> pd.Series:
    """Return the VADER 'neg' score for every text in the batch.

    Parameters
    ----------
    text : pd.Series
        Batch of texts to score.

    Returns
    -------
    pd.Series
        The 'neg' entry of ``analyzer.polarity_scores`` for each text, as
        float64. Entries that are not strings (e.g. missing values) yield
        NaN instead of raising inside the analyzer.
    """
    def _neg_score(value):
        #The analyzer only handles strings; leave anything else (e.g. nulls) missing
        if not isinstance(value, str):
            return None
        return analyzer.polarity_scores(value)['neg']

    #Cast explicitly so the batch is always numeric, even when it is empty
    #or contains missing entries
    return text.apply(_neg_score).astype('float64')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [14]:
#Check that executors have enough memory for persisting the DataFrame
#(read through the public getConf() accessor rather than the private _conf attribute)
sc.getConf().get('spark.executor.memory')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'18971M'

In [15]:
#Mark the cleaned DataFrame for caching, then add the negativity score computed from the text column
df_filtered.persist()
negativity_col = get_negativity_score_udf(F.col('entire_text'))
df_w_neg_scores = df_filtered.withColumn('negativity_score', negativity_col)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
#Preview the first rows together with their negativity score
df_w_neg_scores.show(n=5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-----+------------+--------------------+----+--------------------+----------------+
|     id|score|num_comments|         entire_text|year|         topic_label|negativity_score|
+-------+-----+------------+--------------------+----+--------------------+----------------+
| acnfj5|   24|          30|is it reasonable ...|2019|       Unclear topic|           0.054|
| emdrc4|    3|           8|im now my ex and ...|2020|       Unclear topic|           0.092|
|15gdohy|    1|           3|my 44m boyfriend ...|2023|       Unclear topic|           0.089|
|161n6ru|    1|           1|how do i 18f cont...|2023|Romantic relation...|           0.114|
| cyp5td|    2|           7|how do i get peop...|2019|       Unclear topic|           0.078|
+-------+-----+------------+--------------------+----+--------------------+----------------+
only showing top 5 rows

In [None]:
#Repartition into 10 partitions and save the scores into S3 for later visualization
#(any existing data at the destination is overwritten)
df_repartitioned = df_w_neg_scores.repartition(numPartitions=10)
df_repartitioned.write.mode('overwrite').parquet("s3://finalproject-nat-s3/final_data")