In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Create the Spark session with its resource settings supplied to the builder.
# Executor/driver sizing is read when the SparkContext starts; the previous version of
# this cell pushed these values into `spark.sparkContext._conf` *after* getOrCreate(),
# at which point the running application no longer picks them up. It also set a
# second, conflicting `spark.app.name`; a single application name is used here.
# NOTE(review): if a session already exists in this kernel, getOrCreate() returns it
# and these startup-only settings are not re-applied — restart the kernel to change them.
spark = (
    SparkSession.builder
    .appName('Stackoverflow_Project')
    .config('spark.executor.memory', '5g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config('spark.driver.memory', '8g')
    .getOrCreate()
)

# `conf` was exposed by the earlier version of this cell; keep it available as a
# read-only snapshot of the effective configuration.
conf = spark.sparkContext.getConf()

In [3]:
# Load the extracted Stack Overflow dataset from GCS.
# Parquet stores its own schema and column types, so the CSV-only reader options that
# used to be passed here (quote, escape, ignoreLeadingWhiteSpace, inferSchema, header)
# had no effect on the result and have been removed.
df = spark.read.parquet("gs://msca-bdp-student-gcs/Group6/extracted_StackOverflow.parquet")

                                                                                

In [4]:
# Total number of rows loaded from the parquet dataset.
df.count()

                                                                                

2605413

In [5]:
# Preview the first 5 rows; the frame has many columns, so the rendered table is very wide.
df.show(5)

24/11/28 18:02:56 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+--------+--------------------+--------------------+-----------------------+-----------------+------------------+------------------+-------------------+-----------------------+-------------------+------------------------+------------------+-----------------+----------+--------------------+---------------+---------+--------------------+--------------------+--------------------+-------------------------+---------------------+--------------------------+--------------------+----------------+-------------------+------------+--------------------+--------------------+
| post_id|          post_title|           post_body|post_accepted_answer_id|post_answer_count|post_comment_count|post_creation_date|post_favorite_count|post_last_activity_date|post_last_edit_date|post_last_editor_user_id|post_owner_user_id|post_post_type_id|post_score|           post_tags|post_view_count|answer_id|         answer_body|answer_comment_count|answer_creation_date|answer_last_activity_date|answer_last_edit_date|answe

                                                                                

In [6]:
df.printSchema()

root
 |-- post_id: long (nullable = true)
 |-- post_title: string (nullable = true)
 |-- post_body: string (nullable = true)
 |-- post_accepted_answer_id: long (nullable = true)
 |-- post_answer_count: long (nullable = true)
 |-- post_comment_count: long (nullable = true)
 |-- post_creation_date: date (nullable = true)
 |-- post_favorite_count: long (nullable = true)
 |-- post_last_activity_date: date (nullable = true)
 |-- post_last_edit_date: date (nullable = true)
 |-- post_last_editor_user_id: long (nullable = true)
 |-- post_owner_user_id: long (nullable = true)
 |-- post_post_type_id: long (nullable = true)
 |-- post_score: long (nullable = true)
 |-- post_tags: string (nullable = true)
 |-- post_view_count: long (nullable = true)
 |-- answer_id: long (nullable = true)
 |-- answer_body: string (nullable = true)
 |-- answer_comment_count: long (nullable = true)
 |-- answer_creation_date: long (nullable = true)
 |-- answer_last_activity_date: date (nullable = true)
 |-- answer_last

In [7]:
# Count the rows that remain after dropping every row containing at least one null,
# i.e. the number of fully populated rows (not the number of rows with missing values).
df.dropna().count() # filters nulls only; this does not check for duplicate rows

                                                                                

369090

In [8]:
# Show the raw post body next to the post_body_text and post_body_code columns
# (presumably the cleaned prose and the extracted code of the same post — verify upstream).
df.select(
    'post_body',
    'post_body_text',
    'post_body_code',
).show(n=10)

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|           post_body|      post_body_text|      post_body_code|
+--------------------+--------------------+--------------------+
|<p>I'm just learn...| i m just learnin...|[zero or more num...|
|<p>I am trying to...| i am trying to c...|Stylecss  &lt;a c...|
|<p>i want to decl...| i want to declar...|                    |
|<p>How to fetch <...| how to fetch ful...|{ rows : 10 os : ...|
|<p>I have been lo...| i have been look...|std::cin class A ...|
|<p>How do I acces...| how do i access ...|rand = nprandomRa...|
|<p>Hello I have a...| hello i have a v...|                    |
|<code>var people ...| for example i ne...|var people = [ {f...|
|<p>I tried runnin...| i tried running ...|webpack watch web...|
|<p>The following ...| the following wi...|day = input('What...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [9]:
# One row per distinct post_body_text, with the number of rows in `df` that share it.
# NOTE(review): grouping on the text itself merges different posts whose text is
# identical (e.g. an empty string) — confirm whether post_id should be the key instead.
posts_df = df.groupBy('post_body_text').agg(
    F.count('*').alias('Count of Answers')
)

In [10]:
# Peek at a few of the grouped rows.
posts_df.show(5)



+--------------------+----------------+
|      post_body_text|Count of Answers|
+--------------------+----------------+
| is there any fas...|               5|
| i am using strin...|               7|
| i have a problem...|               5|
| i have to create...|               6|
| curretly i am us...|               7|
+--------------------+----------------+
only showing top 5 rows



                                                                                

In [11]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import UniversalSentenceEncoder, SentimentDLModel
from pyspark.ml import Pipeline

# Stage 1: turn the text column into the "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("post_body_text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder. Its "sentence_embeddings" output is
# the input column of the sentiment classifier below, so this stage is required.
use_embeddings = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained SentimentDL classifier that writes its prediction to "sentiment".
sentiment_dl = (
    SentimentDLModel.pretrained("sentimentdl_use_twitter", "en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Model choice: "sentimentdl_use_imdb" is the other pretrained option, but it is trained
# on reviews; tweets are more generic text, so the twitter model is used for this data.

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ / ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ — ]Download done! Loading the resource.


[Stage 12:>                                                         (0 + 1) / 1]

[ \ ]

                                                                                

[ | ]

2024-11-28 18:06:11.851546: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[ / ]sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[ — ]Download done! Loading the resource.


24/11/28 18:06:30 WARN org.apache.hadoop.util.concurrent.ExecutorHelper: Thread (Thread[GetFileInfo #1,5,main]) interrupted: 
java.lang.InterruptedException
	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:510)
	at com.google.common.util.concurrent.FluentFuture$TrustedFuture.get(FluentFuture.java:88)
	at org.apache.hadoop.util.concurrent.ExecutorHelper.logThrowableFromAfterExecute(ExecutorHelper.java:48)
	at org.apache.hadoop.util.concurrent.HadoopThreadPoolExecutor.afterExecute(HadoopThreadPoolExecutor.java:90)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1157)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
24/11/28 18:06:31 WARN org.apache.hadoop.util.concurrent.ExecutorHelper: Thread (Thread[GetFileInfo #1,5,main]) interrupted: 
java.lang.InterruptedException
	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:510)
	at c

[OK!]


In [12]:
# Chain the stages in dependency order:
# document assembler -> sentence embeddings -> sentiment classifier.
nlp_pipeline = Pipeline(stages=[document_assembler, use_embeddings, sentiment_dl])

In [13]:
# Fit the pipeline on the grouped posts to obtain a fitted pipeline model.
# Only fitting happens here; the transform is applied in a later cell.
nlp_model = nlp_pipeline.fit(posts_df)
nlp_model

PipelineModel_812762293135

In [14]:
# Apply the fitted pipeline to the grouped posts; the stages add their output
# columns ("document", "sentence_embeddings", "sentiment") to the frame.
sentiment_df = nlp_model.transform(posts_df)

In [15]:
# Keep the post text, its answer count and the predicted label(s).
# Selecting the nested field "sentiment.result" produces a column named "result".
sentiment_df_2 = sentiment_df.select(
    "post_body_text",
    "Count of Answers",
    "sentiment.result",
)
sentiment_df_2.show(n=5, truncate=False)

[Stage 20:>                                                         (0 + 1) / 1]

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+----------+
|post_body_text                                                                                                                                                                                                                                                                                                                         

                                                                                

In [16]:
# Encode the first predicted label as a number:
# positive -> 1.0, negative -> 0.0, anything else (neutral or no label) -> 0.5.
sentiment_df_2 = sentiment_df_2.withColumn(
    "sentiment_score",
    F.when(F.col("result")[0] == "positive", 1.0)
     .when(F.col("result")[0] == "negative", 0.0)
     .otherwise(0.5),
)

In [17]:
# Check the column names and types after adding sentiment_score.
sentiment_df_2.printSchema()

root
 |-- post_body_text: string (nullable = true)
 |-- Count of Answers: long (nullable = false)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sentiment_score: double (nullable = false)



In [18]:
# Distribution of predicted labels, alongside the numeric score each one maps to.
sentiment_df_2.groupBy("result", "sentiment_score").count().show()

                                                                                

+----------+---------------+------+
|    result|sentiment_score| count|
+----------+---------------+------+
|[negative]|            0.0|272597|
|[positive]|            1.0| 89689|
| [neutral]|            0.5|  5050|
+----------+---------------+------+



In [19]:
# Posts with the highest "Count of Answers" (the sort key is the answer count, not view count).
# In this run the top 5 are all labelled positive.
sentiment_df_2.orderBy("Count of Answers",ascending=False).show(5)



+--------------------+----------------+----------+---------------+
|      post_body_text|Count of Answers|    result|sentiment_score|
+--------------------+----------------+----------+---------------+
|                    |            1808|[positive]|            1.0|
| what is the best...|             518|[positive]|            1.0|
| this is definite...|             407|[positive]|            1.0|
| what is in your ...|             320|[positive]|            1.0|
| this came to my ...|             296|[positive]|            1.0|
+--------------------+----------------+----------+---------------+
only showing top 5 rows



                                                                                

In [22]:
# Posts with the lowest "Count of Answers" (the sort key is the answer count, not view count).
# These do not necessarily have a negative sentiment.
sentiment_df_2.orderBy("Count of Answers").show(5)



+--------------------+----------------+----------+---------------+
|      post_body_text|Count of Answers|    result|sentiment_score|
+--------------------+----------------+----------+---------------+
| i need to create...|               1|[positive]|            1.0|
| we are going to ...|               1|[positive]|            1.0|
| i m trying to cr...|               1|[negative]|            0.0|
| what is a qualit...|               1|[positive]|            1.0|
| what tools are u...|               1|[positive]|            1.0|
+--------------------+----------------+----------+---------------+
only showing top 5 rows



                                                                                

In [20]:
# Correlation coefficient between the numeric sentiment score and the answer count.
correlation = sentiment_df_2.corr("sentiment_score", "Count of Answers")

                                                                                

In [21]:
# A coefficient close to 0 indicates essentially no linear relationship between
# sentiment and the number of answers.
print(f"Correlation between sentiment and number of answers: {correlation}")

Correlation between sentiment and number of answers: 0.017798187721007528


In [23]:
# Repeat the analysis with the post favorite count as a second engagement measure.
# Per distinct post_body_text this computes the number of rows and the average of
# post_favorite_count over those rows (the average is null when no row has a value).

posts_df_2 = df.groupBy('post_body_text') \
    .agg(
        F.count('*').alias('Count of Answers'),
        F.avg('post_favorite_count').alias('Post Favorite Count')
    )

posts_df_2.show()



+--------------------+----------------+-------------------+
|      post_body_text|Count of Answers|Post Favorite Count|
+--------------------+----------------+-------------------+
| maybe a basic qu...|               8|                5.0|
| initialization w...|               5|               null|
| how do you write...|               9|                7.0|
| say i have a str...|               8|               null|
| writing a ton of...|               5|                1.0|
| a beginner quest...|               8|                4.0|
| we are working o...|              11|               11.0|
| the toolbox in v...|               5|                4.0|
| i m looking for ...|               6|               null|
| i have a program...|              10|               59.0|
| we are a group o...|               5|               null|
| in css when sett...|               9|                2.0|
| i will phrase th...|               7|                0.0|
| i m planning to ...|              15| 

                                                                                

In [24]:
# Fit the same pipeline definition on the second grouped frame and display the fitted model.
nlp_model_2 = nlp_pipeline.fit(posts_df_2)
nlp_model_2

PipelineModel_6a7c6b4558c0

In [35]:
# Apply the second fitted pipeline to the frame that also carries the favorite count.
sentiment_df_v2 = nlp_model_2.transform(posts_df_2)

In [36]:
# Keep the text, both engagement measures and the predicted label(s); the nested
# field "sentiment.result" becomes a column named "result".
# NOTE(review): this rebinds sentiment_df_v2 to a frame that no longer has the
# "sentiment" column, so re-running this cell on its own fails until the transform
# cell is run again.
sentiment_df_v2 = sentiment_df_v2.select("post_body_text", "Count of Answers",
                                         "Post Favorite Count","sentiment.result")

In [37]:
# Encode the first predicted label as a number:
# positive -> 1.0, negative -> 0.0, anything else (neutral or no label) -> 0.5.
sentiment_df_v2 = sentiment_df_v2.withColumn(
    "Sentiment Score",
    F.when(F.col("result")[0] == "positive", 1.0)
     .when(F.col("result")[0] == "negative", 0.0)
     .otherwise(0.5),
)

# Verify the resulting column names and types.
sentiment_df_v2.printSchema()

root
 |-- post_body_text: string (nullable = true)
 |-- Count of Answers: long (nullable = false)
 |-- Post Favorite Count: double (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Sentiment Score: double (nullable = false)



In [39]:
# Compute the correlation between the sentiment score and the post favorite count.
# NOTE(review): "Post Favorite Count" is nullable and contains nulls for some posts —
# confirm how corr() treats those rows before relying on this number.
correlation_2 = sentiment_df_v2.stat.corr("Sentiment Score", "Post Favorite Count")

                                                                                

In [40]:
# Report the coefficient for sentiment vs. favorite count.
print(f"Correlation between sentiment and post favorite count: {correlation_2}")

Correlation between sentiment and post favorite count: 0.03869474162682203


In [41]:
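# This correlation is also close to zero, so sentiment does not appear to be linearly related to the favorite count either.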