In [9]:
%%configure -f
{
    "conf": {
        "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.3.1",
        "spark.pyspark.python": "python3",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type":"native",
        "spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv",
        "spark.driver.memory":"6000M"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1685120423611_0008,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1685120423611_0008,pyspark,idle,Link,Link,✔


In [10]:
sc.install_pypi_package('spark-nlp')
sc.install_pypi_package("boto3==1.19.2")
sc.install_pypi_package("pandas==1.0.5")
sc.install_pypi_package("scipy==1.4.1")
sc.install_pypi_package("matplotlib==3.2.1")
sc.install_pypi_package("seaborn==0.10.1")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting spark-nlp
  Using cached https://files.pythonhosted.org/packages/e2/88/943fb14a2b024bf328bcc448837f75114ac97478db9def2e2042b2818aaa/spark_nlp-4.4.3-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-4.4.3

In [12]:
spark.sparkContext.getConf().get('spark.driver.memory')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'6000M'

In [13]:
data_comments = spark.read.parquet('s3://reddit-parquet/comments.parquet')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
data_comments.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- subreddit: string (nullable = true)
 |-- post_id: integer (nullable = true)
 |-- comment_score: long (nullable = true)
 |-- comment_body: string (nullable = true)
 |-- post_url: string (nullable = true)
 |-- comment_id: string (nullable = true)

In [15]:
data_posts = spark.read.parquet('s3://reddit-parquet/posts.parquet')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
data_posts.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- subreddit: string (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- score: long (nullable = true)
 |-- url: string (nullable = true)
 |-- comms_num: long (nullable = true)
 |-- body: string (nullable = true)
 |-- ups: long (nullable = true)

In [17]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Data Cleaning
##### Remove deleted posts/comments

### Define Spark NLP pipeline

In [18]:
MODEL_NAME='classifierdl_use_emotion'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [39]:
def pipe(column_name):
    documentAssembler = DocumentAssembler() \
        .setInputCol(column_name) \
        .setOutputCol("document")

    use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en") \
        .setInputCols(["document"]) \
        .setOutputCol("sentence_embeddings")

    sentimentdl = ClassifierDLModel.pretrained(name=MODEL_NAME) \
        .setInputCols(["sentence_embeddings"]) \
        .setOutputCol("sentiment")

    nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])
    return nlpPipeline

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [48]:
def run_pipeline(column_name, df):
    text = df.select(column_name)
    pipeline = pipe(column_name)
    cls_results = pipeline.fit(text).transform(text)
    return cls_results

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

##### Classify posts body

In [20]:
data_posts[['body']].show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|                body|
+--------------------+
|#Community Guidel...|
|I want to see if ...|
|How new does a fi...|
|What a trip man w...|
|May (2002), Excis...|
+--------------------+
only showing top 5 rows

In [41]:
posts_body_cls = run_pipeline('body', data_posts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_emotion download started this may take some time.
Approximate size to download 21.3 MB
[OK!]

In [42]:
posts_body_cls.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------------+--------------------+--------------------+
|                body|            document| sentence_embeddings|           sentiment|
+--------------------+--------------------+--------------------+--------------------+
|#Community Guidel...|[[document, 0, 11...|[[sentence_embedd...|[[category, 0, 11...|
|I want to see if ...|[[document, 0, 17...|[[sentence_embedd...|[[category, 0, 17...|
|How new does a fi...|[[document, 0, 22...|[[sentence_embedd...|[[category, 0, 22...|
|What a trip man w...|[[document, 0, 21...|[[sentence_embedd...|[[category, 0, 21...|
|May (2002), Excis...|[[document, 0, 23...|[[sentence_embedd...|[[category, 0, 23...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

In [43]:
posts_body_cls.select(F.explode(F.arrays_zip(posts_body_cls.document.result, 
                                     posts_body_cls.sentiment.result)).alias("cols")) \
      .select(F.expr("cols['0']").alias("document"),
              F.expr("cols['1']").alias("sentiment")).show(truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+
|            document|sentiment|
+--------------------+---------+
|#Community Guidel...|     fear|
|I want to see if ...|     fear|
|How new does a fi...|     fear|
|What a trip man w...|     fear|
|May (2002), Excis...|     fear|
|Hey,  
I have bee...|     fear|
|The sub needs to ...|      joy|
|I’m looking for m...|     fear|
|While it is under...|     fear|
|Just watched *Don...|     fear|
|I will try and ed...|     fear|
|Please suggest so...|      joy|
|I regularly will ...|     fear|
|[Fast X](https://...|      joy|
|I have a friend I...|     fear|
|I watched parasit...|     fear|
|You cannot win a ...|     fear|
|Washington D.C. C...|     fear|
|I’m Sebastian Man...|      joy|
|I'm craving some ...|     fear|
+--------------------+---------+
only showing top 20 rows

##### Classify posts title

In [21]:
data_posts[['title']].show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|               title|
+--------------------+
|Using This Subreddit|
|Movies that start...|
|What is the oldes...|
|Just watched Mand...|
|The most underrat...|
+--------------------+
only showing top 5 rows

In [45]:
posts_title_cls = run_pipeline('title', data_posts)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_emotion download started this may take some time.
Approximate size to download 21.3 MB
[OK!]

In [46]:
posts_title_cls.select(F.explode(F.arrays_zip(posts_title_cls.document.result, 
                                     posts_title_cls.sentiment.result)).alias("cols")) \
      .select(F.expr("cols['0']").alias("document"),
              F.expr("cols['1']").alias("sentiment")).show(truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+
|            document|sentiment|
+--------------------+---------+
|Using This Subreddit| surprise|
|Movies that start...|     fear|
|What is the oldes...|     fear|
|Just watched Mand...|      joy|
|The most underrat...|     fear|
|Recommendations o...|     fear|
|Need Links/Phone ...|     fear|
|9-5 through the a...|     fear|
|rule 7 applies to...| surprise|
|Movies that are a...|  sadness|
|Lawsuit Tracker T...|     fear|
|Movie that show c...|  sadness|
|The “pregnancy wo...|  sadness|
|Official Discussi...| surprise|
|[Request] Could s...|     fear|
|Looking to get in...|  sadness|
|Stop Calling Them...|     fear|
|Washington D.C. C...|     fear|
|Hi, it’s Sebastia...|      joy|
|Seeking Movie Rec...|     fear|
+--------------------+---------+
only showing top 20 rows

##### Classify comments body

In [24]:
data_comments[['comment_body']].show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|        comment_body|
+--------------------+
|The founders were...|
|All I do is point...|
|Well stated Sir, ...|
|I’m so stoked for...|
|If you're buildin...|
+--------------------+
only showing top 5 rows

In [49]:
comments_cls_body = run_pipeline('comment_body', data_comments)
comments_cls_body.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_emotion download started this may take some time.
Approximate size to download 21.3 MB
[OK!]
+--------------------+--------------------+--------------------+--------------------+
|        comment_body|            document| sentence_embeddings|           sentiment|
+--------------------+--------------------+--------------------+--------------------+
|The founders were...|[[document, 0, 19...|[[sentence_embedd...|[[category, 0, 19...|
|All I do is point...|[[document, 0, 14...|[[sentence_embedd...|[[category, 0, 14...|
|Well stated Sir, ...|[[document, 0, 76...|[[sentence_embedd...|[[category, 0, 76...|
|I’m so stoked for...|[[document, 0, 18...|[[sentence_embedd...|[[category, 0, 18...|
|If you're buildin...|[[document, 0, 30...|[[sentence_embedd...|[[category, 0, 30...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 r

In [50]:
comments_cls_body.select(F.explode(F.arrays_zip(comments_cls_body.document.result, 
                                     comments_cls_body.sentiment.result)).alias("cols")) \
      .select(F.expr("cols['0']").alias("document"),
              F.expr("cols['1']").alias("sentiment")).show(truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+
|            document|sentiment|
+--------------------+---------+
|The founders were...|     fear|
|All I do is point...|     fear|
|Well stated Sir, ...|      joy|
|I’m so stoked for...| surprise|
|If you're buildin...|     fear|
|The 2nd Amendment...|     fear|
|Just buy a real S...|     fear|
|Sounds like a gre...| surprise|
|There are much be...|     fear|
|(Australia) looki...|     fear|
|CZ P10C is a bett...| surprise|
|It shouldn't be a...| surprise|
|How often are you...| surprise|
|           [deleted]|     fear|
|.32acp you uncult...|  sadness|
|I love this commu...|      joy|
|You have the same...| surprise|
|Boiled linseed oi...| surprise|
|(Sweden) Custom f...|  sadness|
|Oooof real tough ...|     fear|
+--------------------+---------+
only showing top 20 rows