## Experiments

In [0]:
# exp = spark.read.parquet("/mnt/lsde/group05/filtered_comments/2014-3.parquet")

In [0]:
# Install PySpark and Spark NLP
# NOTE(review): pyspark is pinned to 3.1.2 but spark-nlp is not pinned, so the
# spark-nlp version that gets installed can differ between runs. Consider
# pinning it once the version that matches pyspark 3.1.2 is confirmed.
! pip install -q pyspark==3.1.2 spark-nlp

import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [0]:
# Session handle used by every later cell (spark.createDataFrame, spark.read.parquet).
# It is obtained through Spark NLP's start() helper rather than built by hand.
spark = sparknlp.start()

In [0]:
# Name of the pretrained classifier that ClassifierDLModel.pretrained() loads
# when the pipeline stages are built further down.
MODEL_NAME='classifierdl_use_emotion'

In [0]:
# Ten short sample texts; they are wrapped into the small demo DataFrame `df`
# (single "text" column) in the cell that fits the pipeline.
text_list = [
            """I am SO happy the news came out in time for my birthday this weekend! My inner 7-year-old cannot WAIT!""",
            """That moment when you see your friend in a commercial. Hahahaha!""",
            """My soul has just been pierced by the most evil look from @rickosborneorg. A mini panic attack &amp; chill in bones followed soon after.""",
            """For some reason I woke up thinkin it was Friday then I got to school and realized its really Monday -_-""",
            """I'd probably explode into a jillion pieces from the inablility to contain all of my if I had a Whataburger patty melt right now. #drool""",
            """These are not emotions. They are simply irrational thoughts feeding off of an emotion""",
            """Found out im gonna be with sarah bo barah in ny for one day!!! Eggcitement :)""",
            """That awkward moment when you find a perfume box full of sensors!""",
            """Just home from group celebration - dinner at Trattoria Gianni, then Hershey Felder's performance - AMAZING!!""",
            """Nooooo! My dad turned off the internet so I can't listen to band music!""",
            ]


In [0]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder; consumes "document" and
# produces "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained classifier selected by MODEL_NAME; consumes the
# embeddings and writes its predictions to the "sentiment" column.
sentimentdl = (
    ClassifierDLModel.pretrained(name=MODEL_NAME)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in dependency order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])


In [0]:
# None of the pipeline stages is trained in this notebook (both models are
# loaded via .pretrained), so fit() is called on a one-row placeholder frame
# that only supplies the expected "text" column.
empty_df = spark.createDataFrame([[""]]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

# Small in-memory demo frame built from the sample texts.
df = spark.createDataFrame(pd.DataFrame({"text": text_list}))

# Run the demo frame through the fitted pipeline. The previous version
# transformed `exp`, which only exists when the commented-out parquet read in
# the first cell is enabled, so this cell raised NameError on a fresh kernel.
result = pipelineModel.transform(df)

# To experiment on the real comments instead, load `exp` first and then run:
# result = pipelineModel.transform(exp.withColumnRenamed("body", "text"))

In [0]:
#result.show()

In [0]:
# result.select("parent_id", "author", "text", "id", "score", "ups", "subreddit", "subreddit_id", "created_utc", F.explode(F.arrays_zip('sentiment.result')).alias("cols")).show()

In [0]:
# result[1]

In [0]:
# sentiment = result.select("parent_id", "author", "text", "id", "score", "ups", "subreddit", "subreddit_id", "created_utc", F.explode(F.arrays_zip('sentiment.result')).alias("cols")).select("parent_id", "author", "text", "id", "score", "ups", "subreddit", "subreddit_id", "created_utc", F.col("cols.*"))

# sentiment.show()

## Implementation

In [0]:
def items_with_sentiment(year = 2014, month = 3, comments_or_submissions = "comments"):
    """Run the fitted pipeline over one monthly parquet file and save the result.

    Reads ``/mnt/lsde/group05/<source dir><year>-<month>.parquet``, renames the
    ``body`` column to ``text``, applies the module-level ``pipelineModel``,
    explodes the ``sentiment.result`` array into one row per element, and
    writes the selected columns to
    ``/mnt/lsde/group05/comments_sentiment/<year>-<month>.parquet``,
    overwriting any existing output at that path.

    Args:
        year: Year part of the file name.
        month: Month part of the file name; used as-is via ``str(month)``,
            so it is not zero-padded.
        comments_or_submissions: ``"comments"`` reads from
            ``filtered_comments/``; ``"submissions"`` reads from
            ``kwfilter_top3/``.

    Raises:
        ValueError: If ``comments_or_submissions`` is neither ``"comments"``
            nor ``"submissions"``. Previously such a value silently produced
            a ``non-exist`` path and only failed later inside the Spark read.
    """
    source_dirs = {
        "comments": "filtered_comments/",
        "submissions": "kwfilter_top3/",
    }
    if comments_or_submissions not in source_dirs:
        raise ValueError(
            "comments_or_submissions must be 'comments' or 'submissions', got "
            + repr(comments_or_submissions)
        )

    period = str(year) + "-" + str(month)
    data_path = "/mnt/lsde/group05/" + source_dirs[comments_or_submissions] + period + ".parquet"

    # NOTE(review): the rename, the selected columns and the output directory
    # below are the same for both sources; submissions_with_sentiment looks
    # like the intended entry point for submissions -- confirm.
    items = spark.read.parquet(data_path).withColumnRenamed("body", "text")
    annotated = pipelineModel.transform(items)

    kept_columns = [
        "parent_id", "author", "text", "id", "score", "ups",
        "subreddit", "subreddit_id", "created_utc",
    ]
    sentiment = (
        annotated
        .select(*kept_columns, F.explode(F.arrays_zip('sentiment.result')).alias("cols"))
        .select(*kept_columns, F.col("cols.*"))
    )
    sentiment.write.mode("overwrite").format("parquet").save("/mnt/lsde/group05/comments_sentiment/" + period + ".parquet")

In [0]:
# Comments for February 2014.
items_with_sentiment(year=2014, month=2)

In [0]:
# Comments for every December from 2014 through 2021.
for year in range(2014, 2022):
    items_with_sentiment(year=year, month=12)

In [0]:
# Comments for August 2022.
items_with_sentiment(year=2022, month=8)

In [0]:
# items_with_sentiment(2014, 3, "comments")

In [0]:
# 2014: March through November.
for month in range(3, 12):
    print("processing 2014-", month, sep="")
    items_with_sentiment(year=2014, month=month)

# 2015 through 2021: all twelve months of each year.
for year in range(2015, 2022):
    for month in range(1, 13):
        print("processing ", year, "-", month, sep="")
        items_with_sentiment(year=year, month=month)

# 2022: January through August.
for month in range(1, 9):
    print("processing 2022-", month, sep="")
    items_with_sentiment(year=2022, month=month)

In [0]:
def submissions_with_sentiment(year = 2014, month = 3, comments_or_submissions = "submissions"):
    """Run the fitted pipeline over one monthly parquet file of submissions.

    Reads ``/mnt/lsde/group05/<source dir><year>-<month>.parquet``, renames the
    ``title`` column to ``text``, applies the module-level ``pipelineModel``,
    explodes the ``sentiment.result`` array into one row per element, and
    writes the selected columns to
    ``/mnt/lsde/group05/submissions_sentiment/<year>-<month>.parquet``,
    overwriting any existing output at that path.

    Args:
        year: Year part of the file name.
        month: Month part of the file name; used as-is via ``str(month)``,
            so it is not zero-padded.
        comments_or_submissions: ``"submissions"`` reads from
            ``kwfilter_top3/``; ``"comments"`` reads from
            ``filtered_comments/``.

    Raises:
        ValueError: If ``comments_or_submissions`` is neither ``"comments"``
            nor ``"submissions"``. Previously such a value silently produced
            a ``non-exist`` path and only failed later inside the Spark read.
    """
    source_dirs = {
        "comments": "filtered_comments/",
        "submissions": "kwfilter_top3/",
    }
    if comments_or_submissions not in source_dirs:
        raise ValueError(
            "comments_or_submissions must be 'comments' or 'submissions', got "
            + repr(comments_or_submissions)
        )

    period = str(year) + "-" + str(month)
    data_path = "/mnt/lsde/group05/" + source_dirs[comments_or_submissions] + period + ".parquet"

    # NOTE(review): the rename, the selected columns and the output directory
    # below are the same for both sources; whether the comments files carry
    # `title` / `selftext` is not visible here -- confirm before using "comments".
    items = spark.read.parquet(data_path).withColumnRenamed("title", "text")
    annotated = pipelineModel.transform(items)

    kept_columns = ["author", "text", "selftext", "subreddit", "id", "score", "created_utc"]
    sentiment = (
        annotated
        .select(*kept_columns, F.explode(F.arrays_zip('sentiment.result')).alias("cols"))
        .select(*kept_columns, F.col("cols.*"))
    )
    sentiment.write.mode("overwrite").format("parquet").save("/mnt/lsde/group05/submissions_sentiment/" + period + ".parquet")

In [0]:
# 2014: February through December.
for month in range(2, 13):
    print("processing 2014-", month, sep="")
    submissions_with_sentiment(year=2014, month=month)

# 2015 through 2021: all twelve months of each year.
for year in range(2015, 2022):
    for month in range(1, 13):
        print("processing ", year, "-", month, sep="")
        submissions_with_sentiment(year=year, month=month)

# 2022: January through August.
for month in range(1, 9):
    print("processing 2022-", month, sep="")
    submissions_with_sentiment(year=2022, month=month)