In [20]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8

In [21]:
import pandas as pd
import numpy as np
import json

import sparknlp
import pyspark.sql.functions as F

from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [22]:
# Load comment data plus video/creator metadata with a single query against a
# local SQLite database (on Colab the .db file must be uploaded first).
# TODO: directly load parquet as a spark dataframe

import os
import sqlite3

DB_PATH = 'porn_data_new.db'

# sqlite3.connect() silently creates an empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(
        f"SQLite database {DB_PATH!r} not found; upload it to the working directory first."
    )

# Every comment is LEFT JOINed to its video and then to that video's creator,
# so comments without matching metadata are still returned.
# NOTE: `SELECT *` over the three tables yields repeated column names
# (view_key, timestamp, creator_href, creator_name) in the resulting frame.
query = """SELECT * FROM comments c
           left join video_info vi on c.view_key == vi.view_key
           left join creators c2 on vi.creator_href == c2.creator_href
"""

# Connect to the database and make sure the handle is released even if the
# query fails (a sqlite3 connection used as a context manager only manages the
# transaction; it does not close the connection).
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()


In [23]:
# Show the column labels of the frame returned by the SQL query.
df.columns

Index(['username_href', 'view_key', 'comment_text', 'upvotes', 'timestamp',
       'view_key', 'title', 'creator_name', 'creator_href', 'views', 'rating',
       'year_added', 'categories', 'timestamp', 'creator_href', 'creator_name',
       'creator_type', 'about_info', 'video_count', 'subscribers', 'infos',
       'timestamp'],
      dtype='object')

In [24]:
# Preview the first two rows of the queried frame.
df.head(2)

Unnamed: 0,username_href,view_key,comment_text,upvotes,timestamp,view_key.1,title,creator_name,creator_href,views,...,categories,timestamp.1,creator_href.1,creator_name.1,creator_type,about_info,video_count,subscribers,infos,timestamp.2
0,/model/mirabella-star,ph5f9da0f4b754c,https://vm.tiktok.com/ZMe4STLD6/,0,1684297000.0,ph5f9da0f4b754c,Sheryl X - Redhead girl sexually eating waterm...,Sheryl X,/channels/sheryl-x,,...,[],1684297000.0,/channels/sheryl-x,Sheryl X,channels,,,,{},1684297000.0
1,/users/peterachterstraat,ph5f9da0f4b754c,ik ben supergeil,0,1684297000.0,ph5f9da0f4b754c,Sheryl X - Redhead girl sexually eating waterm...,Sheryl X,/channels/sheryl-x,,...,[],1684297000.0,/channels/sheryl-x,Sheryl X,channels,,,,{},1684297000.0


In [25]:
# Pull the comment bodies out of the frame as a plain Python list.
text_list = df["comment_text"].tolist()

In [28]:
# Start a Spark session through Spark NLP's helper and keep the handle in `spark`.
spark = sparknlp.start()

# Report the library versions in use for this run.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Bare last expression so the notebook displays the session object.
spark

Spark NLP version 4.2.8
Apache Spark version: 3.3.0


# Sarcasm classifier

In [29]:
# Name of the pretrained classifier loaded by the pipeline cell below.
MODEL_NAME = "classifierdl_use_sarcasm"

In [30]:
# Stage 1: read the "text" column and emit it as the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use", English) that
# consumes "document" and writes "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained ClassifierDL model selected by MODEL_NAME; its
# predictions are written to the "sentiment" column.
sentimentdl = (
    ClassifierDLModel.pretrained(name=MODEL_NAME)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_sarcasm download started this may take some time.
Approximate size to download 21.3 MB
[OK!]


In [31]:
# Wrap the comment strings in a single-column Spark DataFrame named "text".
# NOTE(review): this rebinds `df`, replacing the pandas frame loaded from SQLite.
df = spark.createDataFrame(text_list, schema=StringType()).toDF("text")

# Fit the pipeline on the frame, then annotate that same frame.
pipeline_model = nlpPipeline.fit(df)
result = pipeline_model.transform(df)

In [32]:
# Pair each document's text with its predicted label: arrays_zip lines up the
# two `result` arrays element by element, explode yields one row per pair, and
# the struct fields are then read positionally as '0' (text) and '1' (label).
# NOTE(review): the '0'/'1' keys presumably depend on how Spark names arrays_zip
# fields for these exact column expressions - confirm before rewriting them
# (e.g. with F.col).
result.select(F.explode(F.arrays_zip(result.document.result, 
                                     result.sentiment.result)).alias("cols")) \
      .select(F.expr("cols['0']").alias("document"),
              F.expr("cols['1']").alias("sentiment")).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|document                                                                                                                                                |sentiment|
+--------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|https://vm.tiktok.com/ZMe4STLD6/                                                                                                                        |normal   |
|ik ben supergeil                                                                                                                                        |normal   |
|So sexy, I love readhead, my tiny dick is so hard.                                                                                                      |normal   |
|She praye

# Sexism classifier (cyberbullying model)

In [34]:
# Name of the pretrained classifier loaded by the pipeline cell below.
MODEL_NAME = "classifierdl_use_cyberbullying"

In [35]:
# Stage 1: read the "text" column and emit it as the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use", English) that
# consumes "document" and writes "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained ClassifierDL model selected by MODEL_NAME; its
# predictions are written to the "sentiment" column.
sentimentdl = (
    ClassifierDLModel.pretrained(name=MODEL_NAME)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Chain the three stages in order.
nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
classifierdl_use_cyberbullying download started this may take some time.
Approximate size to download 21.3 MB
[OK!]


In [36]:
# Wrap the comment strings in a single-column Spark DataFrame named "text".
df = spark.createDataFrame(text_list, schema=StringType()).toDF("text")

# Fit the pipeline on the frame, then annotate that same frame.
pipeline_model = nlpPipeline.fit(df)
result = pipeline_model.transform(df)

In [37]:
# Pair each document's text with its predicted label: arrays_zip lines up the
# two `result` arrays element by element, explode yields one row per pair, and
# the struct fields are then read positionally as '0' (text) and '1' (label).
# NOTE(review): the '0'/'1' keys presumably depend on how Spark names arrays_zip
# fields for these exact column expressions - confirm before rewriting them
# (e.g. with F.col).
result.select(F.explode(F.arrays_zip(result.document.result, 
                                     result.sentiment.result)).alias("cols")) \
      .select(F.expr("cols['0']").alias("document"),
              F.expr("cols['1']").alias("sentiment")).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|document                                                                                                                                                |sentiment|
+--------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|https://vm.tiktok.com/ZMe4STLD6/                                                                                                                        |neutral  |
|ik ben supergeil                                                                                                                                        |neutral  |
|So sexy, I love readhead, my tiny dick is so hard.                                                                                                      |neutral  |
|She praye