In [1]:
import findspark

findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark
""" conf = pyspark.SparkConf().setAppName('Tap').setMaster('local[4]')
conf.set("spark.network.timeout", "360000s")
conf.set("spark.executor.heartbeatInterval", "36000s")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc) """
spark = (
    # https://luminousmen.com/post/how-to-speed-up-spark-jobs-on-small-test-datasets?trk=article-ssr-frontend-pulse_little-text-block
    SparkSession.builder.appName("Spark Jupyter")
    .master("local[1]")
    .config("spark.driver.memory", "24G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.4.0")
    .config("spark.local.dir", "C:/tmp")
    # https://stackoverflow.com/questions/34625410/why-does-my-spark-run-slower-than-pure-python-performance-comparison
    .config("spark.sql.shuffle.partitions", "1")
    .config("spark.default.parallelism", "1")
    .config("spark.rdd.compress", "false")
    .config("spark.shuffle.compress", "false")
    .getOrCreate()
)

In [2]:
spark

In [3]:
DATASET_PATH = "./dataset/dataset.csv"
IMAGES_PATH = "./dataset/images/"

In [4]:
import pyspark.sql.types as tp
#  id,title,subreddit,score,num_comments,created_utc,author,upvote_ratio,img_filename,ocr_text,url,selftext,img_url
posts_schema = tp.StructType(
    [
        tp.StructField(name="id", dataType=tp.StringType(), nullable=False),
        tp.StructField(name="title", dataType=tp.StringType(), nullable=False),
        tp.StructField(name="subreddit", dataType=tp.StringType(), nullable=False),
        tp.StructField(name="score", dataType=tp.IntegerType(), nullable=False),
        tp.StructField(name="num_comments", dataType=tp.IntegerType(), nullable=False),
        tp.StructField(name="created_utc", dataType=tp.FloatType(), nullable=True),
        tp.StructField(name="author", dataType=tp.StringType(), nullable=False),
        tp.StructField(name="upvote_ratio", dataType=tp.FloatType(), nullable=False),
        tp.StructField(name="img_filename", dataType=tp.StringType(), nullable=False),
        tp.StructField(name="ocr_text", dataType=tp.StringType(), nullable=False),
        tp.StructField(name="url", dataType=tp.StringType(), nullable=False),
        tp.StructField(name="selftext", dataType=tp.StringType(), nullable=True),
        tp.StructField(name="img_url", dataType=tp.StringType(), nullable=True),
        tp.StructField(name="caption_text", dataType=tp.StringType(), nullable=True),
    ]
)

In [5]:
# https://spark.apache.org/docs/latest/sql-data-sources-csv.html
# read and impose a schema
# df= spark.read.csv(DATASET_PATH, header=True, schema= posts_schema)
from pyspark.sql.functions import col, from_unixtime


df = (
    spark.read.format("csv")
    .schema(posts_schema)
 #   .options(multiline=True, mode="FAILFAST", emptyValue="", nullValue="")
    .option("multiline", True)
    .option("mode", "FAILFAST")
    .option("emptyValue", "")
    .option("nullValue", "")
    .option("delimiter", ",")
    .option("quote", '"')
    .option("escape", '"')
    .load(DATASET_PATH, header=True)
)

df = df.withColumn(
    "created_utc", from_unixtime(col("created_utc")).cast("timestamp")
) 

df = df.fillna({'selftext': ''})

df.show()

+-------+--------------------+-------------+-----+------------+-------------------+--------------------+------------+------------+--------------------+--------------------+--------+--------------------+--------------------+
|     id|               title|    subreddit|score|num_comments|        created_utc|              author|upvote_ratio|img_filename|            ocr_text|                 url|selftext|             img_url|        caption_text|
+-------+--------------------+-------------+-----+------------+-------------------+--------------------+------------+------------+--------------------+--------------------+--------+--------------------+--------------------+
|1fj1pz3|Upper body after ...|     GymMemes|   24|           7|2024-09-17 17:10:56|       Schitts-Creek|        0.84|1fj1pz3.jpeg|                   G|https://i.redd.it...|        |./dataset/images/...|a sponge sponge i...|
|1fiya02|The Champions Lea...|  soccermemes|   77|           4|2024-09-17 14:45:52|          leyladexxx|

In [6]:
# drop if any null column
df=df.dropna()

df.count()

540

In [7]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- num_comments: integer (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- author: string (nullable = true)
 |-- upvote_ratio: float (nullable = true)
 |-- img_filename: string (nullable = true)
 |-- ocr_text: string (nullable = true)
 |-- url: string (nullable = true)
 |-- selftext: string (nullable = false)
 |-- img_url: string (nullable = true)
 |-- caption_text: string (nullable = true)



In [13]:
import utils.categories as categories
# mappa ogni subreddit alla sua categoria utilizzando il dizionario categories.subreddit_categories
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

""" def map_subreddit_to_category(subreddit):
    return categories.subreddit_categories.get(subreddit, 3)

map_subreddit_to_category_udf = udf(map_subreddit_to_category, StringType()) """
df = df.withColumn("label", udf(lambda subreddit: categories.subreddit_categories.get(subreddit, 3), tp.IntegerType())(col("subreddit")))
df.show()

+-------+--------------------+-------------+-----+------------+-------------------+--------------------+------------+------------+--------------------+--------------------+--------+--------------------+--------------------+-----+--------------------+
|     id|               title|    subreddit|score|num_comments|        created_utc|              author|upvote_ratio|img_filename|            ocr_text|                 url|selftext|             img_url|        caption_text|label|            all_text|
+-------+--------------------+-------------+-----+------------+-------------------+--------------------+------------+------------+--------------------+--------------------+--------+--------------------+--------------------+-----+--------------------+
|1fj1pz3|Upper body after ...|     GymMemes|   24|           7|2024-09-17 17:10:56|       Schitts-Creek|        0.84|1fj1pz3.jpeg|                   G|https://i.redd.it...|        |./dataset/images/...|a sponge sponge i...|    0|Upper body after .

https://stackoverflow.com/questions/65054072/spark-nlp-pretrained-model-not-loading-in-windows
https://github.com/JohnSnowLabs/spark-nlp/discussions/1022

In [14]:
df = df.withColumn("label", df["label"].cast(tp.DoubleType()))

In [15]:
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.sql.functions import concat_ws
from pyspark.ml.feature import Tokenizer, HashingTF, IDF


"""     df.title,
        df.selftext, 
        df.ocr_text,
        df.caption_text """

""" sqlTrans = SQLTransformer(
    statement="SELECT *, concat_ws(\" \", coalesce(title, ''), \
        coalesce(selftext, ''), \
        coalesce(ocr_text, ''), \
        coalesce(caption_text, '')) AS all_text  FROM __THIS__"
) """
df = df.withColumn("all_text",concat_ws(" ", df.title, df.selftext, df.ocr_text, df.caption_text))

tokenizer = Tokenizer(inputCol="all_text", outputCol="words")
# train_wordsData = tokenizer.transform(train)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2**16)
# train_featurizedData = hashingTF.transform(train_wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
# idfModel = idf.fit(train_featurizedData)
# train_rescaledData = idfModel.transform(train_featurizedData)
lr = LogisticRegression(maxIter=10, tol=1e-6, fitIntercept=True)

ovr = OneVsRest(classifier=lr)


pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, ovr])
train, test = df.randomSplit([0.8, 0.2]) 

pipelineModel = pipeline.fit(train)

In [16]:
pipelineModel.write().overwrite().save("./spark/models/ovrModel")

In [17]:
predictions = pipelineModel.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.47


In [18]:
#df=df.drop("all_text")
predictions.select("title", "subreddit", "label", "prediction").show()

+--------------------+----------------+-----+----------+
|               title|       subreddit|label|prediction|
+--------------------+----------------+-----+----------+
|    I’m a fat piggy |        GymMemes|  0.0|       3.0|
|how did he NEVER ...|        GymMemes|  0.0|       3.0|
|      END OF AN ERA.|   footballmemes|  0.0|       1.0|
|      END OF AN ERA.|     soccermemes|  0.0|       1.0|
|When you need to ...|        GymMemes|  0.0|       3.0|
|Which NBA player ...|        Nbamemes|  0.0|       0.0|
|Man United fans w...|     soccermemes|  0.0|       0.0|
|It really doesn't...|        GymMemes|  0.0|       0.0|
|  Sad facts of life |     soccermemes|  0.0|       0.0|
|        Wait what 😭|     soccermemes|  0.0|       1.0|
|Average depressed...|        GymMemes|  0.0|       1.0|
|Cognitive dissona...|        GymMemes|  0.0|       0.0|
|Andre Onana saves...|     soccermemes|  0.0|       0.0|
|Me after training...|        GymMemes|  0.0|       1.0|
|Liverpool fans ri...|     socce

In [19]:
predictions.count()

100