In [None]:
# Create the local `data` directory that the dataset is extracted into
# (-p: no error if the directory already exists).
! mkdir -p data

Download https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz
and extract it into the `data` folder.

In [None]:
# List the dataset directory to confirm the archive was extracted where the
# following cells expect it.
! ls ./data/mini_newsgroups/

In [None]:
from pyspark.sql import functions as fun

# Load the corpus with the "text" source:
#   - recursiveFileLookup: descend into the sub-directories of mini_newsgroups
#   - wholetext: keep each file as a single row instead of one row per line
# The default "value" column is renamed to "text", and the originating file
# path is recorded in "filename".
texts = (
    spark.read.format("text")
    .option("recursiveFileLookup", "true")
    .option("wholetext", "true")
    .load("data/mini_newsgroups/")
    .withColumnRenamed("value", "text")
    # TIP: https://www.youtube.com/watch?v=7jxFffeQHpQ
    .withColumn("filename", fun.input_file_name())
)

# Vertical layout keeps the long text column readable.
texts.show(n=5, truncate=120, vertical=True)

In [None]:
# Preview the first 5 rows as a pandas DataFrame; limit() comes before
# toPandas() so only those 5 rows are collected.
texts.limit(5).toPandas()

In [None]:
import os

# Resolve the directory that contains `mini_newsgroups`, prefix it with the
# "file://" scheme, and split the result into its "/"-separated components.
dataset_parent_dir = os.path.dirname(os.path.realpath("data/mini_newsgroups"))
parts = f"file://{dataset_parent_dir}".split("/")
print(parts)

In [None]:
# Derive the newsgroup label from the name of the directory that directly
# contains each file, i.e. the second-to-last "/"-separated component of
# "filename" (assumes the layout mini_newsgroups/<newsgroup>/<file> with no
# deeper nesting).
# Indexing from the end with element_at(..., -2) keeps the result independent
# of how many components precede the dataset folder in the reported path.
texts3col = texts.withColumn(
    "newsgroup",
    fun.element_at(fun.split("filename", "/"), -2),
)

# Preview the first 5 rows, now including the derived "newsgroup" column.
texts3col.limit(5).toPandas()

In [None]:
import matplotlib.pyplot as plt

# Count documents per newsgroup. The result is sorted by name so the bar order
# is the same on every run, then collected to pandas for plotting.
newsgroup_counts = (
    texts3col.groupBy("newsgroup")
    .count()
    .orderBy("newsgroup")
    .toPandas()
)

# Take the tick labels straight from the "newsgroup" column rather than
# overwriting them afterwards, and label the figure so it stands on its own.
ax = newsgroup_counts.plot.bar(x="newsgroup", y="count", figsize=(10, 5), legend=False)
ax.set(
    title="Documents per newsgroup",
    xlabel="newsgroup",
    ylabel="number of documents",
)
plt.tight_layout()
plt.show()

In [None]:
# Create the local `models` directory for the pretrained pipeline
# (-p: no error if the directory already exists).
! mkdir -p models

Download the pipeline file from https://nlp.johnsnowlabs.com/2022/06/24/explain_document_ml_en_3_0.html and extract it into the `models` folder.

In [None]:
from sparknlp.pretrained import PretrainedPipeline

In [None]:
# Load the pretrained pipeline from its locally extracted folder under `models`.
pipeline = PretrainedPipeline.from_disk("models/explain_document_ml_en_4.0.0_3.0_1656066222624")

In [None]:
# Quick smoke test of the pipeline on a single string.
# NOTE(review): the input looks deliberately misspelled, presumably to exercise
# a spell-checking stage of the pipeline — confirm against the pipeline docs.
pipeline.annotate('Hellu wrold!')

In [None]:
# Run the pretrained pipeline over the whole `texts` DataFrame.
annot_texts = pipeline.transform(texts)

# Print the resulting schema to see which columns the pipeline produced.
annot_texts.printSchema()

In [None]:
# Show two annotated rows; vertical layout with truncation keeps the wide
# columns readable.
annot_texts.show(n=2, truncate=110, vertical=True)

In [None]:
from sparknlp import Finisher

# Configure a Finisher that reads only the "stems" column of the annotated
# DataFrame. No split symbol or output format is set here, so the Finisher
# defaults apply.
finisher = Finisher().setInputCols(['stems'])

# Apply it to the annotated texts and inspect one resulting row.
finished_texts_df = finisher.transform(annot_texts)
finished_texts_df.show(n=1, truncate=110, vertical=True)

In [None]:
# Fetch a single row of the `finished_stems` column to inspect the raw value.
finished_texts_df.select('finished_stems').take(1)