## Colab Setup

In [2]:
# !pip install -q pyspark==3.4.1 spark-nlp==5.3.2

In [2]:
import sparknlp

# spark = sparknlp.start()

from sparknlp.base import *
from sparknlp.annotator import *

print("Spark NLP version", sparknlp.version())
# print("Apache Spark version:", spark.version)

# spark

Spark NLP version 5.3.2


## T5Model

In [6]:
from pyspark.ml import Pipeline
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StringType, IntegerType

In [7]:
spark = SparkSession.builder.appName("Spark NLP") \
    .config(
        "spark.jars", 
        "./kafka-clients-3.5.0.jar,./spark-sql-kafka-0-10_2.12-3.5.0.jar, \
        ./spark-token-provider-kafka-0-10_2.12-3.5.0.jar, \
        ./commons-pool2-2.12.0.jar") \
    .config("spark.sql.shuffle.partitions", "3") \
    .config("spark.executorEnv.PYSPARK_PYTHON","/root/anaconda3/bin/python") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.log.level", "ERROR") \
    .master("local[*]") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.3") \
    .getOrCreate()

:: loading settings :: url = jar:file:/opt/module/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a6d74615-64d8-41b5-b54f-899fa47b254c;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central
	found com.g

In [10]:
# !wget -q -O news_category_test.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv

In [8]:
import numpy as np
import pandas as pd

In [9]:
p2_df = spark.readStream.format("kafka")\
    .option("kafka.bootstrap.servers", "localhost:9092")\
    .option("subscribe", "p2") \
    .option("startingOffsets", "earliest") \
    .load()

In [12]:
p2_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [14]:
p2_df.writeStream \
    .format("memory") \
    .outputMode("append") \
    .queryName("validation_set") \
    .option("checkpointLocation", "./checkpoint") \
    .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x7fed4facbf90>

                                                                                

In [19]:
df = spark.sql("select CAST(value AS STRING) from validation_set")

In [20]:
schema = "id STRING, title STRING, context STRING, question STRING, answers STRUCT<text: ARRAY<STRING>, answer_start: ARRAY<INT>>"
parsed_df = df \
    .withColumn("parsed_value", from_json(col("value"), schema)) \
    .select("parsed_value.*")

parsed_df.show(5, truncate=False)

+------------------------+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+
|id             

In [23]:
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("documents")

t5 = T5Transformer() \
    .pretrained("t5_small") \
    .setTask("summarize:")\
    .setMaxOutputLength(200)\
    .setInputCols(["documents"]) \
    .setOutputCol("summaries")

pipeline = Pipeline().setStages([document_assembler, t5])


t5_small download started this may take some time.
Approximate size to download 241.9 MB
[OK!]


In [26]:
import pyspark.sql.functions as F

parsed_df = parsed_df.withColumn('text', F.format_string('question: %s context: %s', F.col('question'), F.col('context')))

results = pipeline.fit(parsed_df).transform(parsed_df)

In [27]:
results.select("summaries").show(5, truncate = False)

[Stage 7:>                                                          (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------+
|summaries                                                                                                            |
+---------------------------------------------------------------------------------------------------------------------+
|[{document, 0, 5, France, {sentence -> 0}, []}]                                                                      |
|[{document, 0, 74, the 10th and 11th centuries gave their name to Normandy, a region in France, {sentence -> 0}, []}]|
|[{document, 0, 26, Denmark, Iceland and Norway, {sentence -> 0}, []}]                                                |
|[{document, 0, 4, Rollo, {sentence -> 0}, []}]                                                                       |
|[{document, 0, 22, 10th and 11th centuries, {sentence -> 0}, []}]                                                    |
+---------------------------------------

                                                                                