## Colab Setup

In [1]:
!pip install -q pyspark==3.4.1 spark-nlp==5.3.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.0/565.0 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import sparknlp

spark = sparknlp.start()

from sparknlp.base import *
from sparknlp.annotator import *

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 5.3.2
Apache Spark version: 3.4.1


## T5Model

In [9]:
from pyspark.ml import Pipeline

In [10]:
# !wget -q -O news_category_test.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv

In [11]:
import numpy as np
import pandas as pd

In [12]:
import pyspark.sql.functions as F

df = spark.read\
      .parquet("/content/data/validation-00000-of-00001.parquet")
df = df.withColumn('text', F.format_string('question: %s context: %s', F.col('question'), F.col('context')))


df.show(truncate=50)

+------------------------+-------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|                      id|  title|                                           context|                                          question|                                           answers|                                              text|
+------------------------+-------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|56ddde6b9a695914005b9628|Normans|The Normans (Norman: Nourmands; French: Normand...|              In what country is Normandy located?|{[France, France, France, France], [159, 159, 1...|question: In what country is Normandy located? ...|
|56ddde6b9a695914005b9629|Normans|The Norman

In [14]:
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("documents")

t5 = T5Transformer() \
    .pretrained("t5_small") \
    .setTask("summarize:")\
    .setMaxOutputLength(200)\
    .setInputCols(["documents"]) \
    .setOutputCol("summaries")

pipeline = Pipeline().setStages([document_assembler, t5])

results = pipeline.fit(df).transform(df)

t5_small download started this may take some time.
Approximate size to download 241.9 MB
[OK!]


In [16]:
results.show()

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|  title|             context|            question|             answers|                text|           documents|           summaries|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|56ddde6b9a6959140...|Normans|The Normans (Norm...|In what country i...|{[France, France,...|question: In what...|[{document, 0, 79...|[{document, 0, 5,...|
|56ddde6b9a6959140...|Normans|The Normans (Norm...|When were the Nor...|{[10th and 11th c...|question: When we...|[{document, 0, 79...|[{document, 0, 74...|
|56ddde6b9a6959140...|Normans|The Normans (Norm...|From which countr...|{[Denmark, Icelan...|question: From wh...|[{document, 0, 80...|[{document, 0, 26...|
|56ddde6b9a6959140...|Normans|The Normans (Norm...|Who was

In [20]:
results.select("summaries").show(truncate = False)

+-------------------------------------------------------------------------------------------------------------------------------------+
|summaries                                                                                                                            |
+-------------------------------------------------------------------------------------------------------------------------------------+
|[{document, 0, 5, France, {sentence -> 0}, []}]                                                                                      |
|[{document, 0, 74, the 10th and 11th centuries gave their name to Normandy, a region in France, {sentence -> 0}, []}]                |
|[{document, 0, 26, Denmark, Iceland and Norway, {sentence -> 0}, []}]                                                                |
|[{document, 0, 4, Rollo, {sentence -> 0}, []}]                                                                                       |
|[{document, 0, 22, 10th and 11th centuries, {se