<a href="https://colab.research.google.com/github/onlyabhilash/Spark_NLP/blob/main/spart-nlp_basics/spark_02_Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.7.1

openjdk version "1.8.0_312"
OpenJDK Runtime Environment (build 1.8.0_312-8u312-b07-0ubuntu1~18.04-b07)
OpenJDK 64-Bit Server VM (build 25.312-b07, mixed mode)
[K     |████████████████████████████████| 215.7 MB 60 kB/s 
[K     |████████████████████████████████| 197 kB 20.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 138 kB 8.8 MB/s 
[?25h

In [3]:
import sparknlp

spark = sparknlp.start()
# params =>> gpu=False, spark23=False (start with spark 2.3)


print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 2.7.1
Apache Spark version: 2.4.4


### Create Spark Dataframe

In [4]:
text = 'Peter Parker is a nice guy and lives in New York'

spark_df = spark.createDataFrame([[text]]).toDF('text')
spark_df.show(truncate=False)

+------------------------------------------------+
|text                                            |
+------------------------------------------------+
|Peter Parker is a nice guy and lives in New York|
+------------------------------------------------+



In [5]:
from pyspark.sql.types import StringType, IntegerType

# if you want to create a spark datafarme from a list of strings

text_list = ['Peter Parker is a nice guy and lives in New York.', 'Bruce Wayne is also a nice guy and lives in Gotham City.']

spark.createDataFrame(text_list, StringType()).toDF("text").show(truncate=80)


+--------------------------------------------------------+
|                                                    text|
+--------------------------------------------------------+
|       Peter Parker is a nice guy and lives in New York.|
|Bruce Wayne is also a nice guy and lives in Gotham City.|
+--------------------------------------------------------+



In [6]:
from pyspark.sql import Row 

spark.createDataFrame(list(map(lambda x : Row(text = x),text_list))).show(truncate = 80)

+--------------------------------------------------------+
|                                                    text|
+--------------------------------------------------------+
|       Peter Parker is a nice guy and lives in New York.|
|Bruce Wayne is also a nice guy and lives in Gotham City.|
+--------------------------------------------------------+



In [7]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/spark-nlp-basics/sample-sentences-en.txt

In [8]:
with open('./sample-sentences-en.txt') as f:
  print(f.read())

Peter is a very good person.
My life in Russia is very interesting.
John and Peter are brothers. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!


In [9]:
spark_df = spark.read.text('./sample-sentences-en.txt').toDF('text')
spark_df.show(truncate=False)

+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
+-----------------------------------------------------------------------------+



In [10]:
spark_df.select('text').show(truncate=False)

+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
+-----------------------------------------------------------------------------+



In [11]:
textFiles = spark.sparkContext.wholeTextFiles("./*.txt",4)
    
spark_df_folder = textFiles.toDF(schema=['path','text'])

spark_df_folder.show(truncate=30)

+------------------------------+------------------------------+
|                          path|                          text|
+------------------------------+------------------------------+
|file:/content/sample-senten...|Peter is a very good person...|
+------------------------------+------------------------------+



In [12]:
spark_df_folder.select('text').take(1)

[Row(text="Peter is a very good person.\nMy life in Russia is very interesting.\nJohn and Peter are brothers. However they don't support each other that much.\nLucas Nogal Dunbercker is no longer happy. He has a good car though.\nEurope is very culture rich. There are huge churches! and big houses!")]

In [13]:
from sparknlp.base import *

documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")\
.setCleanupMode("shrink")

doc_df = documentAssembler.transform(spark_df)

doc_df.show(truncate=False)

+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+
|text                                                                         |document                                                                                                               |
+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+
|Peter is a very good person.                                                 |[[document, 0, 27, Peter is a very good person., [sentence -> 0], []]]                                                 |
|My life in Russia is very interesting.                                       |[[document, 0, 37, My life in Russia is very interesting., [sentence -> 0], []]]                                       |


In [14]:
doc_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)



In [15]:
doc_df.select('document.result','document.begin','document.end').show(truncate=False)

+-------------------------------------------------------------------------------+-----+----+
|result                                                                         |begin|end |
+-------------------------------------------------------------------------------+-----+----+
|[Peter is a very good person.]                                                 |[0]  |[27]|
|[My life in Russia is very interesting.]                                       |[0]  |[37]|
|[John and Peter are brothers. However they don't support each other that much.]|[0]  |[76]|
|[Lucas Nogal Dunbercker is no longer happy. He has a good car though.]         |[0]  |[67]|
|[Europe is very culture rich. There are huge churches! and big houses!]        |[0]  |[68]|
+-------------------------------------------------------------------------------+-----+----+



In [16]:
doc_df.select("document.result").take(1)

[Row(result=['Peter is a very good person.'])]

In [17]:
import pyspark.sql.functions as F

doc_df.withColumn(
    'tmp',
    F.explode('document'))\
    .select('tmp.*')\
    .show(truncate=False)

+-------------+-----+---+-----------------------------------------------------------------------------+---------------+----------+
|annotatorType|begin|end|result                                                                       |metadata       |embeddings|
+-------------+-----+---+-----------------------------------------------------------------------------+---------------+----------+
|document     |0    |27 |Peter is a very good person.                                                 |[sentence -> 0]|[]        |
|document     |0    |37 |My life in Russia is very interesting.                                       |[sentence -> 0]|[]        |
|document     |0    |76 |John and Peter are brothers. However they don't support each other that much.|[sentence -> 0]|[]        |
|document     |0    |67 |Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |[sentence -> 0]|[]        |
|document     |0    |68 |Europe is very culture rich. There are huge churches! and 

### Sentence Detector

In [19]:
from sparknlp.annotator import *

sentenceDetector = SentenceDetector()\
.setInputCols(['document'])\
.setOutputCol('sentences')

sentenceDetector.extractParamMap()

{Param(parent='SentenceDetector_1d0d527b4444', name='customBounds', doc='characters used to explicitly mark sentence bounds'): [],
 Param(parent='SentenceDetector_1d0d527b4444', name='detectLists', doc='whether detect lists during sentence detection'): True,
 Param(parent='SentenceDetector_1d0d527b4444', name='explodeSentences', doc='whether to explode each sentence into a different row, for better parallelization. Defaults to false.'): False,
 Param(parent='SentenceDetector_1d0d527b4444', name='inputCols', doc='previous annotations columns, if renamed'): ['document'],
 Param(parent='SentenceDetector_1d0d527b4444', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='SentenceDetector_1d0d527b4444', name='maxLength', doc='Set the maximum allowed length for each sentence'): 99999,
 Param(parent='SentenceDetector_1d0d527b4444', name='minLength', doc='Set the minimum allowed length for each sentence.'): 0,
 Param(parent='Sentenc

In [20]:
sent_df = sentenceDetector.transform(doc_df)
sent_df.show(truncate=False)

+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                         |document                                                                                                               |sentences                                                                                                                                                                                          |
+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------

In [21]:
sent_df.select('sentences').take(3)

[Row(sentences=[Row(annotatorType='document', begin=0, end=27, result='Peter is a very good person.', metadata={'sentence': '0'}, embeddings=[])]),
 Row(sentences=[Row(annotatorType='document', begin=0, end=37, result='My life in Russia is very interesting.', metadata={'sentence': '0'}, embeddings=[])]),
 Row(sentences=[Row(annotatorType='document', begin=0, end=27, result='John and Peter are brothers.', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='document', begin=29, end=76, result="However they don't support each other that much.", metadata={'sentence': '1'}, embeddings=[])])]

In [22]:
text ='The patient was prescribed 1 capsule of Advil for 5 days. He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg two times a day. It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months.'
text


'The patient was prescribed 1 capsule of Advil for 5 days. He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg two times a day. It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months.'

In [23]:
spark_df = spark.createDataFrame([[text]]).toDF("text")

spark_df.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                                           |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
doc_df = documentAssembler.transform(spark_df)
sent_df = sentenceDetector.transform(doc_df)
sent_df.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
sent_df.select('sentences.result').take(1)

[Row(result=['The patient was prescribed 1 capsule of Advil for 5 days.', 'He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg two times a day.', 'It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months.'])]

In [26]:
sentenceDetector.setExplodeSentences(True)

SentenceDetector_1d0d527b4444

In [27]:
sent_df = sentenceDetector.transform(doc_df)

sent_df.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|                                              text|                                          document|                                         sentences|
+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|The patient was prescribed 1 capsule of Advil f...|[[document, 0, 334, The patient was prescribed ...|[[document, 0, 56, The patient was prescribed 1...|
|The patient was prescribed 1 capsule of Advil f...|[[document, 0, 334, The patient was prescribed ...|[[document, 58, 240, He was seen by the endocri...|
|The patient was prescribed 1 capsule of Advil f...|[[document, 0, 334, The patient was prescribed ...|[[document, 242, 334, It was determined that al...|
+--------------------------------------------------+------------------

In [28]:
sent_df.select('sentences.result').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                   |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[The patient was prescribed 1 capsule of Advil for 5 days.]                                                                                                                              |
|[He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg two times a day.]|
|[It was determined that all SGLT2 inhibitors should be disc

In [29]:
sent_df.select(F.explode('sentences.result')).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|col                                                                                                                                                                                    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|The patient was prescribed 1 capsule of Advil for 5 days.                                                                                                                              |
|He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg two times a day.|
|It was determined that all SGLT2 inhibitors should be discontinued in

### Sentence Detector DL

In [30]:
sentencerDL = SentenceDetectorDLModel().pretrained('sentence_detector_dl',lang = 'en')\
.setInputCols(['document'])\
.setOutputCol('sentences')

sent_dl_df = sentencerDL.transform(doc_df)
sent_dl_df.select(F.explode('sentences.result')).show(truncate=False)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|col                                                                                                                                                                                    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|The patient was prescribed 1 capsule of Advil for 5 days.                                                                                                                              |
|He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals, and

In [31]:
documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentenceDetector = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentences')
    
sentencerDL = SentenceDetectorDLModel\
    .pretrained("sentence_detector_dl", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentences")

sd_pipeline = PipelineModel(stages = [documenter,sentenceDetector])
sd_model = LightPipeline(sd_pipeline)

#DL version
sd_dl_pipeline = PipelineModel(stages=[documenter, sentencerDL])

sd_dl_model = LightPipeline(sd_dl_pipeline)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]


In [32]:
text = """John loves Mary.Mary loves Peter
Peter loves Helen .Helen loves John; 
Total: four people involved."""

for anno in sd_model.fullAnnotate(text)[0]["sentences"]:
  print("{}\t{}\t{}\t{}".format(
        anno.metadata["sentence"], anno.begin, anno.end, anno.result))

0	0	51	John loves Mary.Mary loves Peter
Peter loves Helen .
1	52	68	Helen loves John;
2	71	98	Total: four people involved.


In [33]:
for anno in sd_dl_model.fullAnnotate(text)[0]["sentences"]:
    print("{}\t{}\t{}\t{}".format(
        anno.metadata["sentence"], anno.begin, anno.end, anno.result))

0	0	15	John loves Mary.
1	16	32	Mary loves Peter
2	33	51	Peter loves Helen .
3	52	68	Helen loves John;
4	71	98	Total: four people involved.


### Tokenizer

In [34]:
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

In [35]:
tokenizer.extractParamMap()

{Param(parent='Tokenizer_390875b0c691', name='caseSensitiveExceptions', doc='Whether to care for case sensitiveness in exceptions'): True,
 Param(parent='Tokenizer_390875b0c691', name='contextChars', doc='character list used to separate from token boundaries'): ['.',
  ',',
  ';',
  ':',
  '!',
  '?',
  '*',
  '-',
  '(',
  ')',
  '"',
  "'"],
 Param(parent='Tokenizer_390875b0c691', name='inputCols', doc='previous annotations columns, if renamed'): ['document'],
 Param(parent='Tokenizer_390875b0c691', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='Tokenizer_390875b0c691', name='maxLength', doc='Set the maximum allowed legth for each token'): 99999,
 Param(parent='Tokenizer_390875b0c691', name='minLength', doc='Set the minimum allowed legth for each token'): 0,
 Param(parent='Tokenizer_390875b0c691', name='outputCol', doc='output annotation column. can be left default.'): 'token',
 Param(parent='Tokenizer_390875b0c691',

In [36]:
text = 'Peter Parker (Spiderman) is a nice guy and lives in New York but has no e-mail!'

spark_df = spark.createDataFrame([[text]]).toDF("text")
spark_df.show(truncate=False)

+-------------------------------------------------------------------------------+
|text                                                                           |
+-------------------------------------------------------------------------------+
|Peter Parker (Spiderman) is a nice guy and lives in New York but has no e-mail!|
+-------------------------------------------------------------------------------+



In [37]:
doc_df = documentAssembler.transform(spark_df)
token_df = tokenizer.fit(doc_df).transform(doc_df)
token_df.show(truncate=False)

+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [38]:
token_df.select('token.result').take(1)

[Row(result=['Peter', 'Parker', '(', 'Spiderman', ')', 'is', 'a', 'nice', 'guy', 'and', 'lives', 'in', 'New', 'York', 'but', 'has', 'no', 'e-mail', '!'])]

In [39]:
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token") \
    .setSplitChars(['-']) \
    .setContextChars(['?', '!']) \
    .addException("New York") \

token_df = tokenizer.fit(doc_df).transform(doc_df)

token_df.select('token.result').take(1)

[Row(result=['Peter', 'Parker', '(Spiderman)', 'is', 'a', 'nice', 'guy', 'and', 'lives', 'in', 'New York', 'but', 'has', 'no', 'e', 'mail', '!'])]

### Regex tokenizer

In [40]:
from pyspark.sql.types import StringType

content = "1. T1-T2 DATE**[12/24/13] $1.99 () (10/12), ph+ 90%"
pattern = "\\s+|(?=[-.:;*+,$&%\\[\\]])|(?<=[-.:;*+,$&%\\[\\]])"

df = spark.createDataFrame([content], StringType()).withColumnRenamed("value", "text")

documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentenceDetector = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

regexTokenizer = RegexTokenizer() \
      .setInputCols(["sentence"]) \
      .setOutputCol("regexToken") \
      .setPattern(pattern) \
      .setPositionalMask(False)

docPatternRemoverPipeline = \
  Pipeline() \
    .setStages([
        documenter,
        sentenceDetector,
        regexTokenizer])
    
result = docPatternRemoverPipeline.fit(df).transform(df)

result.show(10, False)

+---------------------------------------------------+---------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [41]:
result_df = result.select(F.explode('regexToken.result').alias('regexToken')).toPandas()
result_df

Unnamed: 0,regexToken
0,1
1,.
2,T1
3,-
4,T2
5,DATE
6,*
7,*
8,[
9,12/24/13


## Stacking Spark NLP Annotators in Spark ML Pipeline

In [42]:
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

sentenceDetector = SentenceDetector().\
setInputCols(['document']).\
setOutputCol('sentences')

tokenizer = Tokenizer() \
    .setInputCols(["sentences"]) \
    .setOutputCol("token")

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 sentenceDetector,
 tokenizer
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [43]:
spark_df = spark.read.text('./sample-sentences-en.txt').toDF('text')
spark_df.show(truncate = False)

+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
+-----------------------------------------------------------------------------+



In [44]:
result = pipelineModel.transform(spark_df)
result.show(truncate = False)

+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [45]:
result.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentences: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = tru

In [46]:
result.select('sentences.result').take(3)

[Row(result=['Peter is a very good person.']),
 Row(result=['My life in Russia is very interesting.']),
 Row(result=['John and Peter are brothers.', "However they don't support each other that much."])]

In [47]:
result.select('token').take(3)[2]

Row(token=[Row(annotatorType='token', begin=0, end=3, result='John', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=5, end=7, result='and', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=9, end=13, result='Peter', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=15, end=17, result='are', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=19, end=26, result='brothers', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=27, end=27, result='.', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=29, end=35, result='However', metadata={'sentence': '1'}, embeddings=[]), Row(annotatorType='token', begin=37, end=40, result='they', metadata={'sentence': '1'}, embeddings=[]), Row(annotatorType='token', begin=42, end=46, result="don't", metadata={'sentence': '1'}, embeddings=[]), Row(annotatorType='token', begin=48, end=54, result='s

### Normalizer

In [48]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [49]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")\
    .setLowercase(True)\
    .setCleanupPatterns(["[^\w\d\s]"]) # remove punctuations (keep alphanumeric chars)
    # if we don't set CleanupPatterns, it will only keep alphabet letters ([^A-Za-z])

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 normalizer
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [50]:
result = pipelineModel.transform(spark_df)

In [51]:
result.show(truncate=False)

+-----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------

In [52]:
result.select('token').take(3)

[Row(token=[Row(annotatorType='token', begin=0, end=4, result='Peter', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=6, end=7, result='is', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=9, end=9, result='a', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=11, end=14, result='very', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=16, end=19, result='good', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=21, end=26, result='person', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=27, end=27, result='.', metadata={'sentence': '0'}, embeddings=[])]),
 Row(token=[Row(annotatorType='token', begin=0, end=1, result='My', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=3, end=6, result='life', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=8, end=9, result='in',

In [53]:
result.select('normalized.result').take(5)

[Row(result=['peter', 'is', 'a', 'very', 'good', 'person']),
 Row(result=['my', 'life', 'in', 'russia', 'is', 'very', 'interesting']),
 Row(result=['john', 'and', 'peter', 'are', 'brothers', 'however', 'they', 'dont', 'support', 'each', 'other', 'that', 'much']),
 Row(result=['lucas', 'nogal', 'dunbercker', 'is', 'no', 'longer', 'happy', 'he', 'has', 'a', 'good', 'car', 'though']),
 Row(result=['europe', 'is', 'very', 'culture', 'rich', 'there', 'are', 'huge', 'churches', 'and', 'big', 'houses'])]

In [54]:
result.select('normalized').take(3)

[Row(normalized=[Row(annotatorType='token', begin=0, end=4, result='peter', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=6, end=7, result='is', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=9, end=9, result='a', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=11, end=14, result='very', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=16, end=19, result='good', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=21, end=26, result='person', metadata={'sentence': '0'}, embeddings=[])]),
 Row(normalized=[Row(annotatorType='token', begin=0, end=1, result='my', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=3, end=6, result='life', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=8, end=9, result='in', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=11, end=16, r

### Document Normalizer

In [55]:
text = '''
  <div id="theworldsgreatest" class='my-right my-hide-small my-wide toptext' style="font-family:'Segoe UI',Arial,sans-serif">
    THE WORLD'S LARGEST WEB DEVELOPER SITE
    <h1 style="font-size:300%;">THE WORLD'S LARGEST WEB DEVELOPER SITE</h1>
    <p style="font-size:160%;">Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum..</p>
  </div>

</div>'''

spark_df = spark.createDataFrame([[text]]).toDF("text")

spark_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                       

In [56]:
documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument")

documentNormalizer.extractParamMap()

{Param(parent='DocumentNormalizer_59f36f35ce17', name='action', doc='action to perform applying regex patterns on text'): 'clean_up',
 Param(parent='DocumentNormalizer_59f36f35ce17', name='encoding', doc='file encoding to apply on normalized documents'): 'UTF-8',
 Param(parent='DocumentNormalizer_59f36f35ce17', name='inputCols', doc='previous annotations columns, if renamed'): ['document'],
 Param(parent='DocumentNormalizer_59f36f35ce17', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='DocumentNormalizer_59f36f35ce17', name='lowercase', doc='whether to convert strings to lowercase'): False,
 Param(parent='DocumentNormalizer_59f36f35ce17', name='outputCol', doc='output annotation column. can be left default.'): 'normalizedDocument',
 Param(parent='DocumentNormalizer_59f36f35ce17', name='patterns', doc='normalization regex patterns which match will be removed from document. Defaults is <[^>]*>'): ['<[^>]*>'],
 Param(paren

In [57]:

documentAssembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

#default
cleanUpPatterns = ["<[^>]*>"]

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setPatterns(cleanUpPatterns) \
    .setReplacement(" ") \
    .setPolicy("pretty_all") \
    .setLowercase(True)

docPatternRemoverPipeline = \
  Pipeline() \
    .setStages([
        documentAssembler,
        documentNormalizer])
    
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = docPatternRemoverPipeline.fit(empty_df)

In [58]:
result = pipelineModel.transform(spark_df)

result.select('normalizedDocument.result').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                                                                                                              

### Stopwords Cleaner

In [59]:
stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("token")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)\
      #.setStopWords(["no", "without"]) (e.g. read a list of words from a txt)
      
stopwords_cleaner.getStopWords()

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [60]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stopwords_cleaner
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [61]:
spark_df = spark.read.text('./sample-sentences-en.txt').toDF('text')

result = pipelineModel.transform(spark_df)

result.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|         cleanTokens|
+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[token, 0, 4, Pe...|
|My life in Russia...|[[document, 0, 37...|[[token, 0, 1, My...|[[token, 3, 6, li...|
|John and Peter ar...|[[document, 0, 76...|[[token, 0, 3, Jo...|[[token, 0, 3, Jo...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[token, 0, 4, Lu...|[[token, 0, 4, Lu...|
|Europe is very cu...|[[document, 0, 68...|[[token, 0, 5, Eu...|[[token, 0, 5, Eu...|
+--------------------+--------------------+--------------------+--------------------+



In [62]:
result.select('cleanTokens.result').take(1)

[Row(result=['Peter', 'good', 'person', '.'])]

### Token Assembler

In [63]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

sentenceDetector = SentenceDetector().\
    setInputCols(['document']).\
    setOutputCol('sentences')

tokenizer = Tokenizer() \
    .setInputCols(["sentences"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")\
    .setLowercase(False)\

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)\

tokenassembler = TokenAssembler()\
    .setInputCols(["sentences", "cleanTokens"]) \
    .setOutputCol("clean_text")


nlpPipeline = Pipeline(stages=[
     documentAssembler,
    sentenceDetector,
     tokenizer,
     normalizer,
     stopwords_cleaner,
     tokenassembler
 ])


empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(spark_df)

result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|           sentences|               token|          normalized|         cleanTokens|          clean_text|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[token, 0, 4, Pe...|[[token, 0, 4, Pe...|[[document, 0, 16...|
|My life in Russia...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 1, My...|[[token, 0, 1, My...|[[token, 3, 6, li...|[[document, 0, 22...|
|John and Peter ar...|[[document, 0, 76...|[[document, 0, 27...|[[token, 0, 3, Jo...|[[token, 0, 3, Jo...|[[token, 0, 3, Jo...|[[document, 0, 18...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[document, 0, 41...|[[token, 0, 4, Lu...|[[token, 0, 4, Lu...|

In [64]:
result.select('clean_text').take(1)

[Row(clean_text=[Row(annotatorType='document', begin=0, end=16, result='Peter good person', metadata={'sentence': '0'}, embeddings=[])])]

In [65]:
# if we use TokenAssembler().setPreservePosition(True), the original borders will be preserved (dropped & unwanted chars will be replaced by spaces)

result.select('clean_text').take(1)

[Row(clean_text=[Row(annotatorType='document', begin=0, end=16, result='Peter good person', metadata={'sentence': '0'}, embeddings=[])])]

In [66]:
result.select('text', F.explode('clean_text.result').alias('clean_text')).show(truncate=False)

+-----------------------------------------------------------------------------+-----------------------------------+
|text                                                                         |clean_text                         |
+-----------------------------------------------------------------------------+-----------------------------------+
|Peter is a very good person.                                                 |Peter good person                  |
|My life in Russia is very interesting.                                       |life Russia interesting            |
|John and Peter are brothers. However they don't support each other that much.|John Peter brothers                |
|John and Peter are brothers. However they don't support each other that much.|However dont support much          |
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |Lucas Nogal Dunbercker longer happy|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.   

In [67]:
import pyspark.sql.functions as F

result.withColumn(
    "tmp", 
    F.explode("clean_text")) \
    .select("tmp.*").select("begin","end","result","metadata.sentence").show(truncate = False)

+-----+---+-----------------------------------+--------+
|begin|end|result                             |sentence|
+-----+---+-----------------------------------+--------+
|0    |16 |Peter good person                  |0       |
|0    |22 |life Russia interesting            |0       |
|0    |18 |John Peter brothers                |0       |
|29   |53 |However dont support much          |1       |
|0    |34 |Lucas Nogal Dunbercker longer happy|0       |
|43   |57 |good car though                    |1       |
|0    |18 |Europe culture rich                |0       |
|29   |41 |huge churches                      |1       |
|54   |63 |big houses                         |2       |
+-----+---+-----------------------------------+--------+



In [68]:
# if we hadn't used Sentence Detector, this would be what we got. (tokenizer gets document instead of sentences column)

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

tokenassembler = TokenAssembler()\
    .setInputCols(["document", "cleanTokens"]) \
    .setOutputCol("clean_text")

nlpPipeline = Pipeline(stages=[
     documentAssembler,
     tokenizer,
     normalizer,
     stopwords_cleaner,
     tokenassembler
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(spark_df)

result.select('text', 'clean_text.result').show(truncate=False)

+-----------------------------------------------------------------------------+-----------------------------------------------------+
|text                                                                         |result                                               |
+-----------------------------------------------------------------------------+-----------------------------------------------------+
|Peter is a very good person.                                                 |[Peter good person]                                  |
|My life in Russia is very interesting.                                       |[life Russia interesting]                            |
|John and Peter are brothers. However they don't support each other that much.|[John Peter brothers However dont support much]      |
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |[Lucas Nogal Dunbercker longer happy good car though]|
|Europe is very culture rich. There are huge churches! and big

In [69]:

result.withColumn(
    "tmp", 
    F.explode("clean_text")) \
    .select("tmp.*").select("begin","end","result","metadata.sentence").show(truncate = False)

+-----+---+---------------------------------------------------+--------+
|begin|end|result                                             |sentence|
+-----+---+---------------------------------------------------+--------+
|0    |16 |Peter good person                                  |0       |
|0    |22 |life Russia interesting                            |0       |
|0    |44 |John Peter brothers However dont support much      |0       |
|0    |50 |Lucas Nogal Dunbercker longer happy good car though|0       |
|0    |43 |Europe culture rich huge churches big houses       |0       |
+-----+---+---------------------------------------------------+--------+



**important note:**

If you have some other steps & annotators in your pipeline that will need to use the tokens from cleaned text (assembled tokens), you will need to tokenize the processed text again as the original text is probably changed completely.

### Stemmer

In [70]:
stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

In [71]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [72]:
result = pipelineModel.transform(spark_df)

result.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                stem|
+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[token, 0, 4, pe...|
|My life in Russia...|[[document, 0, 37...|[[token, 0, 1, My...|[[token, 0, 1, my...|
|John and Peter ar...|[[document, 0, 76...|[[token, 0, 3, Jo...|[[token, 0, 3, jo...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[token, 0, 4, Lu...|[[token, 0, 4, lu...|
|Europe is very cu...|[[document, 0, 68...|[[token, 0, 5, Eu...|[[token, 0, 5, eu...|
+--------------------+--------------------+--------------------+--------------------+



In [73]:
result.select('stem.result').show(truncate=False)

+-------------------------------------------------------------------------------------------+
|result                                                                                     |
+-------------------------------------------------------------------------------------------+
|[peter, i, a, veri, good, person, .]                                                       |
|[my, life, in, russia, i, veri, interest, .]                                               |
|[john, and, peter, ar, brother, ., howev, thei, don't, support, each, other, that, much, .]|
|[luca, nogal, dunberck, i, no, longer, happi, ., he, ha, a, good, car, though, .]          |
|[europ, i, veri, cultur, rich, ., there, ar, huge, church, !, and, big, hous, !]           |
+-------------------------------------------------------------------------------------------+



In [74]:
#import pyspark.sql.functions as F

#result_df = result.select(F.explode(F.arrays_zip('token.result', 'stem.result')).alias("cols")) \
#.select(F.expr("cols['0']").alias("token"),
#        F.expr("cols['1']").alias("stem")).toPandas()

#result_df.head(10)

#### Lemmatizer

Retrieves lemmas out of words with the objective of returning a base dictionary word 

In [75]:
!wget -q https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt

In [76]:
lemmatizer = Lemmatizer() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma") \
    .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")

In [77]:
lemmatizer.extractParamMap()

{Param(parent='Lemmatizer_1d54ce757f86', name='dictionary', doc="lemmatizer external dictionary. needs 'keyDelimiter' and 'valueDelimiter' in options for parsing target text"): JavaObject id=o2573,
 Param(parent='Lemmatizer_1d54ce757f86', name='inputCols', doc='previous annotations columns, if renamed'): ['token'],
 Param(parent='Lemmatizer_1d54ce757f86', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='Lemmatizer_1d54ce757f86', name='outputCol', doc='output annotation column. can be left default.'): 'lemma'}

In [78]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 lemmatizer
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)
result = pipelineModel.transform(spark_df)

result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                stem|               lemma|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[token, 0, 4, pe...|[[token, 0, 4, Pe...|
|My life in Russia...|[[document, 0, 37...|[[token, 0, 1, My...|[[token, 0, 1, my...|[[token, 0, 1, My...|
|John and Peter ar...|[[document, 0, 76...|[[token, 0, 3, Jo...|[[token, 0, 3, jo...|[[token, 0, 3, Jo...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[token, 0, 4, Lu...|[[token, 0, 4, lu...|[[token, 0, 4, Lu...|
|Europe is very cu...|[[document, 0, 68...|[[token, 0, 5, Eu...|[[token, 0, 5, eu...|[[token, 0, 5, Eu...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [79]:
result.select('lemma.result').show(truncate = False)

+---------------------------------------------------------------------------------------------+
|result                                                                                       |
+---------------------------------------------------------------------------------------------+
|[Peter, be, a, very, good, person, .]                                                        |
|[My, life, in, Russia, be, very, interest, .]                                                |
|[John, and, Peter, be, brother, ., However, they, don't, support, each, other, that, much, .]|
|[Lucas, Nogal, Dunbercker, be, no, long, happy, ., He, have, a, good, car, though, .]        |
|[Europe, be, very, culture, rich, ., There, be, huge, church, !, and, big, house, !]         |
+---------------------------------------------------------------------------------------------+



In [80]:
#result_df = result.select(F.explode(F.arrays_zip('token.result', 'stem.result',  'lemma.result')).alias("cols")) \
#.select(F.expr("cols['0']").alias("token"),
#        F.expr("cols['1']").alias("stem"),
#        F.expr("cols['2']").alias("lemma")).toPandas()

#result_df.head(10)

### NGram Generator

In [81]:

ngrams_cum = NGramGenerator() \
            .setInputCols(["token"]) \
            .setOutputCol("ngrams") \
            .setN(3) \
            .setEnableCumulative(True)\
            .setDelimiter("_") # Default is space
    
# .setN(3) means, take bigrams and trigrams.

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 ngrams_cum
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(spark_df)

result.select('ngrams.result').show(truncate=200)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                                                                                  result|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                    [Peter, is, a, very, good, person, ., Peter_is, is_a, a_very, very_good, good_person, person_., Peter_is_a, is_a_very, a_very_good, very_good_person, good_person_.]|
|[My, life, in, Russia, is, very, interesting, ., My_life, life_in, in_Russia, Russia_is, is_very, very_interesting, interesting_., My_life_in, life_in_Russia, in_Russia_is, Russia_is_very

In [82]:
ngrams_nonCum = NGramGenerator() \
            .setInputCols(["token"]) \
            .setOutputCol("ngrams_v2") \
            .setN(3) \
            .setEnableCumulative(False)\
            .setDelimiter("_") # Default is space
    
ngrams_nonCum.transform(result).select('ngrams_v2.result').show(truncate=200)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                                                                                  result|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                   [Peter_is_a, is_a_very, a_very_good, very_good_person, good_person_.]|
|                                                                                                     [My_life_in, life_in_Russia, in_Russia_is, Russia_is_very, is_very_interesting, very_i

### Text Matcher

In [83]:
entity_extractor = TextMatcher() \
    .setInputCols(["document",'token'])\
    .setOutputCol("matched_entities")\

entity_extractor.extractParamMap()

{Param(parent='TextMatcher_920c710729f3', name='caseSensitive', doc='whether to match regardless of case. Defaults true'): True,
 Param(parent='TextMatcher_920c710729f3', name='inputCols', doc='previous annotations columns, if renamed'): ['document',
  'token'],
 Param(parent='TextMatcher_920c710729f3', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='TextMatcher_920c710729f3', name='mergeOverlapping', doc='whether to merge overlapping matched chunks. Defaults false'): False,
 Param(parent='TextMatcher_920c710729f3', name='outputCol', doc='output annotation column. can be left default.'): 'matched_entities'}

In [84]:
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv

news_df = spark.read \
      .option("header", True) \
      .csv("news_category_train.csv")


In [85]:
news_df.show(5, truncate=50)

+--------+--------------------------------------------------+
|category|                                       description|
+--------+--------------------------------------------------+
|Business| Short sellers, Wall Street's dwindling band of...|
|Business| Private investment firm Carlyle Group, which h...|
|Business| Soaring crude prices plus worries about the ec...|
|Business| Authorities have halted oil export flows from ...|
|Business| Tearaway world oil prices, toppling records an...|
+--------+--------------------------------------------------+
only showing top 5 rows



In [86]:
# write the target entities to txt file 

entities = ['Wall Street', 'USD', 'stock', 'NYSE']
with open ('financial_entities.txt', 'w') as f:
    for i in entities:
        f.write(i+'\n')


entities = ['soccer', 'world cup', 'Messi', 'FC Barcelona']
with open ('sport_entities.txt', 'w') as f:
    for i in entities:
        f.write(i+'\n')

In [87]:

documentAssembler = DocumentAssembler()\
.setInputCol("description")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

financial_entity_extractor = TextMatcher() \
    .setInputCols(["document",'token'])\
    .setOutputCol("financial_entities")\
    .setEntities("financial_entities.txt")\
    .setCaseSensitive(False)\
    .setEntityValue('financial_entity')

sport_entity_extractor = TextMatcher() \
    .setInputCols(["document",'token'])\
    .setOutputCol("sport_entities")\
    .setEntities("sport_entities.txt")\
    .setCaseSensitive(False)\
    .setEntityValue('sport_entity')


nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 financial_entity_extractor,
 sport_entity_extractor
 ])

empty_df = spark.createDataFrame([['']]).toDF("description")

pipelineModel = nlpPipeline.fit(empty_df)

In [88]:
result = pipelineModel.transform(news_df)

In [89]:
result.select('financial_entities.result','sport_entities.result').take(2)

[Row(result=[], result=[]), Row(result=[], result=[])]

In [90]:
result.select('description','financial_entities.result','sport_entities.result')\
.toDF('text','financial_matches','sport_matches').filter((F.size('financial_matches')>1) | (F.size('sport_matches')>1))\
.show(truncate=70)


+----------------------------------------------------------------------+----------------------------------+-------------------+
|                                                                  text|                 financial_matches|      sport_matches|
+----------------------------------------------------------------------+----------------------------------+-------------------+
|"Company launched the biggest electronic auction of stock in Wall S...|              [stock, Wall Street]|                 []|
|Google, Inc. significantly cut the expected share price for its ini...|                    [stock, stock]|                 []|
|Google, Inc. significantly cut the expected share price this mornin...|                    [stock, stock]|                 []|
| Shares of Air Canada  (AC.TO) fell by more than half on Wednesday,...|                    [Stock, stock]|                 []|
|Stock prices are lower in moderate trading. The Dow Jones Industria...|                    [Stock, Stoc

In [91]:
#result_df = result.select(F.explode(F.arrays_zip('financial_entities.result', 'financial_entities.begin',  'financial_entities.end')).alias("cols")) \
#.select(F.expr("cols['0']").alias("clinical_entities"),
#        F.expr("cols['1']").alias("begin"),
#        F.expr("cols['2']").alias("end")).toPandas()

#result_df.head(10)

### RegEx Matcher

In [92]:
! wget -q	https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed-sample.csv

pubMedDF = spark.read\
                .option("header", "true")\
                .csv("./pubmed-sample.csv")\
                .filter("AB IS NOT null")\
                .withColumnRenamed("AB", "text")\
                .drop("TI")

pubMedDF.show(truncate=50)

+--------------------------------------------------+
|                                              text|
+--------------------------------------------------+
|The human KCNJ9 (Kir 3.3, GIRK3) is a member of...|
|BACKGROUND: At present, it is one of the most i...|
|OBJECTIVE: To investigate the relationship betw...|
|Combined EEG/fMRI recording has been used to lo...|
|Kohlschutter syndrome is a rare neurodegenerati...|
|Statistical analysis of neuroimages is commonly...|
|The synthetic DOX-LNA conjugate was characteriz...|
|Our objective was to compare three different me...|
|We conducted a phase II study to assess the eff...|
|"Monomeric sarcosine oxidase (MSOX) is a flavoe...|
|We presented the tachinid fly Exorista japonica...|
|The literature dealing with the water conductin...|
|A novel approach to synthesize chitosan-O-isopr...|
|An HPLC-ESI-MS-MS method has been developed for...|
|The localizing and lateralizing values of eye a...|
|OBJECTIVE: To evaluate the effectiveness and 

In [93]:
rules = '''
renal\s\w+, started with 'renal'
cardiac\s\w+, started with 'cardiac'
\w*ly\b, ending with 'ly'
\S*\d+\S*, match any word that contains numbers
(\d+).?(\d*)\s*(mg|ml|g), match medication metrics
'''

with open('regex_rules.txt', 'w') as f:
    
    f.write(rules)

In [94]:
RegexMatcher().extractParamMap()

{Param(parent='RegexMatcher_4a7ee8ff5b57', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='RegexMatcher_4a7ee8ff5b57', name='strategy', doc='MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE'): 'MATCH_ALL'}

In [95]:
import os

documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

regex_matcher = RegexMatcher()\
    .setInputCols('document')\
    .setStrategy("MATCH_ALL")\
    .setOutputCol("regex_matches")\
    .setExternalRules(path='./regex_rules.txt', delimiter=',')
    

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 regex_matcher
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

match_df = pipelineModel.transform(pubMedDF)

match_df.select('regex_matches.result').take(3)

[Row(result=['inwardly', 'family', 'spansapproximately', 'byapproximately', 'approximately', 'respectively', 'poly', 'KCNJ9', '3.3,', 'GIRK3)', 'KCNJ9', '1q21-23', '7.6', '2.2', '2.6', 'identified14', 'aVal366Ala', '8', 'KCNJ9', 'KCNJ9', '9 g']),
 Row(result=['previously', 'previously', 'intravenously', 'previously', '25', 'mg/m(2)', '1', '8', 'a3', '50', '20.0%', '(10', '50;', '95%', 'interval,10.0-33.7%).', '58.0%', '[10', '18', '50].', '(50%', '115.0', '17.3%', '52).', '25 mg']),
 Row(result=['renal failure', 'cardiac surgery', 'cardiac surgery', 'cardiac surgical', 'early', 'statistically', 'analy', '1995', '2005', '=9796).', '2.9', '11years).', '11.3%', '1105),', '7.2%', '30%', '0.0001),', '1.55,95%', '1.42-1.70,', '0.0001).'])]

In [96]:
match_df.select('text','regex_matches.result')\
.toDF('text','matches').filter(F.size('matches')>1)\
.show(truncate=70)

+----------------------------------------------------------------------+----------------------------------------------------------------------+
|                                                                  text|                                                               matches|
+----------------------------------------------------------------------+----------------------------------------------------------------------+
|The human KCNJ9 (Kir 3.3, GIRK3) is a member of the G-protein-activ...|[inwardly, family, spansapproximately, byapproximately, approximate...|
|BACKGROUND: At present, it is one of the most important issues for ...|[previously, previously, intravenously, previously, 25, mg/m(2), 1,...|
|OBJECTIVE: To investigate the relationship between preoperative atr...|[renal failure, cardiac surgery, cardiac surgery, cardiac surgical,...|
|Combined EEG/fMRI recording has been used to localize the generator...|[normally, significantly, effectively, analy, only, considerably

### Date Matcher

In [97]:
MultiDateMatcher().extractParamMap()

{Param(parent='MultiDateMatcher_4f4f8929861c', name='dateFormat', doc='desired format for dates extracted'): 'yyyy/MM/dd',
 Param(parent='MultiDateMatcher_4f4f8929861c', name='defaultDayWhenMissing', doc='which day to set when it is missing from parsed input'): 1,
 Param(parent='MultiDateMatcher_4f4f8929861c', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='MultiDateMatcher_4f4f8929861c', name='readMonthFirst', doc='Whether to parse july 07/05/2015 or as 05/07/2015'): True}

In [98]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

date_matcher = MultiDateMatcher() \
    .setInputCols('document') \
    .setOutputCol("date") \
    .setDateFormat("yyyy/MM/dd")
        
date_pipeline = PipelineModel(stages=[
 documentAssembler, 
 date_matcher
 ])

sample_df = spark.createDataFrame([['I saw him yesterday and he told me that he will visit us next week']]).toDF("text")

result = date_pipeline.transform(sample_df)

result.select('date.result').show(truncate=False)

+------------------------+
|result                  |
+------------------------+
|[2022/04/06, 2022/03/29]|
+------------------------+



## Text Cleaning with UDF

In [99]:
text = '<h1 style="color: #5e9ca0;">Have a great <span  style="color: #2b2301;">birth</span> day!</h1>'

text_df = spark.createDataFrame([[text]]).toDF("text")

import re
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

clean_text = lambda s: re.sub(r'<[^>]*>', '', s)

text_df.withColumn('cleaned', udf(clean_text, StringType())('text')).select('text','cleaned').show(truncate= False)

+----------------------------------------------------------------------------------------------+-----------------------+
|text                                                                                          |cleaned                |
+----------------------------------------------------------------------------------------------+-----------------------+
|<h1 style="color: #5e9ca0;">Have a great <span  style="color: #2b2301;">birth</span> day!</h1>|Have a great birth day!|
+----------------------------------------------------------------------------------------------+-----------------------+



In [100]:
find_not_alnum_count = lambda s: len([i for i in s if not i.isalnum() and i!=' '])

find_not_alnum_count("it's your birth day!")

2

In [101]:
text = '<h1 style="color: #5e9ca0;">Have a great <span  style="color: #2b2301;">birth</span> day!</h1>'

find_not_alnum_count(text)

23

In [102]:
text_df.withColumn('cleaned', udf(find_not_alnum_count, IntegerType())('text')).select('text','cleaned').show(truncate= False)

+----------------------------------------------------------------------------------------------+-------+
|text                                                                                          |cleaned|
+----------------------------------------------------------------------------------------------+-------+
|<h1 style="color: #5e9ca0;">Have a great <span  style="color: #2b2301;">birth</span> day!</h1>|23     |
+----------------------------------------------------------------------------------------------+-------+



### FINISHER

***Finisher:*** Once we have our NLP pipeline ready to go, we might want to use our annotation results somewhere else where it is easy to use. The Finisher outputs annotation(s) values into a string.

If we just want the desired output column in the final dataframe, we can use Finisher to drop previous stages in the final output and get the `result` from the process.

This is very handy when you want to use the output from Spark NLP annotator as an input to another Spark ML transformer.

Settable parameters are:

`setInputCols()`

`setOutputCols()`

`setCleanAnnotations(True)` -> Whether to remove intermediate annotations

`setValueSplitSymbol(“#”)` -> split values within an annotation character

`setAnnotationSplitSymbol(“@”)` -> split values between annotations character

`setIncludeMetadata(False)` -> Whether to include metadata keys. Sometimes useful in some annotations.

`setOutputAsArray(False)` -> Whether to output as Array. Useful as input for other Spark transformers.

In [103]:
finisher = Finisher() \
    .setInputCols(["regex_matches"]) \
    .setIncludeMetadata(False) # set to False to remove metadata

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 regex_matcher,
 finisher
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

match_df = pipelineModel.transform(pubMedDF)

match_df.show(truncate = 50)

+--------------------------------------------------+--------------------------------------------------+
|                                              text|                            finished_regex_matches|
+--------------------------------------------------+--------------------------------------------------+
|The human KCNJ9 (Kir 3.3, GIRK3) is a member of...|[inwardly, family, spansapproximately, byapprox...|
|BACKGROUND: At present, it is one of the most i...|[previously, previously, intravenously, previou...|
|OBJECTIVE: To investigate the relationship betw...|[renal failure, cardiac surgery, cardiac surger...|
|Combined EEG/fMRI recording has been used to lo...|[normally, significantly, effectively, analy, o...|
|Kohlschutter syndrome is a rare neurodegenerati...|                                          [family]|
|Statistical analysis of neuroimages is commonly...|[analy, commonly, overly, normally, thatsuccess...|
|The synthetic DOX-LNA conjugate was characteriz...|            

In [104]:
match_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- finished_regex_matches: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [105]:
match_df.filter(F.size('finished_regex_matches')>2).show(truncate = 50)

+--------------------------------------------------+--------------------------------------------------+
|                                              text|                            finished_regex_matches|
+--------------------------------------------------+--------------------------------------------------+
|The human KCNJ9 (Kir 3.3, GIRK3) is a member of...|[inwardly, family, spansapproximately, byapprox...|
|BACKGROUND: At present, it is one of the most i...|[previously, previously, intravenously, previou...|
|OBJECTIVE: To investigate the relationship betw...|[renal failure, cardiac surgery, cardiac surger...|
|Combined EEG/fMRI recording has been used to lo...|[normally, significantly, effectively, analy, o...|
|Statistical analysis of neuroimages is commonly...|[analy, commonly, overly, normally, thatsuccess...|
|Our objective was to compare three different me...|[daily, only, Conversely, Hourly, hourly, Hourl...|
|We conducted a phase II study to assess the eff...|[analy, resp

### LIGHTPIPELINE

LightPipelines are Spark NLP specific Pipelines, equivalent to Spark ML Pipeline, but meant to deal with smaller amounts of data. They’re useful working with small datasets, debugging results, or when running either training or prediction from an API that serves one-off requests.

Spark NLP LightPipelines are Spark ML pipelines converted into a single machine but the multi-threaded task, becoming more than 10x times faster for smaller amounts of data (small is relative, but 50k sentences are roughly a good maximum). To use them, we simply plug in a trained (fitted) pipeline and then annotate a plain text. We don't even need to convert the input text to DataFrame in order to feed it into a pipeline that's accepting DataFrame as an input in the first place. This feature would be quite useful when it comes to getting a prediction for a few lines of text from a trained ML model.

 **It is nearly 20x faster than using Spark ML Pipeline**

`LightPipeline(someTrainedPipeline).annotate(someStringOrArray)`

In [113]:
documentAssembler = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 lemmatizer
 ])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

pipelineModel.transform(spark_df).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                stem|               lemma|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[token, 0, 4, pe...|[[token, 0, 4, Pe...|
|My life in Russia...|[[document, 0, 37...|[[token, 0, 1, My...|[[token, 0, 1, my...|[[token, 0, 1, My...|
|John and Peter ar...|[[document, 0, 76...|[[token, 0, 3, Jo...|[[token, 0, 3, jo...|[[token, 0, 3, Jo...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[token, 0, 4, Lu...|[[token, 0, 4, lu...|[[token, 0, 4, Lu...|
|Europe is very cu...|[[document, 0, 68...|[[token, 0, 5, Eu...|[[token, 0, 5, eu...|[[token, 0, 5, Eu...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [114]:
from sparknlp.base import LightPipeline

light_model = LightPipeline(pipelineModel)

light_result = light_model.annotate("John and Peter are brothers. However they don't support each other that much.")

In [115]:
light_result.keys()

dict_keys(['document', 'token', 'stem', 'lemma'])

In [116]:
list(zip(light_result['token'], light_result['stem'], light_result['lemma']))

[('John', 'john', 'John'),
 ('and', 'and', 'and'),
 ('Peter', 'peter', 'Peter'),
 ('are', 'ar', 'be'),
 ('brothers', 'brother', 'brother'),
 ('.', '.', '.'),
 ('However', 'howev', 'However'),
 ('they', 'thei', 'they'),
 ("don't", "don't", "don't"),
 ('support', 'support', 'support'),
 ('each', 'each', 'each'),
 ('other', 'other', 'other'),
 ('that', 'that', 'that'),
 ('much', 'much', 'much'),
 ('.', '.', '.')]

In [117]:
light_result = light_model.fullAnnotate("John and Peter are brothers. However they don't support each other that much.")

In [118]:
light_result

[{'document': [Annotation(document, 0, 76, John and Peter are brothers. However they don't support each other that much., {})],
  'lemma': [Annotation(token, 0, 3, John, {'sentence': '0'}),
   Annotation(token, 5, 7, and, {'sentence': '0'}),
   Annotation(token, 9, 13, Peter, {'sentence': '0'}),
   Annotation(token, 15, 17, be, {'sentence': '0'}),
   Annotation(token, 19, 26, brother, {'sentence': '0'}),
   Annotation(token, 27, 27, ., {'sentence': '0'}),
   Annotation(token, 29, 35, However, {'sentence': '0'}),
   Annotation(token, 37, 40, they, {'sentence': '0'}),
   Annotation(token, 42, 46, don't, {'sentence': '0'}),
   Annotation(token, 48, 54, support, {'sentence': '0'}),
   Annotation(token, 56, 59, each, {'sentence': '0'}),
   Annotation(token, 61, 65, other, {'sentence': '0'}),
   Annotation(token, 67, 70, that, {'sentence': '0'}),
   Annotation(token, 72, 75, much, {'sentence': '0'}),
   Annotation(token, 76, 76, ., {'sentence': '0'})],
  'stem': [Annotation(token, 0, 3, john

In [119]:
text_list= ["How did serfdom develop in and then leave Russia ?",
"There will be some exciting breakthroughs in NLP this year."]

light_model.annotate(text_list)

[{'document': ['How did serfdom develop in and then leave Russia ?'],
  'lemma': ['How',
   'do',
   'serfdom',
   'develop',
   'in',
   'and',
   'then',
   'leave',
   'Russia',
   '?'],
  'stem': ['how',
   'did',
   'serfdom',
   'develop',
   'in',
   'and',
   'then',
   'leav',
   'russia',
   '?'],
  'token': ['How',
   'did',
   'serfdom',
   'develop',
   'in',
   'and',
   'then',
   'leave',
   'Russia',
   '?']},
 {'document': ['There will be some exciting breakthroughs in NLP this year.'],
  'lemma': ['There',
   'will',
   'be',
   'some',
   'exciting',
   'breakthrough',
   'in',
   'NLP',
   'this',
   'year',
   '.'],
  'stem': ['there',
   'will',
   'be',
   'some',
   'excit',
   'breakthrough',
   'in',
   'nlp',
   'thi',
   'year',
   '.'],
  'token': ['There',
   'will',
   'be',
   'some',
   'exciting',
   'breakthroughs',
   'in',
   'NLP',
   'this',
   'year',
   '.']}]

**important note:** When you use Finisher in your pipeline, regardless of setting `cleanAnnotations` to False or True, LigtPipeline will only return the finished columns.