In [0]:
!export JAVA_HOME=$(/usr/libexec/java_home -v 1.8)
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.3

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.4.1

In [0]:

import sparknlp 

# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use, one "<label> <version>" line each.
version_report = (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
)
for label, version in version_report:
    print(label, version)

In [0]:
from pyspark import SparkContext, SQLContext
from sparknlp.pretrained import NerDLModel
from sparknlp.pretrained import PretrainedPipeline

In [0]:
import pyspark
from pyspark.sql import functions as sf

# Location of the JSON input.
# NOTE(review): this string reads like a pointer to where the file is hosted
# rather than a loadable path; presumably it is swapped for a local copy
# before running. Confirm.
path = "File Available @ https://github.com/saicharannivarthi/spark_nlp_issue_resources/blob/master/input_data.json"

oldArticlesData = spark.read.json(path)

# Build the `text` column the pipeline consumes: "<title> . <content>".
title_and_content = sf.concat(sf.col('title'), sf.lit(' . '), sf.col('content'))
inputData = oldArticlesData.withColumn('text', title_and_content)

In [0]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

In [0]:
# Raw `text` column -> `document` annotation.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# `document` -> `sentence` annotations.
sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

# `sentence` -> `token` annotations. Every downstream stage that needs a
# sentence-level input reads this same `sentence` column so that tokens,
# embeddings and NER tags all refer to the same units.
regexTokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

# Normalized copy of the tokens. Kept so the `normal` column is still present
# in the output, but it is deliberately NOT fed into the NER stages: the tags
# are predicted on `token`, so the converter must read `token` as well.
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normal")

# BERT embeddings for each token, computed on the tokenizer's sentences.
embeddings_bert = BertEmbeddings.pretrained("bert_base_cased", lang="en") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol('embeddings')

# Pretrained NER tagger. `pretrained` is called on the class (as for
# BertEmbeddings above) instead of on a throwaway `NerDLModel()` instance.
ner_bert = NerDLModel.pretrained('ner_dl_bert') \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_dl_bert")

# Turn the per-token tags into entity chunks. Inputs are the same sentence
# and token columns the tagger consumed, so tags and tokens line up.
nerConverter_bert = NerConverter() \
    .setInputCols(["sentence", "token", "ner_dl_bert"]) \
    .setOutputCol("ner_converter_bert")

# Finish the `token` column while keeping the annotation columns in the
# output (cleanAnnotations=False); later cells select `ner_converter_bert`.
finisher = Finisher() \
    .setInputCols(["token"]) \
    .setCleanAnnotations(False)

# document -> sentence -> token -> embeddings -> ner -> normalizer
#   -> nerConverter -> finisher
custom_pipeline_bert = Pipeline() \
    .setStages([
        documentAssembler,
        sentenceDetector,
        regexTokenizer,
        embeddings_bert,
        ner_bert,
        normalizer,
        nerConverter_bert,
        finisher
    ])



In [0]:
# Fit the pipeline on the input frame, then annotate that same frame.
fitted_pipeline_bert = custom_pipeline_bert.fit(inputData)
result = fitted_pipeline_bert.transform(inputData)

In [0]:
# Keep only the converter's metadata and result fields.
ner_output_fields = ['ner_converter_bert.metadata', 'ner_converter_bert.result']
bert_custom_ner = result.select(*ner_output_fields)

In [0]:
# Collapse to one partition, then write the selection out as JSON.
output_path = 'Output available @ https://github.com/saicharannivarthi/spark_nlp_issue_resources/blob/master/pyspark_output.json'
single_partition_ner = bert_custom_ner.coalesce(1)
single_partition_ner.write.format('json').save(output_path)