# **Import Libraries**



In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *


In [3]:
# Start spark session
def start(gpu):
    """Build (or fetch) a local Spark session configured for Spark NLP 2.5.1.

    Args:
        gpu: when truthy, load the GPU build of Spark NLP
            (``spark-nlp-gpu_2.11``); otherwise load the CPU build.

    Returns:
        The session returned by ``builder.getOrCreate()``.
    """
    builder = SparkSession.builder \
        .appName("Spark NLP") \
        .master("local[*]") \
        .config("spark.driver.memory", "8G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .config("spark.kryoserializer.buffer.max", "1000M")

    # Only the artifact name differs between the GPU and CPU builds.
    if gpu:
        spark_nlp_package = "com.johnsnowlabs.nlp:spark-nlp-gpu_2.11:2.5.1"
    else:
        spark_nlp_package = "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.1"
    builder = builder.config("spark.jars.packages", spark_nlp_package)

    return builder.getOrCreate()


# The session is created exactly once, through start(). A preceding
# sparknlp.start() call would leave a running session behind, and
# getOrCreate() would then hand that session back without applying the
# memory, serializer and jar settings configured above.
gpu_access = False
spark = start(gpu=gpu_access)

# **Read Data**

In [5]:
from sparknlp.training import CoNLL

# Location of the CoNLL-formatted address dataset, relative to the notebook.
training_data_path = '../data/CoNLL_addresses.txt'

# Parse the CoNLL file into a Spark DataFrame using the active session.
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, training_data_path)

# Preview the first three rows.
training_data.show(3)



+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|6557 ORLY DORVAL ...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 3, 65...|[[pos, 0, 3, NNP,...|[[named_entity, 0...|
|Level 14 BURJ DAM...|[[document, 0, 36...|[[document, 0, 36...|[[token, 0, 4, Le...|[[pos, 0, 4, NNP,...|[[named_entity, 0...|
|HSBC Bank Middle ...|[[document, 0, 51...|[[document, 0, 51...|[[token, 0, 3, HS...|[[pos, 0, 3, NNP,...|[[named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



# **Word embeddings**

 - BERT embeddings (look at BERT-as-a-service, essential feature extraction): different layers in BERT capture different information. `setPoolingLayer(0)` gives the first layer. That can be changed to anywhere in [-1, -12], depending on what information you want to capture. -1 gives information biased towards the training output, whereas -12 gives information close to the model's training input, i.e. BERT adds close to no information to the embeddings.
 - Consider multilingual and ELMO embeddings. 

In [7]:
# Pretrained English BERT annotator: reads the "sentence" and "token"
# columns and writes its vectors to the "embeddings" column.
# NOTE(review): 'bert_base_cased' is loaded with case sensitivity switched
# off -- confirm this is intended before changing it, and keep any change in
# sync with the annotator used in the prediction pipeline.
bert_annotator = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
    .setPoolingLayer(0)
)

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [8]:
# Run the BERT annotator over the training frame and rebind `training_data`
# to the result.
# NOTE(review): bert_annotator is also the first stage of NER_pipeline, so the
# embeddings look like they are computed again when that pipeline is fitted
# and applied -- confirm whether this pre-transform is still needed.
training_data = bert_annotator.transform(training_data)
# Equivalent transform for a held-out set; disabled (no `test_data` is loaded
# in the cells shown here).
#test_data = bert_annotator.transform(test_data)

# **Train NER deep learning model**

 - `NerDLApproach()` trains Char CNNs - BiLSTM - CRF. ([Read more here.](https://arxiv.org/pdf/1603.01354.pdf))
 - Can experiment and build our own deep learning models in `tensorflow` and add the graph into spark-nlp lib. (Look more into how this is done.)
 

In [9]:
# Trainable NER tagger (Char CNN - BiLSTM - CRF).
nerTagger = (
    NerDLApproach()
    # The third input must be the column the BERT annotator writes to
    # ("embeddings"); there is no "bert" column in the data, and the reloaded
    # model in the prediction pipeline reads "embeddings" as well.
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    # A single epoch keeps this demo run short.
    .setMaxEpochs(1)
    .setLr(0.001)
    # Learning-rate decay coefficient (see the Spark NLP NerDLApproach docs).
    .setPo(0.005)
    .setBatchSize(8)
    # Fixed seed so repeated runs are comparable.
    .setRandomSeed(0)
    .setVerbose(1)
    # Hold out 20% of the training rows for validation metrics.
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # Folder searched for the TensorFlow graph used by the tagger.
    .setGraphFolder("graph")
)

# Two-stage training pipeline: BERT embeddings first, then the NER tagger.
ner_stages = [bert_annotator, nerTagger]
NER_pipeline = Pipeline(stages=ner_stages)

# Fit on at most 1000 rows of the training frame.
training_subset = training_data.limit(1000)
Ner_model = NER_pipeline.fit(training_subset)

# **Save the model**

In [10]:
path_to_model = 'NER_model1'

# Stage 0 of the fitted pipeline is the BERT annotator and stage 1 is the
# trained NER model; only the latter is written to disk, replacing any
# earlier save at the same path.
trained_ner_stage = Ner_model.stages[1]
trained_ner_stage.write().overwrite().save(path_to_model)

# **Prediction**

1. On training data
2. On test data

In [11]:
import pyspark.sql.functions as F

# Apply the fitted pipeline to the training frame.
predictions = Ner_model.transform(training_data)

# Zip tokens, gold labels and predicted labels position by position, then
# explode so each token becomes its own row.
zipped_tokens = F.explode(
    F.arrays_zip('token.result', 'label.result', 'ner.result')
).alias("cols")

token_comparison = predictions.select(zipped_tokens).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
)
token_comparison.show(truncate=False)

+------+------------+----------+
|token |ground_truth|prediction|
+------+------------+----------+
|6557  |B-House     |I-Street  |
|ORLY  |B-Street    |O         |
|DORVAL|I-Street    |I-Street  |
|Quebec|B-State     |B-City    |
|CANADA|B-Country   |I-Street  |
|H9P   |B-Postcode  |I-Postcode|
|1G1   |I-Postcode  |O         |
|Level |B-House     |I-Street  |
|14    |I-House     |O         |
|BURJ  |B-Street    |I-Street  |
|DAMAN |I-Street    |I-Street  |
|DIFC  |I-Street    |I-Street  |
|DUBAI |B-City      |O         |
|UAE   |B-Country   |B-Country |
|AE    |O           |O         |
|HSBC  |O           |I-Postcode|
|Bank  |O           |I-Street  |
|Middle|O           |I-Street  |
|East  |O           |I-Street  |
|EMMAR |B-Street    |I-Street  |
+------+------------+----------+
only showing top 20 rows



# Prediction Pipeline

In [12]:
# Raw text -> "document" annotation.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence" annotations.
sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

# "sentence" -> "token" annotations.
token = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Embeddings for inference. The settings mirror the annotator used for
# training (same pretrained model, columns, case handling and pooling layer)
# so the loaded NER model sees the kind of vectors it was trained on.
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
    # Set explicitly rather than relying on the library default, to match
    # the pooling layer chosen for the training annotator.
    .setPoolingLayer(0)
)

# Reload the NER model saved earlier and wire it to the inference columns.
loaded_ner_model = (
    NerDLModel.load(path_to_model)
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Converts the per-token tags in "ner" into the "ner_span" column.
converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_span")
)

# End-to-end inference pipeline: raw text in, NER spans out.
prediction_stages = [
    document,
    sentence,
    token,
    bert,
    loaded_ner_model,
    converter,
]
ner_prediction_pipeline = Pipeline(stages=prediction_stages)

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [13]:
# A one-row frame holding an empty string is used only to turn the Pipeline
# into a PipelineModel; the NER stage is the already-trained model loaded
# from disk.
empty_rows = [['']]
empty_data = spark.createDataFrame(empty_rows).toDF("text")
prediction_model = ner_prediction_pipeline.fit(empty_data)

# **Test on new examples**

In [28]:
# A single free-text address to run through the prediction pipeline.
text = "70 york street toronto ontario Canada l2n 8f4"

sample_rows = [[text]]
sample_data = spark.createDataFrame(sample_rows).toDF("text")
sample_data.show()

+--------------------+
|                text|
+--------------------+
|70 york street to...|
+--------------------+



In [29]:
import pyspark.sql.functions as F

# Run the prediction pipeline on the sample address.
testpreds = prediction_model.transform(sample_data)

# Pair each token with its predicted tag and show one row per token.
token_with_tag = F.explode(
    F.arrays_zip('token.result', 'ner.result')
).alias("cols")

sample_predictions = testpreds.select(token_with_tag).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("prediction"),
)
sample_predictions.show(truncate=False)

+-------+----------+
|token  |prediction|
+-------+----------+
|70     |I-Street  |
|york   |I-Street  |
|street |I-Street  |
|toronto|B-City    |
|ontario|B-City    |
|Canada |I-Street  |
|l2n    |B-Postcode|
|8f4    |I-Street  |
+-------+----------+

