# NerDL
It is an annotator that trains on top of TensorFlow.

1) First step: initialize Spark NLP

2) Prepare the files required to train the NER model (read a trainable CoNLL-format file)

3) Create a pipeline

4) Train a model and create the DL graph (specific to NerDL only)

5) Save the trained model

6) Read the output metrics and run the prediction pipeline

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

# First Step

In [2]:
# Create the Spark session through Spark NLP's start() helper.
spark = sparknlp.start()

In [3]:
# Report the library and engine versions this notebook runs against.
print(f"Spark NLP version:  {sparknlp.version()}")
print(f"Apache Spark version:  {spark.version}")

Spark NLP version:  2.5.3
Apache Spark version:  2.4.5


In [4]:

def start(gpu):
    """Build (or fetch) a local Spark session configured for Spark NLP.

    Args:
        gpu: truthy to request the GPU build of the Spark NLP package
            (``spark-nlp-gpu_2.11``), falsy for the CPU build (``spark-nlp_2.11``).

    Returns:
        Whatever ``SparkSession.builder ... .getOrCreate()`` returns.
    """
    # Decide on the Maven coordinate up front; both builds are pinned to 2.5.1.
    if gpu:
        nlp_package = "com.johnsnowlabs.nlp:spark-nlp-gpu_2.11:2.5.1"
    else:
        nlp_package = "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.1"

    session_builder = SparkSession.builder
    session_builder = session_builder.appName("Spark NLP")
    session_builder = session_builder.master("local[*]")
    session_builder = session_builder.config("spark.driver.memory", "8G")
    session_builder = session_builder.config(
        "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
    )
    session_builder = session_builder.config("spark.kryoserializer.buffer.max", "1000M")

    # The package option is applied in place on the builder, as in the original code.
    session_builder.config("spark.jars.packages", nlp_package)

    return session_builder.getOrCreate()

# Build the session with the local helper; gpu_access selects the GPU or CPU package.
# NOTE(review): sparknlp.start() already created a session earlier in this notebook, so
# getOrCreate() presumably returns that same session -- confirm these settings take effect.
gpu_access=False  
spark = start(gpu=gpu_access)

# Second Step

In [5]:
# No extra install is needed for this step: the CoNLL reader used in this notebook is
# imported from Spark NLP itself (sparknlp.training.CoNLL), so the previous unpinned
# `!pip install CoNLL` was dropped.



In [6]:
# Spark NLP's CoNLL helper reads a trainable CoNLL file into a DataFrame with
# text / document / sentence / token / pos / label columns.
from sparknlp.training import CoNLL

conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, 'CoNLL_addresses.txt')
training_data.show(n=2)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|52 Main St. Unit ...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 1, 52...|[[pos, 0, 1, CD, ...|[[named_entity, 0...|
|203 738 east 29th...|[[document, 0, 23...|[[document, 0, 23...|[[token, 0, 2, 20...|[[pos, 0, 2, CD, ...|[[named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [14]:
# Peek at the raw text of the first training row.
training_data.select('text').limit(1).collect()

[Row(text='52 Main St. Unit 3a Toronto ON N6C 4E9')]

In [15]:
# Inspect the gold named-entity annotations (one per token) of the first row.
training_data.select('label').limit(1).collect()

[Row(label=[Row(annotatorType='named_entity', begin=0, end=1, result='O', metadata={'word': '52'}, embeddings=[]), Row(annotatorType='named_entity', begin=3, end=6, result='B-road', metadata={'word': 'Main'}, embeddings=[]), Row(annotatorType='named_entity', begin=8, end=10, result='B-road', metadata={'word': 'St.'}, embeddings=[]), Row(annotatorType='named_entity', begin=12, end=15, result='O', metadata={'word': 'Unit'}, embeddings=[]), Row(annotatorType='named_entity', begin=17, end=18, result='B-houseNumber', metadata={'word': '3a'}, embeddings=[]), Row(annotatorType='named_entity', begin=20, end=26, result='O', metadata={'word': 'Toronto'}, embeddings=[]), Row(annotatorType='named_entity', begin=28, end=29, result='O', metadata={'word': 'ON'}, embeddings=[]), Row(annotatorType='named_entity', begin=31, end=33, result='O', metadata={'word': 'N6C'}, embeddings=[]), Row(annotatorType='named_entity', begin=35, end=37, result='O', metadata={'word': '4E9'}, embeddings=[])])]

In [16]:
# Inspect the token annotations of the first row.
training_data.select('token').limit(1).collect()

[Row(token=[Row(annotatorType='token', begin=0, end=1, result='52', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=3, end=6, result='Main', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=8, end=10, result='St.', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=12, end=15, result='Unit', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=17, end=18, result='3a', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=20, end=26, result='Toronto', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=28, end=29, result='ON', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=31, end=33, result='N6C', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=35, end=37, result='4E9', metadata={'sentence': '0'}, embeddings=[])])]

In [17]:
# Inspect the part-of-speech annotations of the first row.
training_data.select('pos').limit(1).collect()

[Row(pos=[Row(annotatorType='pos', begin=0, end=1, result='CD', metadata={'word': '52'}, embeddings=[]), Row(annotatorType='pos', begin=3, end=6, result='NNP', metadata={'word': 'Main'}, embeddings=[]), Row(annotatorType='pos', begin=8, end=10, result='NNP', metadata={'word': 'St.'}, embeddings=[]), Row(annotatorType='pos', begin=12, end=15, result='NNP', metadata={'word': 'Unit'}, embeddings=[]), Row(annotatorType='pos', begin=17, end=18, result='CD', metadata={'word': '3a'}, embeddings=[]), Row(annotatorType='pos', begin=20, end=26, result='NNP', metadata={'word': 'Toronto'}, embeddings=[]), Row(annotatorType='pos', begin=28, end=29, result='NNP', metadata={'word': 'ON'}, embeddings=[]), Row(annotatorType='pos', begin=31, end=33, result='NNP', metadata={'word': 'N6C'}, embeddings=[]), Row(annotatorType='pos', begin=35, end=37, result='CD', metadata={'word': '4E9'}, embeddings=[])])]

In [18]:
# Show the sentence annotations for up to five rows.
training_data.select('sentence').limit(5).collect()

[Row(sentence=[Row(annotatorType='document', begin=0, end=37, result='52 Main St. Unit 3a Toronto ON N6C 4E9', metadata={'sentence': '0'}, embeddings=[])]),
 Row(sentence=[Row(annotatorType='document', begin=0, end=23, result='203 738 east 29th avenue', metadata={'sentence': '0'}, embeddings=[])])]

# Using the BERT annotator to transform the training data and get the embeddings

In [7]:
# Pretrained BERT embeddings computed from the sentence and token columns; output column "bert".
# NOTE(review): a *cased* checkpoint is loaded while case sensitivity is switched off --
# confirm this combination is intended.
bert_annotator = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(False)
    .setPoolingLayer(0)
)

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [8]:
# Add the "bert" embeddings column to the training frame (rebinds training_data).
training_data = bert_annotator.transform(training_data)

In [9]:
# Preview two rows to check that the "bert" column is now present.
training_data.show(2)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|52 Main St. Unit ...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 1, 52...|[[pos, 0, 1, CD, ...|[[named_entity, 0...|[[word_embeddings...|
|203 738 east 29th...|[[document, 0, 23...|[[document, 0, 23...|[[token, 0, 2, 20...|[[pos, 0, 2, CD, ...|[[named_entity, 0...|[[word_embeddings...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [24]:
# Show the schema of the "bert" column on its own.
training_data.select('bert')

DataFrame[bert: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [25]:
# Side by side: the tokens as emitted by BERT, their embedding vectors, and the gold labels.
training_data.select("bert.result","bert.embeddings",'label.result').show()

+--------------------+--------------------+--------------------+
|              result|          embeddings|              result|
+--------------------+--------------------+--------------------+
|[52, main, st., u...|[[-0.2432273, -0....|[O, B-road, B-roa...|
|[203, 738, east, ...|[[-0.1435718, -0....|[B-houseNumber, B...|
+--------------------+--------------------+--------------------+



In [26]:
# Full BERT annotations of the second row (index 1); note the printed output is very long.
training_data.select('bert').limit(10).collect()[1]

Row(bert=[Row(annotatorType='word_embeddings', begin=0, end=2, result='203', metadata={'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '20022', 'token': '203'}, embeddings=[-0.14357179403305054, -0.8473706841468811, -0.6466690301895142, -1.229882836341858, -0.3299340605735779, -0.4688408672809601, 0.7193411588668823, 0.8804854154586792, 0.04408001899719238, 0.9136207699775696, -1.2045234441757202, -1.268717646598816, 0.9925127029418945, -0.22271728515625, -0.17023366689682007, -0.5281392335891724, -0.06581830978393555, -0.20628178119659424, 0.7043640613555908, 0.2120419442653656, -0.634574294090271, 0.8582679033279419, 0.6095923781394958, 0.43341967463493347, -0.691056489944458, -0.8792150020599365, -0.054429858922958374, 0.773775577545166, 1.3422927856445312, -0.7378649115562439, 0.690885603427887, -1.752281665802002, -0.3073691129684448, 0.12105962634086609, -0.7720699310302734, 0.37590888142585754, -0.2895960211753845, -1.4378962516784668, 0.20353388786315918, 0

In [27]:
import numpy as np

# Only the first row is inspected, so fetch just that row instead of collecting ten
# rows of embeddings to the driver and discarding nine of them.
first_row = training_data.select("bert.embeddings").first()
emb_vector = np.array(first_row)

# Shape of the embeddings array for the first training row.
emb_vector.shape

(1, 9, 768)

# Third step

In [18]:
# "Approach" = trainable annotator, "Model" = already-trained annotator.
# NerDLApproach takes three input columns: sentence, token and embeddings.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setLr(0.001)
    .setPo(0.005)
    .setBatchSize(8)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)  # extended validation logs (per-label evaluation)
    .setEnableOutputLogs(True)       # write the training log to the annotator log folder
    .setIncludeConfidence(True)      # attach a confidence score to each prediction
    .setGraphFolder("graph")         # folder where the graph files are looked up
)

# Training pipeline: embeddings first, then the NER trainer.
# NOTE(review): training_data already carries a "bert" column from the earlier transform,
# so this embeddings stage presumably recomputes it -- confirm that is intended.
pipeline2 = Pipeline(stages=[bert_annotator, nerTagger])

# Fourth step

In [19]:
# Fit the training pipeline on at most 1000 rows of the training data.
ner_model = pipeline2.fit(training_data.limit(1000))

In [20]:
# Display the fitted pipeline model.
ner_model

PipelineModel_e7c253dc2c92

In [21]:
# Run the fitted pipeline over the training data and preview the result.
# NOTE(review): these are the rows the model was fitted on, so the comparisons that
# follow are not a held-out evaluation.
predictions = ner_model.transform(training_data)
predictions.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|52 Main St. Unit ...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 1, 52...|[[pos, 0, 1, CD, ...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|203 738 east 29th...|[[document, 0, 23...|[[document, 0, 23...|[[token, 0, 2, 20...|[[pos, 0, 2, CD, ...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--

In [22]:
# Compare tokens, gold labels and predicted labels per row (cells truncated to 40 characters).
predictions.select('token.result','label.result','ner.result').show(truncate=40)

+----------------------------------------+----------------------------------------+-------------------------------------+
|                                  result|                                  result|                               result|
+----------------------------------------+----------------------------------------+-------------------------------------+
|[52, Main, St., Unit, 3a, Toronto, ON...|[O, B-road, B-road, O, B-houseNumber,...|          [O, O, O, O, O, O, O, O, O]|
|          [203, 738, east, 29th, avenue]|   [B-houseNumber, B-road, B-road, O, O]|[B-houseNumber, B-road, B-road, O, O]|
+----------------------------------------+----------------------------------------+-------------------------------------+



In [23]:
import pyspark.sql.functions as F

# Zip the three parallel arrays so each token travels with its gold and predicted label,
# then explode to one row per token. The zipped struct's fields are addressed as '0', '1', '2'.
zipped = F.arrays_zip('token.result', 'label.result', 'ner.result')
per_token = predictions.select(F.explode(zipped).alias("cols"))

comparison_columns = [
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
]
per_token.select(*comparison_columns).show(truncate=False)

+-------+-------------+-------------+
|token  |ground_truth |prediction   |
+-------+-------------+-------------+
|52     |O            |O            |
|Main   |B-road       |O            |
|St.    |B-road       |O            |
|Unit   |O            |O            |
|3a     |B-houseNumber|O            |
|Toronto|O            |O            |
|ON     |O            |O            |
|N6C    |O            |O            |
|4E9    |O            |O            |
|203    |B-houseNumber|B-houseNumber|
|738    |B-road       |B-road       |
|east   |B-road       |B-road       |
|29th   |O            |O            |
|avenue |O            |O            |
+-------+-------------+-------------+



In [24]:
# List the fitted stages: index 0 is the BERT embeddings stage, index 1 the trained NerDL model.
ner_model.stages

[BERT_EMBEDDINGS_abf30dcdf344, NerDLModel_8c9213da4525]

# Fifth step
Save the NER model

In [25]:
# Persist only the trained NER stage (stages[1]), overwriting any earlier save at this path.
ner_model.stages[1].write().overwrite().save('NER_bert_v1')

# Last step - working on test data
Build the prediction pipeline and load our saved model

In [29]:
# Prediction pipeline: raw text -> document -> sentence -> token -> BERT -> NER -> entity spans.

# Wrap the raw "text" column into a document annotation.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into sentences.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

# Split each sentence into tokens.
token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

# The embeddings must be configured exactly like the annotator used for training
# (same checkpoint, case handling and pooling layer); the pooling layer is therefore
# set explicitly here instead of relying on the library default.
bert = BertEmbeddings.pretrained('bert_base_cased', 'en') \
 .setInputCols(["sentence",'token'])\
 .setOutputCol("bert")\
 .setCaseSensitive(False)\
 .setPoolingLayer(0)

# Reload the NER model saved earlier in this notebook.
loaded_ner_model = NerDLModel.load("NER_bert_v1")\
 .setInputCols(["sentence", "token", "bert"])\
 .setOutputCol("ner")

# Turn the per-token NER tags into the "ner_span" output column.
converter = NerConverter()\
  .setInputCols(["document", "token", "ner"])\
  .setOutputCol("ner_span")

ner_prediction_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        token,
        bert,
        loaded_ner_model,
        converter])

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [30]:
# Fit on a one-row frame with empty text to obtain a PipelineModel that can transform new text.
# NOTE(review): presumably no stage learns anything from this frame (the NER model is
# loaded pre-trained) -- confirm.
empty_data = spark.createDataFrame([['']]).toDF("text")
prediction_model = ner_prediction_pipeline.fit(empty_data)

### Example of new address

In [31]:
# Wrap a new example address in a single-row DataFrame with a "text" column.
text = "70 york street"
sample_data = spark.createDataFrame([[text]]).toDF("text")
sample_data.show()

+--------------+
|          text|
+--------------+
|70 york street|
+--------------+



In [35]:
# Run the prediction pipeline on the sample and list the columns it produced.
testpreds = prediction_model.transform(sample_data)

testpreds.columns

['text', 'document', 'sentence', 'token', 'bert', 'ner', 'ner_span']

In [39]:
import pyspark.sql.functions as F

# Pair every token with its predicted tag and print one row per token.
# The zipped struct's fields are addressed as '0' and '1'.
zipped_preds = F.arrays_zip('token.result', 'ner.result')
per_token_preds = testpreds.select(F.explode(zipped_preds).alias("cols"))

prediction_columns = [
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("prediction"),
]
per_token_preds.select(*prediction_columns).show(truncate=False)

+------+----------+
|token |prediction|
+------+----------+
|70    |B-road    |
|york  |B-road    |
|street|O         |
+------+----------+

