# Import 

In [3]:
import sparknlp
spark=sparknlp.start()
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.sql.functions as F
import re
from pyspark.sql import Row

spark=sparknlp.start()


# Create inital spark dataframe by reading a openaddress CSV sample

In [20]:
df=spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').option("encoding", "utf-8").load('sample_of_toronto.csv')
df.show()

+-----------+----------+------+----------------+----+---------+--------+------+--------+----+----------------+
|        LON|       LAT|NUMBER|          STREET|UNIT|     CITY|DISTRICT|REGION|POSTCODE|  ID|            HASH|
+-----------+----------+------+----------------+----+---------+--------+------+--------+----+----------------+
|-79.5442514| 43.593789|    22|Lloyd George Ave|null|Etobicoke|    null|  null|    null|null|4609176c08c67d96|
|-79.5434954|43.5934445|     3|Lloyd George Ave|null|Etobicoke|    null|  null|    null|null|8acf99afdb870ad6|
|-79.5435775|43.5936221|    7A|Lloyd George Ave|null|Etobicoke|    null|  null|    null|null|239c832319e298e7|
|-79.5465666|43.5962026|    58|        Foch Ave|null|Etobicoke|    null|  null|    null|null|2bb9aab1d601c207|
|-79.5464106|43.5959935|    54|        Foch Ave|null|Etobicoke|    null|  null|    null|null|c33e434ceba1c9fb|
|-79.5465831|43.5963137|    60|        Foch Ave|null|Etobicoke|    null|  null|    null|null|f037b9cfaead8162|
|

In [21]:
def make_address_dataframe(df):
    columns_to_drop = ['LON', 'LAT', 'HASH', 'ID', 'DISTRICT', 'REGION']
    df = df.drop(*columns_to_drop)
    df = df.withColumnRenamed('NUMBER', 'house_number')
    df=df.withColumnRenamed('STREET', 'road')
    df=df.withColumnRenamed('UNIT', 'unit')
    df=df.withColumnRenamed('CITY', 'city')
    df=df.withColumnRenamed('POSTCODE', 'postcode')

    return df
df=make_address_dataframe(df)
df.show()

+------------+----------------+----+---------+--------+
|house_number|            road|unit|     city|postcode|
+------------+----------------+----+---------+--------+
|          22|Lloyd George Ave|null|Etobicoke|    null|
|           3|Lloyd George Ave|null|Etobicoke|    null|
|          7A|Lloyd George Ave|null|Etobicoke|    null|
|          58|        Foch Ave|null|Etobicoke|    null|
|          54|        Foch Ave|null|Etobicoke|    null|
|          60|        Foch Ave|null|Etobicoke|    null|
|          62|        Foch Ave|null|Etobicoke|    null|
|          64|        Foch Ave|null|Etobicoke|    null|
|          46|    Jellicoe Ave|null|Etobicoke|    null|
|          44|    Jellicoe Ave|null|Etobicoke|    null|
|          11|    Jellicoe Ave|null|Etobicoke|    null|
|           9|    Jellicoe Ave|null|Etobicoke|    null|
|           7|    Jellicoe Ave|null|Etobicoke|    null|
|          12|    Jellicoe Ave|null|Etobicoke|    null|
|          17|         Owen Dr|null|Etobicoke|  

# Create text, text_token, and label for the df

In [49]:
def text_and_label_maker(df):

   

    def clean_NULL(a):
        a=a.split(" //// ")
        while "NULL" in a:
            a.remove("NULL")
            
        while '[NULL]' in a:
            a.remove("[NULL]")
        a=" //// ".join(a)
        return a
    
    def split(a):
        address=[]
        a=a.split(' //// ')
        for i in range(len(a)):
            
            if len(a[i].split(" "))>1:
                b=a[i].split(" ")
                for i in range(len(b)):
                    address.append(b[i])
            else:
                address.append(a[i])
        return address
    
    def remove_annotation(a):
        a="".join(re.sub(re.compile(r'\s+'), '', a).split("////"))
        return a
    

    def unit_func(u):
        a=[]
        if int(u)>1:
            a.append("B-UNIT")
            for i in range(1,int(u)):
                a.append("I-UNIT")
        elif int(u)>0:
            a.append("B-UNIT")
        else:
            a.append('NULL')
        return a

    def house_func(h):
        a=[]
        if int(h)>1:
            a.append("B-House_number")
            for i in range(1,int(h)):
                a.append("I-House_number")
        elif int(h)>0:
            a.append("B-House_number")
        else:
            a.append('NULL')
        return a

    def road_func(r):
        a=[]
        if int(r)>1:
            a.append("B-Street")
            for i in range(1,int(r)):
                a.append("I-Street")
        elif int(r)>0:
            a.append("B-Street")
        else:
            a.append('NULL')
        return a
                     
    def post_func(p):
        a=[]
        if int(p)>1:
            a.append("B-Postcode")
            for i in range(1,int(p)):
               a.append("I-Postcode")
        elif int(p)>0:
            a.append("B-Postcode")
        else:
            a.append('NULL')
        return a
                     
    def city_func(c):
            a=[]
            if int(c)>1:
                a.append("B-City")
                for i in range(1,int(c)):
                     a.append("I-City")
            elif int(c)>0:
                     a.append("B-City")
            else:
                a.append('NULL')
            return a
    
    def label(a):
            a=str(a).replace(']','')
            a=str(a).replace('[','')
            a=str(a).replace(',',' ')
            a=str(a).replace("'","")
            a=a.split(' //// ')
            a=" ".join(a)
            return a
    
    #udf_functions
    concat_udf = F.udf(lambda cols: " //// ".join([x if x is not None else "NULL" for x in cols]), StringType())
    NULL_udf = F.udf(lambda address: clean_NULL(address))
    split_udf=F.udf(lambda address: split(address))
    len_token_udf=F.udf(lambda x: len(str(x).split(' ')) if x is not None else 0)
    remove_annotation_udf=F.udf(lambda x:" ".join(x.split("////")))
    tagging1_udf=F.udf(lambda u: unit_func(u))
    tagging2_udf=F.udf(lambda h: house_func(h))
    tagging3_udf=F.udf(lambda r: road_func(r))
    tagging4_udf=F.udf(lambda p: post_func(p))
    tagging5_udf=F.udf(lambda c: city_func(c))
    label_udf=F.udf(lambda l: label(l))
    
    
    
    df=df.withColumn("text_with_null", concat_udf(F.array("unit", "house_number", "road","postcode","city")))
    df=df.withColumn("annotated_text", NULL_udf("text_with_null")).drop("text_with_null")
    df=df.withColumn("text_tokens", split_udf("annotated_text"))
    df=df.withColumn("text", remove_annotation_udf("annotated_text"))
    df=df.withColumn("unitl", len_token_udf("unit"))
    df=df.withColumn("house_numberl", len_token_udf("house_number"))
    df=df.withColumn("postcodel", len_token_udf("postcode"))
    df=df.withColumn("roadl", len_token_udf("road"))
    df=df.withColumn("cityl", len_token_udf("city"))
    df=df.withColumn("unit_taggedTokens", tagging1_udf('unitl')).drop('unitl')
    df=df.withColumn("house_number_taggedTokens", tagging2_udf('house_numberl')).drop('house_numberl')
    df=df.withColumn("road_taggedTokens", tagging3_udf('roadl')).drop('roadl')
    df=df.withColumn("postcode_taggedTokens", tagging4_udf('postcodel')).drop('postcodel')
    df=df.withColumn("city_taggedTokens", tagging5_udf('cityl')).drop('cityl')              
    df=df.withColumn("concat_label_with_null", concat_udf(F.array("unit_taggedTokens", "house_number_taggedTokens", "road_taggedTokens", "postcode_taggedTokens","city_taggedTokens")))
    df=df.withColumn("concat_label", NULL_udf("concat_label_with_null")).drop("concat_label_with_null")
    df=df.withColumn("label",label_udf("concat_label")).drop("concat_label")
    return df


df=text_and_label_maker(df)

print(df.select("text").limit(1).collect())
print(df.select("text_tokens").limit(1).collect())
print(df.select("label").limit(1).collect())
df.show(20)


[Row(text='22   Lloyd George Ave   Etobicoke')]
[Row(text_tokens='[22, Lloyd, George, Ave, Etobicoke]')]
[Row(label='B-House_number B-Street  I-Street  I-Street B-City')]
+------------+----------------+----+---------+--------+--------------------+--------------------+-----------------+-------------------------+--------------------+---------------------+-----------------+--------------------+--------------------+--------------------+
|house_number|            road|unit|     city|postcode|         text_tokens|                text|unit_taggedTokens|house_number_taggedTokens|   road_taggedTokens|postcode_taggedTokens|city_taggedTokens|              label2|               label|      annotated_text|
+------------+----------------+----+---------+--------+--------------------+--------------------+-----------------+-------------------------+--------------------+---------------------+-----------------+--------------------+--------------------+--------------------+
|          22|Lloyd George Ave|

# provide annotation requiring for NerDL approach

### Label annotation

In [75]:
def createAnnotation(token,label,text):
    lastBegin =0
    i=0
    data=[]
    
    text_tokens=token.replace(']','')
    text_tokens=text_tokens.replace('[','')
    text_tokens=text_tokens.split(',')
    tags=label.split(" ")
    while "" in tags:
        tags.remove("")
    
    
    for i in range(len(text_tokens)):
            a=Row(
                  annotatorType="named_entity",
                  begin=lastBegin,
                  end=lastBegin + len(text_tokens[i]) - 1,
                  result=tags[i],
                  metadata={'word': text_tokens[i]},
                  embeddings=[0.00]
                  )
                
                
            lastBegin += len(text_tokens[i])+1
            data.append(a)

    
    return {'text':text,'label':data}





### document, sentence, tokenizer, and pose annotation

In [76]:
def get_formatting_model():
    document = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")
    
    sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentence')
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")

    pos = PerceptronModel.pretrained() \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("pos")
    
    
    formatting_pipeline = Pipeline(
                                   stages = [
                                             document,
                                             sentence,
                                             tokenizer,
                                             pos
                                             ]
                                   )
    empty_data = spark.createDataFrame([['']]).toDF("text")
    formatting_model = formatting_pipeline.fit(empty_data)
    return formatting_model



# Get the final dataframe ready to pass to NerDLApproach()

In [79]:
def format(df):
    df=df.select(['text_tokens', 'text','label'])
    data_rdd = df.rdd.map(lambda row: row.asDict())
    data_rdd = data_rdd.map(lambda x: createAnnotation(x['text_tokens'],x['label'],x['text']))
    Schema = StructType([StructField("text", StringType(), False),
                        StructField('label',ArrayType(
                        StructType([
                                   StructField("annotatorType", StringType(), False),
                                   StructField("begin", IntegerType(), False),
                                   StructField("end", IntegerType(), False),
                                   StructField("result", StringType(), False),
                                   StructField("metadata",  MapType(StringType(), StringType())),
                                   StructField("embeddings",  ArrayType(FloatType()), False)
                                   ])))])
    
    data = spark.createDataFrame(data_rdd, schema=Schema)
    formatting_model=get_formatting_model()
    training_data=formatting_model.transform(data)
    return training_data
training_data=format(df)
training_data.show()

pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|               label|            document|            sentence|               token|                 pos|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|22   Lloyd George...|[[named_entity, 0...|[[document, 0, 32...|[[document, 0, 32...|[[token, 0, 1, 22...|[[pos, 0, 1, CD, ...|
|3   Lloyd George ...|[[named_entity, 0...|[[document, 0, 31...|[[document, 0, 31...|[[token, 0, 0, 3,...|[[pos, 0, 0, CD, ...|
|7A   Lloyd George...|[[named_entity, 0...|[[document, 0, 32...|[[document, 0, 32...|[[token, 0, 1, 7A...|[[pos, 0, 1, CD, ...|
|58   Foch Ave   E...|[[named_entity, 0...|[[document, 0, 24...|[[document, 0, 24...|[[token, 0, 1, 58...|[[pos, 0, 1, CD, ...|
|54   Foch A

In [80]:
training_data.printSchema()

root
 |-- text: string (nullable = false)
 |-- label: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = false)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = false)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = false)
 |    |    |    |-- element: float (containsNull = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true

# Building Ner Pipeline

In [84]:
bert_annotator = BertEmbeddings.pretrained('bert_base_cased', 'en') \
 .setInputCols(["sentence",'token'])\
 .setOutputCol("bert")\
 .setCaseSensitive(False)\
 .setPoolingLayer(0)

bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


In [85]:
training_data = bert_annotator.transform(training_data)
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|               label|            document|            sentence|               token|                 pos|                bert|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|22   Lloyd George...|[[named_entity, 0...|[[document, 0, 32...|[[document, 0, 32...|[[token, 0, 1, 22...|[[pos, 0, 1, CD, ...|[[word_embeddings...|
|3   Lloyd George ...|[[named_entity, 0...|[[document, 0, 31...|[[document, 0, 31...|[[token, 0, 0, 3,...|[[pos, 0, 0, CD, ...|[[word_embeddings...|
|7A   Lloyd George...|[[named_entity, 0...|[[document, 0, 32...|[[document, 0, 32...|[[token, 0, 1, 7A...|[[pos, 0, 1, CD, ...|[[word_embeddings...|
|58   Foch Ave   E...|[[named_entity, 0...|[[document, 0, 24...|[[document, 0, 24...|[[token, 0, 1, 58...|

In [87]:
nerTagger = NerDLApproach()\
  .setInputCols(["sentence", "token", "bert"])\
  .setLabelColumn("label")\
  .setOutputCol("ner")\
  .setMaxEpochs(1)\
  .setLr(0.001)\
  .setPo(0.005)\
  .setBatchSize(8)\
  .setRandomSeed(0)\
  .setVerbose(1)\
  .setValidationSplit(0.2)\
  .setEvaluationLogExtended(True) \
  .setEnableOutputLogs(True)\
  .setIncludeConfidence(True)\
  

NER_pipeline = Pipeline(
    stages = [
    bert_annotator,
    nerTagger
  ])

Ner_model = NER_pipeline.fit(training_data)

In [88]:
import pyspark.sql.functions as F
predictions = Ner_model.transform(training_data)

predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction")).show(truncate=False)

+---------+--------------+--------------+
|token    |ground_truth  |prediction    |
+---------+--------------+--------------+
|22       |B-House_number|B-House_number|
|Lloyd    |B-Street      |O             |
|George   |I-Street      |O             |
|Ave      |I-Street      |O             |
|Etobicoke|B-City        |O             |
|3        |B-House_number|B-House_number|
|Lloyd    |B-Street      |O             |
|George   |I-Street      |O             |
|Ave      |I-Street      |O             |
|Etobicoke|B-City        |O             |
|7A       |B-House_number|B-House_number|
|Lloyd    |B-Street      |O             |
|George   |I-Street      |O             |
|Ave      |I-Street      |O             |
|Etobicoke|B-City        |O             |
|58       |B-House_number|B-House_number|
|Foch     |B-Street      |O             |
|Ave      |I-Street      |O             |
|Etobicoke|B-City        |O             |
|54       |B-House_number|B-House_number|
+---------+--------------+--------