## Purpose: Get Radio NER Model Prediction

### Note: Before running this notebook, please configure the following paths

In [None]:
# Local folder of the downloaded Spark NLP clinical word-embeddings model.
# It is passed to WordEmbeddingsModel.load() when the prediction pipeline is built;
# in an online session .pretrained() can be used instead to download the model.
embeddings_clinical_local_path = r"path\to\sparknlp_pretrained\embeddings_clinical_en_2.4.0_2.4_1580237286004"

# Local folder of the downloaded JSL radiology assertion model
# (intended for .load() in an air-gapped session; .pretrained() is the online alternative).
jsl_radiology_assertion = r"path\to\sparknlp_pretrained\assertion_dl_radiology_en_2.7.4_2.4_1616071311532"

In [None]:
# Path to the Spark NLP licence JSON for online sessions (an internet connection is required).
# This notebook was written against v3.4.2.
sparknlp_licence_key = r"..\sparknlp_licence_key\yourkey.json"

# Path to the Spark NLP licence JSON for offline (air-gapped) sessions, also v3.4.2.
# This is the file opened when the licence is loaded before starting the Spark session.
sparknlp_airgap_licence_key = r"..\sparknlp_licence_key\yourairgapkey.json"

# 1. Import Libraries

Note: Requires the Spark NLP, Spark NLP for Healthcare (licensed), and Spark NLP Display packages to be installed.

In [None]:
import json, os, re, sparknlp, sparknlp_jsl, ner_log_parser, datetime, time
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import CoNLL
from sparknlp_jsl.annotator import *
from sparknlp_jsl.training import tf_graph
from sparknlp_display import AssertionVisualizer, NerVisualizer 

from sklearn.metrics import classification_report, accuracy_score

# 2. Start Spark Session (OFFLINE for inference)

In [None]:
# Offline-Load airgap license key
with open(sparknlp_airgap_licence_key) as f:
    airgap_license_keys = json.load(f)
    
# Defining license key-value pairs as local variables
locals().update(airgap_license_keys)

# Adding license key-value pairs to environment variables, use this (17-Aug-2022)
os.environ['SPARK_NLP_LICENSE'] = airgap_license_keys['SPARK_NLP_LICENSE']

# check variable
#!echo $SECRET
!echo $JSL_VERSION
!echo $PUBLIC_VERSION
#!echo $SPARK_NLP_LICENSE

In [None]:
# put your downloaded spark nlp jar files in a local folder, eg d:\content
# note path cannot be too long, else there will be a java error on package not callable

def start(SECRET):
    """Create (or reuse) the local Spark session used for licensed Spark NLP inference.

    Args:
        SECRET: Licence secret. Kept so existing ``start(SECRET)`` calls keep working;
            it is not read inside this function.

    Returns:
        The session returned by ``SparkSession.builder...getOrCreate()``.

    Note:
        Reads the notebook-level ``JSL_VERSION`` variable to select the Healthcare jar.
    """
    # Raw f-string: "\c" and "\s" in these Windows paths are invalid escape sequences in a
    # normal string literal (a warning today, an error in future Python versions).
    # The resulting value is unchanged.
    local_jars = rf"d:\content\spark-nlp-jsl-{JSL_VERSION}.jar, d:\content\spark-nlp_2.12-3.4.2.jar"

    # NOTE(review): spark.jars.packages presumably needs the Maven artefact to be resolvable
    # (e.g. from a local Ivy cache) in an air-gapped environment - confirm.
    builder = SparkSession.builder \
        .appName("Spark NLP Licensed radio ner") \
        .master("local[16]") \
        .config("spark.driver.memory", "16G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000M") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2") \
        .config("spark.jars", local_jars)

    return builder.getOrCreate()


# Report the installed library versions before the session is created.
for version_label, library in (
    ("Spark NLP Version :", sparknlp),
    ("Spark NLP_JSL Version :", sparknlp_jsl),
):
    print(version_label, library.version())

# Start the session with the licence secret loaded earlier, then show it as the cell output.
spark = start(SECRET)

spark

# 3. Create NER Prediction Pipeline Model

In [None]:
# Folder name of the best NER model saved during NER training;
# it is resolved under ./saved_models/ when the prediction pipeline is built.
best_ner_model = "clinical_embeddings_1_8_0.001_u0.3o1_train4522"

## Prediction Pipeline - with jsl assertion status detection

In [None]:
# Build the inference pipeline:
# text -> document -> sentences -> tokens -> embeddings -> NER tags -> NER chunks -> assertion status
document = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentence')

token = Tokenizer()\
        .setInputCols(['sentence'])\
        .setOutputCol('token')

# use .pretrained() for sparknlp online session
# use .load() for sparknlp airgap session
clinical_embeddings = WordEmbeddingsModel.load(embeddings_clinical_local_path)\
    .setInputCols(["sentence","token"])\
    .setOutputCol("embeddings")

# load the best ner model saved after training
loaded_ner_model = MedicalNerModel.load("./saved_models/" + best_ner_model)\
        .setInputCols(["sentence", "token", "embeddings"])\
        .setOutputCol("ner")

# merge the token-level NER tags into entity chunks
converter = NerConverter()\
        .setInputCols(["sentence", "token", "ner"])\
        .setOutputCol("ner_span")

# using jsl radiology assertion model for all ner entities
# use .pretrained() for sparknlp online session, e.g.
#   AssertionDLModel.pretrained("assertion_dl_radiology", "en", "clinical/models")
# use .load() for sparknlp airgap session
# The model folder comes from the jsl_radiology_assertion setting configured at the top of
# the notebook (same convention as the embeddings stage), not from a hard-coded relative path.
radiology_assertion = AssertionDLModel.load(jsl_radiology_assertion)\
    .setInputCols(["sentence", "ner_span", "embeddings"]) \
    .setOutputCol("assertion")

ner_prediction_pipeline = Pipeline(stages = [
        document,
        sentence,
        token,
        clinical_embeddings,
        loaded_ner_model,
        converter,
        radiology_assertion
])

# Every stage is already trained/loaded, so fitting on a one-row empty frame
# only turns the Pipeline into a PipelineModel.
empty_data = spark.createDataFrame([['']]).toDF("text")

ner_prediction_model = ner_prediction_pipeline.fit(empty_data)

## Sample prediction

In [None]:
# Placeholder input: replace the text between the triple quotes with the report text to annotate.
text1 = """
your sample text
"""

In [None]:
# Wrap the sample text in a one-row Spark DataFrame whose single column is named "text",
# the input column read by the DocumentAssembler.
text = text1
sample_rows = [[text]]
sample_data = spark.createDataFrame(sample_rows).toDF("text")

# Preview the row, then show the column types as the cell output.
sample_data.show()
sample_data.dtypes

In [None]:
# Run the fitted pipeline on the sample frame.
preds = ner_prediction_model.transform(sample_data)

# Pair each chunk text with its metadata, one row per chunk, then show chunk and entity label.
zipped_entities = F.explode(F.arrays_zip("ner_span.result","ner_span.metadata")).alias("entities")
chunk_column = F.expr("entities['0']").alias("chunk")
entity_column = F.expr("entities['1'].entity").alias("entity")

preds.select(zipped_entities).select(chunk_column, entity_column).show(truncate=False)

## LightPipeline / Visualisation

In [None]:
# Wrap the fitted pipeline in a LightPipeline and annotate the sample text directly.
lmodel = LightPipeline(ner_prediction_model)
sample_annotations = lmodel.fullAnnotate(text)

# Keep the first result and list the output columns it contains.
ppres = sample_annotations[0]
ppres.keys()

In [None]:
# Inspect the NER chunk annotations produced for the sample text.
ppres['ner_span']

In [None]:
# assertion detection: line up every NER chunk with its assertion result
chunk_assertion_pairs = list(zip(ppres['ner_span'], ppres['assertion']))

chunks = [span.result for span, _ in chunk_assertion_pairs]
entities = [span.metadata['entity'] for span, _ in chunk_assertion_pairs]
status = [assertion.result for _, assertion in chunk_assertion_pairs]

# One row per chunk: text, entity label, assertion status.
df = pd.DataFrame({'chunks':chunks, 'entities':entities, 'assertion':status})
df.head(20)

In [None]:
# Render the annotated entities of the sample text and keep copies on disk for review.
# NerVisualizer is already imported in the import cell at the top of the notebook.
visualiser = NerVisualizer()

# Create the output folder up front; writing the CSV raises if the folder is missing.
os.makedirs("./inference/display_result", exist_ok=True)

# Display the highlighted text in the notebook and save the same view as HTML.
visualiser.display(ppres, label_col='ner_span', document_col='document', save_path="./inference/display_result/display_result.html")
# Save the chunk / entity / assertion table built in the previous cell.
df.to_csv("./inference/display_result/display_result.csv")

## Get predictions from an input CSV file

In [None]:
# Adjust the file name and the column list to match your CSV file.
# The file must contain a text column "conclusion".
wanted_columns = ['sn_report_number','report','report_date','conclusion']
df_text = pd.read_csv("./inference/50samples.csv", usecols=wanted_columns)

# Non-null count per column.
df_text.count()

In [None]:
# Preview the first two rows to confirm the expected columns were read.
df_text.head(2)

In [None]:
# Count missing values per column; nulls in "conclusion" are replaced with empty strings in the next cell.
df_text.isnull().sum()

In [None]:
# Replace missing conclusions with empty strings before annotation.
conclusion_no_nulls = df_text['conclusion'].fillna('')
df_text['conclusion'] = conclusion_no_nulls

In [None]:
# Annotate every report conclusion with the LightPipeline:
#   * save the NER visualisation of each report to its own HTML file for review
#   * collect the entities of all reports into one CSV file for review
columns = ['sn_report_number', 'report_date','entity_index', 'entity','chunk','assertion_status']

output_dir = "./inference/display_result/"
# Create the output folder up front so the HTML/CSV writes do not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

report_frames = []
# Iterate over the rows themselves: sizing the loop with a column count() would skip
# null report numbers and silently drop trailing rows.
for i, row in enumerate(df_text.itertuples(index=False)):
    print(i)
    ppres = lmodel.fullAnnotate(row.conclusion)[0]

    # str(): read_csv may have parsed the report number as an integer,
    # which cannot be concatenated with a string.
    html_path = output_dir + str(row.sn_report_number) + "_report.html"
    visualiser.display(ppres, label_col='ner_span', document_col='document', save_path=html_path)

    # output to csv: one row per NER chunk, paired with its assertion result
    chunk_assertion_pairs = list(zip(ppres['ner_span'], ppres['assertion']))
    temp_df = pd.DataFrame({
        'sn_report_number': row.sn_report_number,
        'report_date': row.report_date,
        'chunk': [span.result for span, _ in chunk_assertion_pairs],
        'entity': [span.metadata['entity'] for span, _ in chunk_assertion_pairs],
        'assertion_status': [assertion.result for _, assertion in chunk_assertion_pairs],
    })
    # Position of the entity within its report (0-based).
    temp_df['entity_index'] = temp_df.index

    # Reports without entities contribute no rows, so they are left out of the concat.
    if not temp_df.empty:
        report_frames.append(temp_df)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the accumulated frame on every iteration.
if report_frames:
    annotation_df = pd.concat(report_frames, ignore_index=True)
else:
    # No entities at all: still write a CSV containing only the header row.
    annotation_df = pd.DataFrame(columns=columns)

annotation_df.to_csv("./inference/display_result/pred_radio_ner_annotation.csv", columns=columns, index=False)