## Purpose: Get assertion model predictions for the cancer_imaging_findings entity
Possible assertion status outputs:
- probability high
- probability medium
- probability low
- probability uncertain

### Note: Before running this notebook, please configure the following paths

In [None]:
# Word embeddings: this notebook uses the Spark NLP clinical word-embeddings model.
# Point this at the local folder that contains the downloaded model files.
# (Alternatively the model can be loaded online with .pretrained() instead of from disk.)
embeddings_clinical_local_path = r"path\to\sparknlp_pretrained\embeddings_clinical_en_2.4.0_2.4_1580237286004"

In [None]:
# Path to your Spark NLP *online* licence key JSON (requires an internet connection).
# This notebook was written against v3.4.2.
sparknlp_licence_key = r"..\sparknlp_licence_key\yourkey.json"

# Path to your Spark NLP *offline* licence key JSON, for air-gapped environments.
# This notebook was written against v3.4.2.
sparknlp_airgap_licence_key = r"..\sparknlp_licence_key\yourairgapkey.json"

## Import Libraries

Note: Requires Spark NLP and Spark NLP for Healthcare (licensed version) packages to be installed

In [None]:
import json, os, re, sparknlp, sparknlp_jsl, datetime, time
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import CoNLL
from sparknlp_jsl.annotator import *
from sparknlp_jsl.training import tf_graph
from sparknlp_display import AssertionVisualizer, NerVisualizer 

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

Note: Requires Spark NLP for Healthcare (licensed version) license key

### Start Spark Session (Offline)

In [None]:
# Offline: load the air-gap licence key file (a JSON object of key/value pairs).
with open(sparknlp_airgap_licence_key, encoding="utf-8") as f:
    airgap_license_keys = json.load(f)

# Expose every licence entry both as a notebook-level variable (e.g. SECRET,
# JSL_VERSION) and as an environment variable. globals() is used explicitly:
# it is the namespace later cells read from.
globals().update(airgap_license_keys)
os.environ.update(airgap_license_keys)

# Sanity-check the loaded values. The secret itself is deliberately NOT printed,
# because cell output is stored in the notebook file and is easily shared.
print("SECRET loaded:", bool(airgap_license_keys.get("SECRET")))
print("JSL_VERSION:", airgap_license_keys.get("JSL_VERSION"))
print("PUBLIC_VERSION:", airgap_license_keys.get("PUBLIC_VERSION"))

# Interpreter settings picked up by PySpark when the session starts.
os.environ['PYSPARK_PYTHON'] = 'python'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
print(os.environ['PYSPARK_PYTHON'])
print(os.environ['PYSPARK_DRIVER_PYTHON'])

# Start Spark Session with Custom Params (OFFLINE)
def start(SECRET):
    """Create (or reuse) the local Spark session used for offline inference.

    The session runs in local mode with 16 threads, 16G of driver memory and
    Kryo serialization, and is pointed at the Spark NLP / Spark NLP JSL jars
    stored on the local d: drive.

    Args:
        SECRET: Licence secret. Kept for interface compatibility; the body does
            not read it. The JSL jar file name is built from the notebook-level
            ``JSL_VERSION`` variable instead.

    Returns:
        The ``SparkSession`` returned by ``builder.getOrCreate()``.
    """
    # Raw strings: the Windows paths contain "\c" and "\s", which are invalid
    # escape sequences in a normal string literal (SyntaxWarning on recent
    # Python versions). The resulting value is unchanged.
    local_jars = ", ".join([
        rf"d:\content\spark-nlp-jsl-{JSL_VERSION}.jar",
        r"d:\content\spark-nlp_2.12-3.4.2.jar",
    ])

    builder = SparkSession.builder \
        .appName("Spark NLP Licensed radio_assertion") \
        .master("local[16]") \
        .config("spark.driver.memory", "16G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000M") \
        .config("spark.driver.maxResultSize","4000M") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2") \
        .config("spark.jars", local_jars)

    return builder.getOrCreate()


# Report the installed library versions before the session is created.
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Build (or reuse) the offline Spark session defined by start().
spark = start(SECRET)

# Bare last expression: shows the session summary as the cell output.
spark

### Start Spark Session (Online)

## ------------------- MODEL INFERENCE --------------------

In [None]:
# Name of your trained NER model. It is passed as-is to MedicalNerModel.load(),
# so it must resolve as a path from the notebook's working directory.
radio_ner_model = "clinical_embeddings_5_8_0.001_u0.4o1_train4522"

# Name of your trained assertion model. It is loaded from the ./saved_models/ folder.
assertion_model_name = "radio_assertion_model_20_16_0.001_2023_04_21_16_19_46_train4522"

# 4. Test Data Prediction

In [None]:
# Build the inference pipeline:
#   text -> document -> sentences -> tokens -> embeddings -> NER tags
#        -> NER chunks (cancer_imaging_findings only) -> assertion status

# Wrap the raw "text" column as a document annotation.
document = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

# Split each document into sentences.
sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentences')

# Split each sentence into tokens.
token = Tokenizer()\
        .setInputCols(['sentences'])\
        .setOutputCol('tokens')

# Clinical word embeddings loaded from the local folder configured above.
# load() is called on the class, consistent with the other model loaders below.
words_embedder = WordEmbeddingsModel.load(embeddings_clinical_local_path)\
    .setInputCols(["sentences", "tokens"])\
    .setOutputCol("embeddings")

# Trained NER model that tags tokens with entity labels.
radio_ner_tagger = MedicalNerModel.load(radio_ner_model)\
    .setInputCols(["sentences", "tokens", "embeddings"])\
    .setOutputCol("ner_tags")

# Merge token-level tags into chunks, keeping only the entity of interest.
converter = NerConverter()\
        .setInputCols(["sentences", "tokens", "ner_tags"])\
        .setOutputCol("ner_span")\
        .setWhiteList(["cancer_imaging_findings"])

# Trained assertion model; assigns an assertion status to each NER chunk.
# The path is built with a single separator (previously "./saved_models//<name>").
radiology_assertion = AssertionDLModel.load(f"./saved_models/{assertion_model_name}") \
    .setInputCols(["sentences", "ner_span", "embeddings"]) \
    .setOutputCol("assertion")

ner_assertion_pipeline = Pipeline(stages = [
        document,
        sentence,
        token,
        words_embedder,
        radio_ner_tagger,
        converter,
        radiology_assertion
])

# Every stage is already trained or rule-based, so fitting on a one-row empty
# frame only materialises the PipelineModel.
empty_data = spark.createDataFrame([['']]).toDF("text")

ner_assertion_model = ner_assertion_pipeline.fit(empty_data)

# LightPipeline wrapper used later to annotate plain Python strings.
lmodel = LightPipeline(ner_assertion_model)

In [None]:
## sample

In [None]:
# Sample report text (site of mets). Replace the placeholder below with the
# text you want to annotate.
mtext1 = """
your sample text
"""

In [None]:
# Wrap the sample text in a one-row Spark DataFrame with a single "text" column,
# which is the input column the pipeline's DocumentAssembler reads.
text = mtext1
sample_rows = [[text]]
sample_data = spark.createDataFrame(sample_rows).toDF("text")

# Show the full (untruncated) text, then the column types as the cell output.
sample_data.show(truncate=False)
sample_data.dtypes

In [None]:
# Run the fitted pipeline on the sample data.
preds = ner_assertion_model.transform(sample_data)

# Pair each NER chunk with its metadata and explode to one row per chunk.
zipped_entities = F.explode(F.arrays_zip("ner_span.result","ner_span.metadata")).alias("entities")
chunk_col = F.expr("entities['0']").alias("chunk")
entity_col = F.expr("entities['1'].entity").alias("entity")

# Display up to 50 (chunk, entity) rows without truncating the text.
preds.select(zipped_entities) \
     .select(chunk_col, entity_col) \
     .show(50,truncate=False)

In [None]:
# Pair each NER chunk with its metadata and assertion result, one row per chunk.
zipped_cols = F.explode(F.arrays_zip(preds.ner_span.result,
                                     preds.ner_span.metadata,
                                     preds.assertion.result)).alias("cols")

# Columns of the review table: chunk text, NER label, sentence id, assertion status.
review_columns = [
    F.expr("cols['0']").alias("chunks"),
    F.expr("cols['1']['entity']").alias("ner_label"),
    F.expr("cols['1']['sentences']").alias("sent_id"),
    F.expr("cols['2']").alias("assertion"),
]

# Display up to 50 rows without truncating the text.
preds.select(zipped_cols).select(*review_columns).show(50,truncate=False)

## LightPipeline / Visualisation

In [None]:
#! mkdir display_result

In [None]:
# Annotate the sample text with the LightPipeline and keep the first
# (here: only) result.
full_annotations = lmodel.fullAnnotate(text)
ppres = full_annotations[0]

# Show which annotation outputs are available in the result.
ppres.keys()

In [None]:
# Render the NER chunks of the annotated sample inline.
# NerVisualizer is already imported in the notebook's import cell, so it is
# not re-imported here.
visualiser = NerVisualizer()
visualiser.display(ppres, label_col='ner_span', document_col='document')

In [None]:
# Render the NER chunks together with their assertion status inline.
# 'ner_span' and 'assertion' are the pipeline output columns holding the
# chunks and their assertion results.
assertion_vis = AssertionVisualizer()
assertion_vis.display(ppres, 'ner_span', 'assertion')

## Get predictions for samples.csv

In [None]:
# Load only the columns needed for inference.
# Change the column names accordingly to suit your dataset.
df_text = pd.read_csv("./inference/samples.csv", usecols=['sn_report_number', 'report_date','findings','conclusion'])
# Per-column count of non-null values.
df_text.count()

In [None]:
# Preview the first two reports.
df_text.head(2)

In [None]:
# Count missing values per column to spot reports with no text.
df_text.isnull().sum()

In [None]:
# Replace missing conclusions with an empty string so that every row holds a
# string before it is passed to the annotation pipeline.
df_text['conclusion'] = df_text['conclusion'].fillna('')

In [None]:
# Annotate the conclusion text of every report and save, for review:
#   * one assertion-visualisation HTML file per report, and
#   * one combined CSV with a row per detected (chunk, entity, assertion status).
output_dir = "./inference/display_result"
# DataFrame.to_csv does not create missing folders (and the visualiser
# presumably does not either), so make sure the output folder exists.
os.makedirs(output_dir, exist_ok=True)

columns = ['sn_report_number', 'report_date','entity_index', 'entity','chunk','assertion_status']

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on every iteration.
annotation_records = []

# Iterate over row values directly instead of range(count()) + .loc[i], which
# depended on the non-null count and on a default 0..n-1 index.
report_fields = zip(df_text['sn_report_number'], df_text['report_date'], df_text['conclusion'])
for i, (report_number, report_date, conclusion) in enumerate(report_fields):
    print(i)
    # A missing conclusion is read as NaN (a float); annotate it as empty text.
    report_text = conclusion if isinstance(conclusion, str) else ""
    ppres = lmodel.fullAnnotate(report_text)[0]
    # The f-string formats report_number whatever its type; read_csv infers an
    # integer column for numeric report numbers, which broke "+" concatenation.
    assertion_vis.display(ppres, 'ner_span', 'assertion',
                          save_path=f"{output_dir}/{report_number}_report.html")
    # One CSV row per chunk; entity_index is the chunk's position within its report.
    for entity_index, (span, assertion) in enumerate(zip(ppres['ner_span'], ppres['assertion'])):
        annotation_records.append({
            'sn_report_number': report_number,
            'report_date': report_date,
            'entity_index': entity_index,
            'entity': span.metadata['entity'],
            'chunk': span.result,
            'assertion_status': assertion.result,
        })

# Passing columns= keeps the header well-defined even when nothing was detected.
annotation_df = pd.DataFrame(annotation_records, columns=columns)
annotation_df.to_csv(f"{output_dir}/sample_ner_assertion.csv", columns=columns, index=False)