## Purpose: To train/evaluate a custom Radio NER model based on Annotated Radiology Reports Conclusion Text (using SparkNLP library v3.4.2)

In [None]:
# uncomment to run to create the subfolders, for the first time
#!mkdir graph saved_models ner_output ner_result ner_logs_ncc inference

### Note: Before running this notebook, please configure the following paths

In [None]:
# Local folder holding the downloaded Spark NLP clinical word-embeddings model.
# Alternatively, load the model online with .pretrained(...) during training.
embeddings_clinical_local_path = r"path\to\sparknlp_pretrained\embeddings_clinical_en_2.4.0_2.4_1580237286004"

# Local folder holding the downloaded JSL radiology assertion model.
# Alternatively, load the model online with .pretrained(...).
# (raw string: every backslash is literal, so a single "\" separates each path segment)
jsl_radiology_assertion = r"path\to\sparknlp_pretrained\assertion_dl_radiology_en_2.7.4_2.4_1616071311532"

In [None]:
# Path to your Spark NLP licence key file for ONLINE sessions (needs an internet connection).
# This notebook uses Spark NLP v3.4.2.
sparknlp_licence_key = r"..\sparknlp_licence_key\yourkey.json"

# Path to your Spark NLP licence key file for OFFLINE (air-gapped) sessions.
# This notebook uses Spark NLP v3.4.2.
sparknlp_airgap_licence_key = r"..\sparknlp_licence_key\yourairgapkey.json"

In [None]:
# Train/test settings.
# Paths are Windows-style; every backslash is escaped explicitly so that no
# invalid escape sequence (such as "\c") is left in a non-raw string literal.
data_folder = "dataset"
train_folder = data_folder + "\\02conll\\conll_train"
test_file = data_folder + "\\02conll\\conll_test\\test4522.txt"
# Common prefix of the training files: <train_folder>\<dataset_name>_<resample ratio>.txt
dataset_name = "train4522"


In [None]:
#!dir $train_folder

In [None]:
#!dir $test_file

## Train/Evaluate NER Model

Note: Requires Spark NLP and Spark NLP for Healthcare (licensed version) packages to be installed

### Import Libraries

In [None]:
import json, os, re, sparknlp, sparknlp_jsl, ner_log_parser, datetime, time
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import CoNLL
from sparknlp_jsl.annotator import *
from sparknlp_jsl.training import tf_graph
from sparknlp_display import AssertionVisualizer, NerVisualizer 

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
%matplotlib inline

### Start Spark Session (Online with internet)
- use this only if you need to download new models with .pretrained(..)
- after that, put the new models' jar in local path, and load it using .load(..)

### Start Spark Session (OFFLINE)

In [None]:
# Offline: load the air-gap licence file (a JSON object of key/value pairs)
with open(sparknlp_airgap_licence_key) as f:
    airgap_license_keys = json.load(f)

# Expose every licence entry as a notebook-level variable (e.g. SECRET, used when
# starting the session below) and as an environment variable
globals().update(airgap_license_keys)
os.environ.update(airgap_license_keys)

# Sanity check of the loaded values.
# The secret itself is deliberately NOT printed, so that it cannot leak into the
# saved notebook output; only whether it was found is reported.
print("SECRET loaded:", bool(airgap_license_keys.get("SECRET")))
print("JSL_VERSION:", airgap_license_keys.get("JSL_VERSION"))
print("PUBLIC_VERSION:", airgap_license_keys.get("PUBLIC_VERSION"))

Note: Requires Spark NLP for Healthcare (licensed version) license key

In [None]:
# put your downloaded spark nlp jar files in a local folder, eg d:\content
# note path cannot be too long, else there will be a java error on package not callable

def start(SECRET, jars=r"d:\content\spark-nlp-jsl-3.4.2.jar, d:\content\spark-nlp_2.12-3.4.2.jar"):
    """Create (or reuse) the local Spark session used by this notebook.

    Parameters
    ----------
    SECRET : str
        Licence secret. It is not read inside this function; the parameter is
        kept so that existing ``start(SECRET)`` calls keep working.
    jars : str, optional
        Comma-separated list of local jar files passed to ``spark.jars``.
        Change the default, or pass your own value, to match where the
        Spark NLP jars were downloaded.

    Returns
    -------
    SparkSession
        The session returned by ``builder.getOrCreate()``.
    """
    builder = (
        SparkSession.builder
        .appName("Spark NLP Licensed radio ner")
        .master("local[16]")
        .config("spark.driver.memory", "60G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2")
        .config("spark.jars", jars)  ## change this (or pass `jars=`)
    )

    return builder.getOrCreate()


# Report the installed library versions before creating the session
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Create (or reuse) the Spark session; SECRET comes from the licence file loaded above
spark = start(SECRET)

# Display the session summary as the cell output
spark

### Build a Tensorflow Graph

Note: Depending on the number of characters (nchars), a TensorFlow graph may need to be built before the NER model training can take place.

In [None]:
# uncomment to run for the first time
#tf_graph.build("ner_dl", build_params={"embeddings_dim": 200, "nchars":88, "ntags": 60, "is_medical": 1}, model_location="./graph/medical_ner_graphs", model_filename="auto")

### Specify training sets to use
- For the initial run, you can use all training sets to see the performance of each set; this takes about 15 hours.
- After that, if you only need to rerun certain training sets, specify the params accordingly.

In [None]:
# Resampling-ratio tags of the training sets to run; each tag selects the file
# <train_folder>\<dataset_name>_<tag>.txt.
# Use a short list like this to test-run the pipeline or to rerun specific training sets.
input_resample_params = ['u0.3o1']

In [None]:
# Preview the training file path that each selected resampling ratio resolves to
for resample_ratio in input_resample_params:
    training_filepath = f"{train_folder}\\{dataset_name}_{resample_ratio}.txt"
    print(training_filepath)

### check conll file before training
- to confirm conll file is not empty

In [None]:
# Load every selected CoNLL training file and show its label distribution,
# so that an empty or unreadable file is noticed before training starts.
for resample_ratio in input_resample_params:
    training_filepath = f"{train_folder}\\{dataset_name}_{resample_ratio}.txt"

    ###### Data Preparation
    print('...filename: ', training_filepath)
    training_data = CoNLL().readDataset(spark, training_filepath)

    print("...Conll Dataset Count: " + str(training_data.count()))

    # One row per (token, label) pair, then count the rows per label
    token_label_pairs = training_data.select(
        F.explode(F.arrays_zip('token.result', 'label.result')).alias("cols")
    ).select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    label_counts = token_label_pairs.groupBy('ground_truth').count().orderBy('count', ascending=False)
    label_counts.show(100, truncate=False)

In [None]:
# Display the test file path currently configured
test_file

In [None]:
# Check that the test file can be read.
# If a Py4JJavaError is raised, open the file in Notepad, copy all of its content into a
# new text file and save it as testxxxv2.txt, then point test_file at that copy (as done here).
# NOTE: this overrides the test_file configured at the top of the notebook.
test_file = data_folder + "\\02conll\\conll_test\\test4522v2.txt"
test_data = CoNLL().readDataset(spark, test_file)
test_data

## ------------------- START OF TRAINING  --------------------

In [None]:
# Create the output DataFrame that collects one row of performance metrics per training run
output_df = pd.DataFrame(columns = ['model', 'epoch', 'learning_rate', 'batch_size', 'resampling_ratio', 'start_time', 'end_time', 'duration', 'overall_accuracy','class_accuracy', 'classification_report'])

# Test CoNLL data. It is read once and never reassigned below, so every run
# evaluates against the same, unmodified test set.
test_data = CoNLL().readDataset(spark, test_file)

for resample_ratio in input_resample_params:
    train_file = train_folder+"\\"+dataset_name + "_"+str(resample_ratio) + ".txt"

    ###### Data Preparation
    training_data = CoNLL().readDataset(spark, train_file)

    print("Training Dataset Count: " + str(training_data.count()))
    print("Test Dataset Count: " + str(test_data.count()))

    # Groupby view of training data
    training_data.select(F.explode(F.arrays_zip('token.result','label.result')).alias("cols")) \
    .select(F.expr("cols['0']").alias("token"),
            F.expr("cols['1']").alias("ground_truth")).groupBy('ground_truth').count().orderBy('count', ascending=False)\
            .show(100,truncate=False)

    # Groupby view of testing data
    test_data.select(F.explode(F.arrays_zip('token.result','label.result')).alias("cols")) \
    .select(F.expr("cols['0']").alias("token"),
            F.expr("cols['1']").alias("ground_truth")).groupBy('ground_truth').count().orderBy('count', ascending=False)\
            .show(100,truncate=False)

    ## training loop
    ## for test run of pipeline, use n=1, takes about 5 mins for u0.3o1 train file
    ## for training, use n=5
    for n in [1]:
        for x in [8]:
            for z in [0.001]:
                epoch = n
                batch_size = x
                learning_rate = z
                model_type = "clinical_embeddings"
                # Shared stem of the saved model, the prediction csv and the result csv
                run_name = f"{model_type}_{epoch}_{batch_size}_{learning_rate}_{resample_ratio}_{dataset_name}"

                # Named start_time/start_ts (not `start`) so the start() session
                # function defined earlier in the notebook is not overwritten.
                start_time = time.ctime()
                start_ts = time.time()
                print('Start of loop: ', 'epoch =', n, ', batch_size =', x, ', learning_rate =', z, ', resample_ratio =', resample_ratio)
                print('Start time for new loop: ', start_time)
                print(50*'-')
                clinical_embeddings = WordEmbeddingsModel.load(embeddings_clinical_local_path)\
                          .setInputCols(["sentence", "token"])\
                          .setOutputCol("embeddings")

                # The NER approach reads its test set from parquet, so write the
                # test data with embeddings under a time-stamped name.
                test_parquet_path = "ner_dl_ncctest_" + model_type + str(datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")) + ".parquet"
                clinical_embeddings.transform(test_data).write.parquet(test_parquet_path)

                nerTagger = MedicalNerApproach()\
                            .setInputCols(["sentence", "token", "embeddings"])\
                            .setLabelColumn("label")\
                            .setOutputCol("ner")\
                            .setMaxEpochs(n)\
                            .setLr(z)\
                            .setBatchSize(x)\
                            .setRandomSeed(100)\
                            .setVerbose(1)\
                            .setValidationSplit(0.1)\
                            .setEvaluationLogExtended(True) \
                            .setEnableOutputLogs(True)\
                            .setIncludeConfidence(True)\
                            .setTestDataset(test_parquet_path)\
                            .setOutputLogsPath('./ner_logs_ncc')\
                            .setGraphFolder('./graph/medical_ner_graphs')\
                            .setEarlyStoppingCriterion(0.01)\
                            .setEarlyStoppingPatience(3)
                ner_pipeline = Pipeline(stages=[
                                clinical_embeddings,
                                nerTagger])
                ner_model_clinicalembeddings = ner_pipeline.fit(training_data)

                # The fitted pipeline already contains the embeddings stage, so it is
                # applied to the raw test data directly (no reassignment of test_data).
                predictions = ner_model_clinicalembeddings.transform(test_data)

                #===================================================
                ## Save NER Model to be used in Pipeline
                filename_save = "./saved_models/" + run_name
                ner_model_clinicalembeddings.stages[1].write().overwrite().save(filename_save)
                #====================================================

                ## Collect token / ground truth / prediction once, then write it to csv
                preds_df = predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols"))\
                       .select(F.expr("cols['0']").alias("token"), F.expr("cols['1']").alias("ground_truth"), F.expr("cols['2']")\
                       .alias("prediction")).toPandas()
                preds_df.to_csv("./ner_output/" + run_name + ".csv")

                report = classification_report(preds_df['ground_truth'], preds_df['prediction'], digits=4)
                accuracy = accuracy_score(preds_df['ground_truth'] , preds_df['prediction'])

                # get per class accuracy
                # https://stackoverflow.com/questions/39770376/scikit-learn-get-accuracy-scores-for-each-class
                classes = np.unique(preds_df['ground_truth'])
                # Build the confusion matrix over an explicit label list (ground truth
                # plus predictions) so row/column positions are known even when the
                # model predicts a label that never occurs in the ground truth.
                all_labels = np.unique(np.concatenate([preds_df['ground_truth'].to_numpy(),
                                                       preds_df['prediction'].to_numpy()]))
                cm = confusion_matrix(preds_df['ground_truth'], preds_df['prediction'], labels=all_labels)
                label_position = {label: pos for pos, label in enumerate(all_labels)}

                # We will store the results in a dictionary for easy access later
                per_class_accuracies = {}

                # Calculate the accuracy for each one of the ground-truth classes
                for cls in classes:
                    idx = label_position[cls]

                    # True positives are all the samples of our current GT class that were predicted as such
                    true_positives = cm[idx, idx]

                    # 03-jul-2023: dont consider TN, use TP/(TP+FP+FN), same formulae for whole manuscript
                    # row sum = TP+FN, column sum = TP+FP, so TP+FP+FN = row + column - TP
                    tp_fp_fn = cm[idx, :].sum() + cm[:, idx].sum() - true_positives
                    per_class_accuracies[cls] = true_positives / tp_fp_fn

                # Per-class report and accuracies as DataFrames
                report_dict = classification_report(preds_df['ground_truth'], preds_df['prediction'], digits=4, labels=classes, output_dict=True)
                classification_report_df = pd.DataFrame(report_dict).transpose()
                per_class_accuracies_df = pd.DataFrame.from_dict(per_class_accuracies, orient='index', columns=['class_accuracy'])

                #combine_report_df = pd.concat([per_class_accuracies_df,classification_report_df], axis=1)  (not used)

                end_time = time.ctime()
                duration = time.time() - start_ts
                print('End time of loop: ', end_time)

                # Append this run's metrics and save the accumulated table under a time-stamped name
                to_append = [model_type, epoch, learning_rate, batch_size, resample_ratio, start_time, end_time, duration, accuracy, per_class_accuracies_df, report]
                output_df.loc[len(output_df)] = to_append
                result_path = "./ner_result/" + run_name + "_" + str(datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")) + ".csv"
                output_df.to_csv(result_path, header=True)
                print(50*'-')
                print("<<<Model Performance saved!>>>")
                print(50*'-')
                print(50*'-')
                

## ------------------- END OF TRAINING  --------------------

## ------------------- START OF HYPERPARAMETER TUNING  --------------------

## Convert the cells in this section to code cells if you need to run them

Hyperparameter tuning can be performed on the training set whose resampling ratio combination (oversampling and undersampling) yields the best classification performance on the test dataset.

### Generate Validation and Testing Learning Curves

Note: Each training run generates a log file, saved at .\ner_logs_ncc\
This log file can be used as input to Spark NLP's ner_log_parser.get_charts function to generate the validation and testing learning curves.

## ------------------- END OF HYPERPARAMETER TUNING  --------------------

## ------------------- MODEL INFERENCE --------------------
same code as 03predict_radio_ner_v1.0.ipynb

In [None]:
# Folder name (under ./saved_models/) of the NER model to use for prediction.
# The training loop saves each model as
# <model_type>_<epoch>_<batch_size>_<learning_rate>_<resample_ratio>_<dataset_name>.
best_ner_model = "clinical_embeddings_1_8_0.001_u0.3o1_train4522"

## Prediction Pipeline - with jsl assertion status detection 

In [None]:
# Build the prediction pipeline:
# text -> document -> sentences -> tokens -> embeddings -> NER tags -> NER chunks -> assertion status
document = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentence')

token = Tokenizer()\
        .setInputCols(['sentence'])\
        .setOutputCol('token')

#use .pretrained() for sparknlp online session
#use .load() for sparknlp airgap session
clinical_embeddings = WordEmbeddingsModel.load(embeddings_clinical_local_path)\
    .setInputCols(["sentence","token"])\
    .setOutputCol("embeddings")

#load the best ner model saved after training
loaded_ner_model = MedicalNerModel.load("./saved_models/" + best_ner_model)\
        .setInputCols(["sentence", "token", "embeddings"])\
        .setOutputCol("ner")

converter = NerConverter()\
        .setInputCols(["sentence", "token", "ner"])\
        .setOutputCol("ner_span")

#using jsl radiology assertion model for all ner entities
#use .pretrained() for sparknlp online session, e.g.
#   AssertionDLModel.pretrained("assertion_dl_radiology", "en", "clinical/models")
#use .load() for sparknlp airgap session; the folder is the jsl_radiology_assertion
#path configured at the top of the notebook (previously a hard-coded relative path
#was used here and the configured path was ignored)
radiology_assertion = AssertionDLModel.load(jsl_radiology_assertion)\
    .setInputCols(["sentence", "ner_span", "embeddings"]) \
    .setOutputCol("assertion")

ner_prediction_pipeline = Pipeline(stages = [
        document,
        sentence,
        token,
        clinical_embeddings,
        loaded_ner_model,
        converter,
        radiology_assertion
])

# Every stage is already trained, so fitting on a one-row empty DataFrame only
# turns the Pipeline into a PipelineModel that can transform data.
empty_data = spark.createDataFrame([['']]).toDF("text")

ner_prediction_model = ner_prediction_pipeline.fit(empty_data)

In [None]:
## Sample prediction

In [None]:
# Placeholder input: replace the text between the triple quotes with the report text to annotate
text1 = """
your sample text
"""

In [None]:
# Wrap the sample text in a one-row Spark DataFrame with a single "text" column
text = text1
sample_data = spark.createDataFrame([[text]]).toDF("text")
# Show the row, then the column name/type pairs as the cell output
sample_data.show()
sample_data.dtypes

In [None]:
# Run the fitted pipeline on the sample DataFrame
preds = ner_prediction_model.transform(sample_data)

# One row per NER chunk: chunk text, chunk metadata and assertion result zipped together
chunk_rows = preds.select(
    F.explode(
        F.arrays_zip('ner_span.result', 'ner_span.metadata', 'assertion.result')
    ).alias("cols")
)

# Name the zipped fields and print the table without truncating long values
chunk_table = chunk_rows.select(
    F.expr("cols['0']").alias("chunk"),
    F.expr("cols['1']['entity']").alias("ner_label"),
    F.expr("cols['1']['sentence']").alias("sent_id"),
    F.expr("cols['2']").alias("assertion"),
)
chunk_table.show(truncate=False)

In [None]:
# Wrap the fitted pipeline in a LightPipeline and annotate the sample text directly;
# fullAnnotate returns a list, of which the first element is kept
lmodel = LightPipeline(ner_prediction_model)
ppres = lmodel.fullAnnotate(text)[0]
# Show which keys the annotation result contains
ppres.keys()

In [None]:
# Inspect the NER chunk annotations of the sample text
ppres['ner_span']

In [None]:
# assertion detection: pair each NER chunk with the assertion annotation at the same position
chunk_assertion_pairs = list(zip(ppres['ner_span'], ppres['assertion']))

chunk = [span.result for span, _ in chunk_assertion_pairs]
entity = [span.metadata['entity'] for span, _ in chunk_assertion_pairs]
status = [assertion.result for _, assertion in chunk_assertion_pairs]

# Tabulate the chunks and preview the first 20 rows
df = pd.DataFrame({'chunk':chunk, 'entity':entity, 'assertion_status':status})
df.head(20)

In [None]:
from sparknlp_display import NerVisualizer
visualiser = NerVisualizer()

# Make sure the output folder exists before anything is written into it
# (only ./inference is created by the setup cell at the top of the notebook)
display_dir = "./inference/display_result"
os.makedirs(display_dir, exist_ok=True)

# Render the NER chunks of the sample text to an html file, and save the chunk table as csv
visualiser.display(ppres, label_col='ner_span', document_col='document', save_path=display_dir + "/display_result.html")
df.to_csv(display_dir + "/display_result.csv")

## Get prediction with input csv

In [None]:
# modify the code below to match your csv file
# "conclusion" is the text column that gets annotated; "sn_report_number" and
# "report_date" are used further down to label the outputs
df_text = pd.read_csv("./inference/sample.csv", usecols=['sn_report_number','report','report_date','conclusion'])
# number of non-null values per column
df_text.count()

In [None]:
# Preview the last rows of the input table
df_text.tail()

In [None]:
# Count the missing values in each column
df_text.isnull().sum()

In [None]:
# Replace missing conclusion text with an empty string
df_text['conclusion'] = df_text['conclusion'].fillna('')

In [None]:
# save the ner visualisation to html file for review
# save the ner annotation to csv for review

# Make sure the output folder exists before writing into it
display_dir = "./inference/display_result"
os.makedirs(display_dir, exist_ok=True)

columns = ['sn_report_number', 'report_date','entity_index', 'entity','chunk','assertion_status']

# Collect one DataFrame per report and concatenate once at the end
# (DataFrame.append is quadratic in a loop and no longer exists in pandas 2.x)
annotation_frames = []

# Iterate by position over ALL rows, so nothing is skipped when sn_report_number
# has missing values or the index is not 0..n-1
for i in range(len(df_text)):
    print(i)
    report_number = df_text['sn_report_number'].iloc[i]
    report_date = df_text['report_date'].iloc[i]

    ppres = lmodel.fullAnnotate(df_text['conclusion'].iloc[i])[0]
    # str() keeps the path concatenation valid when report numbers are read as numbers
    visualiser.display(ppres, label_col='ner_span', document_col='document',
                       save_path=display_dir + "/" + str(report_number) + "_report.html")

    #output to csv: pair each NER chunk with the assertion annotation at the same position
    chunk_assertion_pairs = list(zip(ppres['ner_span'], ppres['assertion']))
    temp_df = pd.DataFrame({'sn_report_number': report_number,
                            'report_date': report_date,
                            'chunk': [span.result for span, _ in chunk_assertion_pairs],
                            'entity': [span.metadata['entity'] for span, _ in chunk_assertion_pairs],
                            'assertion_status': [assertion.result for _, assertion in chunk_assertion_pairs]})
    # position of the chunk within its report
    temp_df['entity_index'] = temp_df.index
    annotation_frames.append(temp_df)

# With no input rows, still write a csv that contains just the header
if annotation_frames:
    annotation_df = pd.concat(annotation_frames)
else:
    annotation_df = pd.DataFrame(columns=columns)

annotation_df.to_csv(display_dir + "/pred_radio_ner_annotation.csv", columns=columns, index=False)