### Purpose: Preprocess and clean the training CSV for assertion model training
Entity pairs used:

- probability high - cancer imaging findings
- probability medium - cancer imaging findings
- probability low - cancer imaging findings
- probability uncertain - cancer imaging findings

### Note: Before running this notebook, please configure the following paths

In [None]:
# train/test setting
# os is imported here because this cell runs before the main import cell.
import os

# Root folder of the dataset.
data_folder = "dataset"
# Sub-folder holding the train/test CSV files. os.path.join picks the
# platform's separator, matching how later cells append file names to it.
train_folder = os.path.join(data_folder, "02csv")

In [None]:
# sparknlp license key files (we are using v3.4.2).
# Both files sit in the same folder, addressed relative to the working directory.
_licence_dir = r"..\sparknlp_licence_key"

# online license key - needs an internet connection
sparknlp_licence_key = _licence_dir + r"\yourkey.json"

# offline license key - for an air-gapped environment
sparknlp_airgap_licence_key = _licence_dir + r"\yourairgapkey.json"

## Import Libraries

In [None]:
import json, os, re, sparknlp, sparknlp_jsl, datetime, time
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import CoNLL
from sparknlp_jsl.annotator import *
from sparknlp_jsl.training import tf_graph

In [None]:
# Start Spark Session (Offline)
# Offline: load the air-gap license key file (a JSON object of key/value pairs).
with open(sparknlp_airgap_licence_key, encoding="utf-8") as f:
    airgap_license_keys = json.load(f)

# Expose every license entry as a notebook-level variable (SECRET and
# JSL_VERSION are read by the session-start cell) and as an environment
# variable. globals() states the intent directly; locals() only happens to be
# the same namespace when called at module level.
globals().update(airgap_license_keys)
os.environ.update(airgap_license_keys)

# Check the loaded entries. SECRET is only reported as present/missing:
# echoing its value would store the license secret in the saved notebook output.
print("SECRET:", "<set>" if airgap_license_keys.get("SECRET") else "<missing>")
print("JSL_VERSION:", airgap_license_keys.get("JSL_VERSION"))
print("PUBLIC_VERSION:", airgap_license_keys.get("PUBLIC_VERSION"))

# Python executables used by PySpark for the workers and for the driver.
os.environ['PYSPARK_PYTHON'] = 'python'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
print(os.environ['PYSPARK_PYTHON'])
print(os.environ['PYSPARK_DRIVER_PYTHON'])

# Start Spark Session with Custom Params (OFFLINE)
def start(SECRET):
    """Create (or reuse) the local Spark session used by this notebook.

    The session runs on ``local[16]`` with a 16G driver, Kryo serialization
    and the Spark NLP jars loaded from local files (offline setup).

    Args:
        SECRET: License secret. It is not read inside this function; the
            parameter is kept so existing ``start(SECRET)`` calls keep working.

    Returns:
        The object returned by ``SparkSession.builder.getOrCreate()``.

    Note:
        ``JSL_VERSION`` is read from the notebook's global namespace, so the
        license-loading cell must have been run first.
    """
    # Raw f-string: the Windows paths contain "\c" and "\s", which are not
    # valid escape sequences and raise a warning in a normal string literal.
    # The resulting text is unchanged.
    local_jars = rf"d:\content\spark-nlp-jsl-{JSL_VERSION}.jar, d:\content\spark-nlp_2.12-3.4.2.jar"

    builder = (
        SparkSession.builder
        .appName("Spark NLP Licensed radio_assertion")
        .master("local[16]")
        .config("spark.driver.memory", "16G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.driver.maxResultSize", "4000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2")
        .config("spark.jars", local_jars)
    )

    return builder.getOrCreate()


# Report the installed library versions before creating the session.
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Create the session; the bare name below displays it as the cell output.
spark = start(SECRET)

spark

## Import data
- use the same csv file generated by radio_re_model > 02data_preprocessing_v1.0.ipynb
- the file can be found in radio_re_model\dataset\02csv\radio_re_train4522_allrelations_clean.csv
- copy this file to radio_assertion_model\dataset\02csv\

In [None]:
# Load the cleaned relation CSV (copied from the radio_re_model project) from the train folder.
df_csv = pd.read_csv(os.path.join(train_folder,"radio_re_train4522_allrelations_clean.csv"))

In [None]:
# Show the column names of the loaded CSV.
df_csv.columns

In [None]:
# Keep only sentence-level relations whose first entity is a probability_xx
# label and whose second entity is cancer_imaging_findings.
probability_labels = ['probability_high','probability_medium','probability_uncertain','probability_low']
columns = ['sentence','chunk1','entity1','chunk2','entity2','entity2_begin','entity2_end','doc_title','dataset']

sentence_rows = df_csv[df_csv['relation_type']=='sentence']
condition1 = sentence_rows['entity1'].isin(probability_labels)
condition2 = sentence_rows['entity2']=="cancer_imaging_findings"

# Select the matching rows and the needed columns, then renumber the index.
df_csv = sentence_rows.loc[condition1 & condition2, columns].reset_index(drop=True)
df_csv.head()

In [None]:
# Non-null counts of every column for each (entity1, entity2) pair.
df_csv.groupby(["entity1","entity2"]).count()

## Pre-process the data

In [None]:
# Drop duplicate rows and renumber the index 0..n-1 (the token-index loop
# further down addresses rows by that number).
# drop=True discards the old index instead of inserting it as an extra
# "index" column, matching the reset_index call in the filtering cell.
# drop_duplicates returns a new frame, so df_csv itself is left untouched.
df = df_csv.drop_duplicates().reset_index(drop=True)
df.count()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Number of non-null sentences for each value of the 'dataset' column.
df.groupby('dataset').count()['sentence']

In [None]:
# Rename the relation columns to the names used from here on:
# sentence -> text, chunk2 -> target, entity1 -> label,
# entity2_begin/entity2_end -> start/end.
rename_map = {
    'sentence': 'text',
    'chunk2': 'target',
    'entity1': 'label',
    'entity2_begin': 'start',
    'entity2_end': 'end',
}
df = df.rename(columns=rename_map)

# Keep only the columns that are still needed.
columns = ['text','target', 'label','start','end','doc_title','dataset']
df = df[columns]
df.head()

In [None]:
# data cleaning: delete every run of characters that is neither a word
# character nor whitespace, applied identically to the sentence and the target
for col in ('text', 'target'):
    df[col] = df[col].str.replace(r'[^\w\s]+', '', regex=True)
df.head()

In [None]:
# tokenize the text using the sparknlp tokenizer
# Stage 1: wrap the raw 'text' column into a 'document' annotation.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
text_token = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

tokenizer_pipeline = Pipeline(stages=[document, text_token])

# Run the pipeline over the cleaned sentences, then bring back to pandas the
# text together with its list of token strings (column 'result').
assertion_df = spark.createDataFrame(df[['text']]).toDF("text")
fitted_tokenizer = tokenizer_pipeline.fit(assertion_df)
assertion_token = fitted_tokenizer.transform(assertion_df)
assertion_token_df = assertion_token.select('text','token.result').toPandas()
assertion_token_df.head()

In [None]:
# Save each text with its token list to CSV for manual checking (the DataFrame index is not written).
assertion_token_df.to_csv(os.path.join(train_folder,"radio_assertion_traintest_token.csv"), index=False)

In [None]:
# need to get the token idx for assertion training, i.e the token start/end position in sentence
def get_token_idx(idx, target, token_df=None, *, verbose=True):
    """Return the (start, end) token positions of ``target`` in sentence ``idx``.

    ``target`` is split on whitespace and searched for as a contiguous run of
    tokens in the sentence's token list. The whole run must match: matching
    only the first word would return a wrong span whenever that word also
    occurs earlier in the sentence.

    Args:
        idx: Positional row number of the sentence in ``token_df``.
        target: Target chunk text, e.g. ``'mass lesion'``.
        token_df: DataFrame whose ``'result'`` column holds each sentence's
            token list. Defaults to the notebook-level ``assertion_token_df``.
        verbose: When true, print the target tokens, the sentence tokens and
            the positions found.

    Returns:
        Tuple ``(start, end)`` of 0-based, inclusive token positions of the
        first occurrence of the full target in the sentence.

    Raises:
        ValueError: If ``target`` has no tokens or does not occur as a
            contiguous token sequence in the sentence.
    """
    if token_df is None:
        token_df = assertion_token_df

    target_list = target.split()
    if verbose:
        print(target_list)

    # list() so slicing and == below also work if the cell is an array.
    token_list = list(token_df.iloc[idx]['result'])
    if verbose:
        print(token_list)

    if not target_list:
        raise ValueError("target has no tokens")

    width = len(target_list)
    for start in range(len(token_list) - width + 1):
        if token_list[start:start + width] == target_list:
            end = start + width - 1
            if verbose:
                print(start, end)
            return (start, end)

    raise ValueError(f"{target!r} not found as a contiguous token sequence")
    
#get_token_idx(637,'hepatocellular carcinoma (hcc)')
#get_token_idx(0,'mass')
processed_df = df.copy()

# Replace the values in 'start'/'end' with the token positions of the target
# inside its sentence. Rows are addressed by number, which relies on df having
# a 0..n-1 index (it was reset after dropping duplicates).
# len() is used rather than a column count() so rows with a missing text
# are still visited.
for i in range(len(processed_df)):
    print("***",i)
    word = df.iloc[i]['target']
    try:
        s,e = get_token_idx(i,word)
    except Exception as exc:
        # Best effort: any failure marks the row as bad instead of stopping
        # the loop. Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupts working.
        print("bad row, token not found, set start to -1, this row will be excluded from training")
        print("reason:", repr(exc))
        processed_df.at[i,'start'] = -1
    else:
        processed_df.at[i,'start'] = s
        processed_df.at[i,'end'] = e
        

In [None]:
# Bad rows: those whose target tokens could not be located (start was set to -1).
processed_df[processed_df['start'] == -1]

In [None]:
# drop bad rows, i.e. those flagged with start == -1
is_bad_row = processed_df['start'] == -1
processed_df = processed_df[~is_bad_row]
processed_df.count()

In [None]:
# Preview the first rows of processed_df.
processed_df.head()

In [None]:
# Non-null counts of every column for each value of 'dataset'.
processed_df.groupby('dataset').count()

In [None]:
# Non-null counts of every column for each (dataset, label) combination.
processed_df.groupby(['dataset','label']).count()

In [None]:
# Write the processed rows to the train folder; the DataFrame index is not written.
processed_df.to_csv(os.path.join(train_folder,"radio_assertion_traintest.csv"), index=False)