# Healthcare Data Entities Identification

In [1]:
# Installing and importing relevant libraries
import glob
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
import pathlib
import os
from time import time
from spacy.tokenizer import Tokenizer
SENT_RANGE = 10 # Range of word to consider as features

# load the model
model = spacy.load("en_core_web_sm")
model.tokenizer = Tokenizer(model.vocab)

# Dataset prepration and overview

In [2]:
none_all = []
medical_sample = []
with open('golden_data.txt', 'r') as data_file:
    for line in data_file:
        if line.endswith('||NONE\n'):
            none_all.append(line.replace('||NONE\n', ''))
        elif line.endswith('||TREAT_FOR_DIS\n'):
            medical_sample.append(line.replace('||TREAT_FOR_DIS\n', ''))

none_sample = none_all[:len(medical_sample)]
print('Number of positive samples: ', len(none_sample), ' Number of negative samples: ',len(medical_sample))

Number of positive samples:  830  Number of negative samples:  830


In [3]:
train_len  = int(len(medical_sample) * 0.7)
train1, test1 = medical_sample[:train_len], medical_sample[train_len:] 
train2, test2 = none_sample[:train_len], none_sample[train_len:] 
train = train1 + train2
test = test1 + test2

In [4]:
def format_sentences(org_sentences):
    sentences_formatted = []
    annotations = []
    single_annot = 'O'
    for sentence in org_sentences:
        sentence_formatted = ''
        annotation = ''
        for word in sentence.split():
            if word == '<DIS>':
                single_annot = 'D'

            elif word == '</DIS>':
                single_annot = 'O'

            elif word == '<TREAT>':
                single_annot = 'T'

            elif word == '</TREAT>':
                single_annot = 'O'
            else:
                sentence_formatted += word + ' '
                annotation += single_annot + ' '

        sentences_formatted.append(sentence_formatted.strip())
        annotations.append(annotation.strip())
        
    return sentences_formatted, annotations

In [5]:
train_sentences, train_labels = format_sentences(train)
test_sentences, test_labels = format_sentences(test)

In [6]:
train_count_total = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
train_count_o = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(train_count_o)/len(train_count_total),3)
ner_percent = 1 - O_percent
df = pd.DataFrame()
df['Train annotations counts'] = train_count_total
print('-------------------- Datasets statistics: --------------------\n')
print("'T' - Treatment")
print("'D' - Disease")
print("'O' - Other")
print('------------- Train dataset -------------')
print(df.value_counts())
print("NER tokens('D'/'T'): {}%, 'O' tokens: {}%".format(round(ner_percent,4), O_percent))
print("Number of sentences: {}".format(len(train_sentences)))


test_count_total = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
test_count_o = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(test_count_o)/len(test_count_total),3)
ner_percent = 1 - O_percent
df = pd.DataFrame()
df['Test annotations counts'] = test_count_total

print('\n------------- Test dataset -------------')
print(df.value_counts())
print("NER tokens('D'/'T'): {}%, 'O' tokens: {}%".format(round(ner_percent,4), O_percent))
print("Number of sentences: {}".format(len(test_sentences)))

-------------------- Datasets statistics: --------------------

'T' - Treatment
'D' - Disease
'O' - Other
------------- Train dataset -------------
Train annotations counts
O                           25099
T                            2159
D                            1956
dtype: int64
NER tokens('D'/'T'): 0.141%, 'O' tokens: 0.859%
Number of sentences: 1162

------------- Test dataset -------------
Test annotations counts
O                          8336
T                           777
D                           683
dtype: int64
NER tokens('D'/'T'): 0.149%, 'O' tokens: 0.851%
Number of sentences: 498


# Defining features for CRF

## Select one out of two options:
1) 10 range word feature from left and right

2) Previous and next word features

3) Previous word features

In [7]:
# 1) Range of "SENT_RANGE" range of features
def getFeaturesForOneWord(cur_loc, sentence):
    end_loc = len(sentence) - 1

    # Obtaining features for words
    features = []
    left_range = max(0, cur_loc - SENT_RANGE)
    right_range = min(end_loc, cur_loc + 10)

    for i_loc in range(left_range, right_range):
        word = sentence[i_loc]
        i = i_loc - cur_loc
        features.extend([
        f'word{i}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{i}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{i}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{i}.dep=' + word.dep_,                                             # dependency dependent
        f'word{i}.head=' + word.head.orth_,                                      # dependency head
        f'word{i}.isupper={word.orth_.isupper()}',                            # is the word in all uppercase
        f'word{i}.isdigit={word.orth_.isdigit()}',                            # is the word a number
        f'word{i}.startsWithCapital={word.orth_[0].isupper()}'])               # is the word starting with a capital letter
        
    if(cur_loc == 0):
        features.append('BEG')                                                # feature to track begin of sentence 
 
    elif(cur_loc == end_loc - 1):
        features.append('END')                                                # feature to track end of sentence
 
    return features

In [269]:
# 2) Previous, current and next word features
def getFeaturesForOneWord(cur_loc, sentence):
    end_loc = len(sentence) - 1
    # Obtaining features for current word
    word = sentence[cur_loc]
 
    features = [
    f'word{0}.lower=' + word.orth_.lower(),                                  # serves as word id
    f'word{0}.postag=' + word.pos_,                                          # PoS tag of current word
    f'word{0}[-3:]=' + word.orth_[-3:],                                      # last three characters
    f'word{0}.dep=' + word.dep_,                                             # dependency dependent
    f'word{0}.head=' + word.head.orth_,                                      # dependency head
    f'word{0}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
    f'word{0}.isdigit={word.orth_.isdigit()}',                               # is the word a number
    f'word{0}.startsWithCapital={word.orth_[0].isupper()}']                  # is the word starting with a capital letter

        
    if(cur_loc > 0):
        word = sentence[cur_loc - 1]
        features.extend([
        f'word{-1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{-1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{-1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{-1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{-1}.head=' + word.head.orth_,                                      # dependency head
        f'word{-1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{-1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{-1}.startsWithCapital={word.orth_[0].isupper()}'])                # is the word starting with a capital letter
        
    
    else:
        features.append('BEG')                                          # feature to track begin of sentence 
 
    if(cur_loc + 1 < end_loc):
        word = sentence[cur_loc + 1]
        features.extend([
        f'word{1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{1}.head=' + word.head.orth_,                                      # dependency head
        f'word{1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{1}.startsWithCapital={word.orth_[0].isupper()}'])                  # is the word starting with a capital letter

    else:
        features.append('END')                                                # feature to track end of sentence

    return features

In [277]:
# 3) Previous and current word features
def getFeaturesForOneWord(cur_loc, sentence):
    end_loc = len(sentence) - 1
    # Obtaining features for current word
    word = sentence[cur_loc]

    features = [
    f'word{0}.lower=' + word.orth_.lower(),                                  # serves as word id
    f'word{0}.postag=' + word.pos_,                                          # PoS tag of current word
    f'word{0}[-3:]=' + word.orth_[-3:],                                      # last three characters
    f'word{0}.dep=' + word.dep_,                                             # dependency dependent
    f'word{0}.head=' + word.head.orth_,                                      # dependency head
    f'word{0}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
    f'word{0}.isdigit={word.orth_.isdigit()}',                               # is the word a number
    f'word{0}.startsWithCapital={word.orth_[0].isupper()}']                  # is the word starting with a capital letter

        
    if(cur_loc > 0):
        word = sentence[cur_loc - 1]
        features.extend([
        f'word{-1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{-1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{-1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{-1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{-1}.head=' + word.head.orth_,                                      # dependency head
        f'word{-1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{-1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{-1}.startsWithCapital={word.orth_[0].isupper()}'])                # is the word starting with a capital letter
        
    
    else:
        features.append('BEG')                                                # feature to track begin of sentence 

    if(cur_loc == end_loc):
        features.append('END')                                                # feature to track end of sentence

    return features

# Prepare data 

In [8]:
# Get features for a sentence.
def getFeaturesForOneSentence(sentence):
    sentence_parsing = model(sentence)
    return [getFeaturesForOneWord(ii, sentence_parsing) for ii,token in enumerate(sentence_parsing)]

# code to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
    return labels.split()

In [9]:
# Checking feature extraction
example_sentence = train_sentences[1]
print(f'Example sentence: "{example_sentence}"\n')

features = getFeaturesForOneSentence(example_sentence)
print('Total features in the sentence:', len(features))
print('Example of features for the word "rates":')
features[2]

Example sentence: "CONCLUSION : Methylphenidate is effective in treating children with epilepsy and ADHD and safe in children who are seizure free ."

Total features in the sentence: 21
Example of features for the word "rates":


['word-2.lower=conclusion',
 'word-2.postag=NOUN',
 'word-2[-3:]=ION',
 'word-2.dep=dep',
 'word-2.head=is',
 'word-2.isupper=True',
 'word-2.isdigit=False',
 'word-2.startsWithCapital=True',
 'word-1.lower=:',
 'word-1.postag=PUNCT',
 'word-1[-3:]=:',
 'word-1.dep=punct',
 'word-1.head=CONCLUSION',
 'word-1.isupper=False',
 'word-1.isdigit=False',
 'word-1.startsWithCapital=False',
 'word0.lower=methylphenidate',
 'word0.postag=NOUN',
 'word0[-3:]=ate',
 'word0.dep=nsubj',
 'word0.head=is',
 'word0.isupper=False',
 'word0.isdigit=False',
 'word0.startsWithCapital=True',
 'word1.lower=is',
 'word1.postag=AUX',
 'word1[-3:]=is',
 'word1.dep=ROOT',
 'word1.head=is',
 'word1.isupper=False',
 'word1.isdigit=False',
 'word1.startsWithCapital=False',
 'word2.lower=effective',
 'word2.postag=ADJ',
 'word2[-3:]=ive',
 'word2.dep=acomp',
 'word2.head=is',
 'word2.isupper=False',
 'word2.isdigit=False',
 'word2.startsWithCapital=False',
 'word3.lower=in',
 'word3.postag=ADP',
 'word3[-3:]=in',
 

In [19]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]

Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]

# build the CRF Classifier

In [21]:
def get_crf(X_train, Y_train):
    crf = sklearn_crfsuite.CRF(max_iterations=300)

    try:
        crf.fit(X_train, Y_train)
    except AttributeError:
        pass
    return crf

crf = get_crf(X_train, Y_train)

# Evaluation

In [12]:
Y_pred = crf.predict(X_test)

In [13]:
print('Using oprtion (1) - "Range of SENT_RANGE word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))
print("Recall F1: {}".format(metrics.flat_recall_score(Y_test, Y_pred, average='macro')))
print("Precision F1: {}".format(metrics.flat_precision_score(Y_test, Y_pred, average='macro')))
print("Accuracy F1: {}".format(metrics.flat_accuracy_score(Y_test, Y_pred)))

Using oprtion (1) - "Range of SENT_RANGE word features":
Weighted F1: 0.9222604008175131
Macro F1: 0.7947574532912421
Recall F1: 0.7347480494361633
Precision F1: 0.8849859829189922
Accuracy F1: 0.9280318497345855


In [276]:
print('Using oprtion (2) - "Previous, current and next word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))
print("Recall F1: {}".format(metrics.flat_recall_score(Y_test, Y_pred, average='macro')))
print("Precision F1: {}".format(metrics.flat_precision_score(Y_test, Y_pred, average='macro')))
print("Accuracy F1: {}".format(metrics.flat_accuracy_score(Y_test, Y_pred)))

Using oprtion (2) - "Previous, current and next word features":
Weighted F1: 0.9138709791733991
Macro F1: 0.7690004082496799
Recall F1: 0.7004792772905395
Precision F1: 0.8813663196202324
Accuracy F1: 0.921600653327889


In [284]:
print('Using oprtion (3) - "Previous and current word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))
print("Recall F1: {}".format(metrics.flat_recall_score(Y_test, Y_pred, average='macro')))
print("Precision F1: {}".format(metrics.flat_precision_score(Y_test, Y_pred, average='macro')))
print("Accuracy F1: {}".format(metrics.flat_accuracy_score(Y_test, Y_pred)))

Using oprtion (3) - "Previous and current word features":
Weighted F1: 0.9051708510330074
Macro F1: 0.7440366657060755
Recall F1: 0.6754440241483685
Precision F1: 0.863239130302844
Accuracy F1: 0.9146590445079624


# Extract all relations

In [39]:
all_sentences = train_sentences + test_sentences
all_sentences_string = train_sentences + test_sentences
all_sentences = [i.split() for i in all_sentences]
all_labels = train_labels + test_labels
all_labels = [i.split() for i in all_labels]
condition_treatment_evidence = {'condition':[], 'treatment':[], 'evidence':[]}            # Initializing an empty dictionary


for i in range(len(all_labels)):


        #print(test_sentences[i])
        cnt_disease = 0           # Count of number of diseases mentioned in the sentence
        cnt_treatment = 0         # Count of the number of treatments mentioned in the sentence
        diseases = [""]           # Initializing a blank list of diseases for current sentence.
        treatment = [""]          # Initializing a blank list of treatments for current sentence.
        sentence_number = [""]
        evidence = [""]
        
        length = len(all_labels[i])   # Length of current sentence.
        for j in range(length):
            if (all_labels[i][j] == 'D'):                                                     # Checking for label indicating disease for current word ('D')
                diseases[cnt_disease] += (all_sentences[i][j] + " ")            # Adding word to diseases list.
                if j < length - 1:
                    if (all_labels[i][j+1] != 'D'):                                           # Check for name of disease extending over multiple words. 
                        # If next word does not have label 'D', then truncate the space added at the end of the last word.
                        diseases[cnt_disease] = diseases[cnt_disease][:-1]
                        cnt_disease += 1
                        diseases.append("")                                               # Adding a placeholder for the next disease in the current sentence.
                else:
                    diseases[cnt_disease] = diseases[cnt_disease][:-1]
                    cnt_disease += 1
                    diseases.append("")
                                
            if (all_labels[i][j] == 'T'):                                                     # Checking for label indicating treatment for current word ('T')
                treatment[cnt_treatment] += (all_sentences[i][j] + " ") # Adding word to corresponding treatment list.
                if j < length - 1:
                    if (all_labels[i][j+1] != 'T'):                                           # Check for name of treatment extending over multiple words. 
                        # If next word does not have label 'T', then truncate the space added at the end of the last word.
                        treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                        cnt_treatment += 1
                        treatment.append("")                                              # Adding a placeholder for the next treatment in the current sentence.
                else:
                    treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                    cnt_treatment += 1
                    treatment.append("")

        diseases.pop(-1)    # Getting rid of the last empty placeholder in diseases list
        treatment.pop(-1)   # Getting rid of the last empty placeholder in treatments list
        if cnt_disease and cnt_treatment:
            for i_deases in range(cnt_disease):
                for j in range(cnt_treatment):             
                    condition_treatment_evidence['condition'].append(diseases[i_deases])            
                    condition_treatment_evidence['treatment'].append(treatment[j])
                    condition_treatment_evidence['evidence'].append(all_sentences_string[i])
  
# Create the pandas DataFrame
df_gold = pd.DataFrame(condition_treatment_evidence)

# Train on all the data and predicting on a new queries

In [23]:
def extract_new_relations(all_data_to_pred, all_predictions):
    cte = {'condition':[], 'treatment':[], 'evidence':[]}

    for sentence, preds in zip(data_to_pred1, Y1_pred):
        single_treatment = []
        single_condition = []
        sentence_treatments = []
        sentence_conditions = []
        cnt_D = 0;cnt_T = 0;cnt_O = 0;
        for word, pred in zip(sentence.split(), preds):
            if pred == 'D':
                cnt_D += 1; cnt_T = 0;
                single_condition.append(word)
                if cnt_T != 0:
                    sentence_treatments.append(' '.join(single_treatment))
                    single_treatment = []

            elif pred == 'T':
                cnt_T += 1; cnt_D = 0;
                single_treatment.append(word)
                if cnt_D != 0:
                    sentence_conditions.append(' '.join(single_condition))
                    single_condition = []                

            elif pred == 'O':            
                if cnt_T != 0:
                    sentence_treatments.append(' '.join(single_treatment))
                    single_treatment = []
                if cnt_D != 0:
                    sentence_conditions.append(' '.join(single_condition))
                    single_condition = []  

                cnt_D = 0; cnt_T = 0;

        if cnt_T != 0:
            sentence_treatments.append(' '.join(single_treatment))
            single_treatment = []
        if cnt_D != 0:
            sentence_conditions.append(' '.join(single_condition))
            single_condition = []  

        if len(sentence_conditions) and len(sentence_treatments):
            for t in sentence_treatments:
                for c in sentence_conditions:
                    cte['condition'].append(c)
                    cte['treatment'].append(t)
                    cte['evidence'].append(sentence)

    return pd.DataFrame(cte)

In [22]:
# Train model on all data
all_sentences = train_sentences + test_sentences
all_labels = train_labels + test_labels
all_sentences_f = [getFeaturesForOneSentence(sentence) for sentence in all_sentences]
all_labels_joined = [getLabelsInListForOneSentence(labels) for labels in all_labels]

crf = get_crf(all_sentences_f, all_labels_joined)

In [30]:
#get unique values
df_all_raw = pd.DataFrame()
# Load query data and extract relations
for file_name in glob.glob("spike_queries/*.csv"):
    df_raw1 = pd.read_csv(file_name)
    df_all_raw = pd.concat([df_all_raw, df_raw1], ignore_index=True)
    
# df_all_raw = pd.concat([df_all_raw, df_RE_neg[['sentence_id','sentence_text']]], ignore_index=True)
# df_all_raw = pd.concat([df_all_raw, df_RE_pos[['sentence_id','sentence_text']]], ignore_index=True)
print("THe total length",len(df_all_raw))

df_uniq = pd.DataFrame()
df_uniq['sentence_text'] = df_all_raw.sentence_text.unique()

print("THe total unique length",len(df_uniq))

THe total length 29452
THe total unique length 29139


In [34]:
data_to_pred1 = df_uniq['sentence_text'].to_list()

X1_pred = [getFeaturesForOneSentence(sentence) for sentence in data_to_pred1]
Y1_pred = crf.predict(X1_pred)

df_relations = extract_new_relations(data_to_pred1, Y1_pred)
print('Number of extracted realtions: ',len(df_relations))
df_all_re = pd.concat([df_relations, df_gold], ignore_index=True)
print('Number of all realtions: ',len(df_all_re))
df_all_re.to_json(r'relation2.jsonl',orient = 'records', lines = 'True')

Number of extracted realtions:  5013
Number of all realtions:  6068


In [27]:

# Example for Re that should not be extracted
txt1 = 'A case of tuberculous mastitis not cured by prolonged streptomycin therapy .'
yy1 = crf.predict([getFeaturesForOneSentence(txt1)])
txt2 = 'A case of tuberculous mastitis cured by prolonged streptomycin therapy .'
yy2 = crf.predict([getFeaturesForOneSentence(txt2)])

strr = ''
for i,j in zip(txt1.split(),yy1[0]):
    strr += i+'('+j+') '
strr2 = ''
for i,j in zip(txt2.split(),yy2[0]):
    strr2 += i+'('+j+') '
    
print(strr)
print(strr2)

A(O) case(O) of(O) tuberculous(D) mastitis(D) not(O) cured(O) by(O) prolonged(T) streptomycin(T) therapy(T) .(O) 
A(O) case(O) of(O) tuberculous(D) mastitis(D) cured(O) by(O) prolonged(T) streptomycin(T) therapy(T) .(O) 
