# Healthcare Data Entities Identification



## Workspace set up: Import and Install useful packages.

- mount the drive

In [3]:
# Installing and importing relevant libraries
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
import pathlib
import os

# load the model
model = spacy.load("en_core_web_sm")

# Dataset prepration and overview


In [4]:
with open('train_sent', 'r') as train_sent_file:
  train_words = train_sent_file.readlines()

with open('train_label', 'r') as train_labels_file:
  train_labels_by_word = train_labels_file.readlines()

with open('test_sent', 'r') as test_sent_file:
  test_words = test_sent_file.readlines()

with open('test_label', 'r') as test_labels_file:
  test_labels_by_word = test_labels_file.readlines()

In [5]:
def convert_to_sentences(dataset):
    sent_list = []
    sent = ""
    for entity in dataset:
        if entity != '\n':
            sent = sent + entity[:-1] + " "       # Adding word/label to current sentence / sequence of labels 
        else: 
            sent_list.append(sent[:-1])           # Getting rid of the space added after the last entity.
            sent = ""
    return sent_list

In [6]:
train_sentences = convert_to_sentences(train_words)
train_labels = convert_to_sentences(train_labels_by_word)
test_sentences = convert_to_sentences(test_words)
test_labels = convert_to_sentences(test_labels_by_word)

In [8]:
train_count_total = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
train_count_o = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(train_count_o)/len(train_count_total),3)
ner_percent = 1 - O_percent
print('-------------------- Datasets statistics: --------------------\n')

print('------------- Train dataset -------------')
print("Number of 'O' in the labels: {}".format(len(train_count_o)))
print("Number of total labels: {}".format(len(train_count_total)))
print("NER tokens: {}%, 'O' tokens: {}%".format(ner_percent, O_percent))
print("Number of sentences: {}".format(len(train_sentences)))


test_count_total = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
test_count_o = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(test_count_o)/len(test_count_total),3)
ner_percent = 1 - O_percent
print('\n------------- Test dataset -------------')
print("Number of 'O' in the labels: {}".format(len(test_count_o)))
print("Number of total labels: {}".format(len(test_count_total)))
print("NER tokens: {}%, 'O' tokens: {}%".format(ner_percent, O_percent))
print("Number of sentences: {}".format(len(test_sentences)))

-------------------- Datasets statistics: --------------------

------------- Train dataset -------------
Number of 'O' in the labels: 39683
Number of total labels: 45902
NER tokens: 0.135%, 'O' tokens: 0.865%
Number of sentences: 2599

------------- Test dataset -------------
Number of 'O' in the labels: 16127
Number of total labels: 18618
NER tokens: 0.134%, 'O' tokens: 0.866%
Number of sentences: 1056


# Concept Identification





- Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency

In [9]:
# Creating a combined dataset from training and test sentences, since this is an Exploratory analysis.
combined = train_sentences + test_sentences
print("Number of sentences in combined dataset (training + test): {}".format(len(combined)))

Number of sentences in combined dataset (training + test): 3655


In [11]:
# Creating a list of tokens which have PoS tag of 'NOUN' or 'PROPN'
noun_propn = []         # Initiating list for nouns and proper nouns
pos_tag = []            # initiating list for corresponding PoS tags.
for sent in combined:
    for token in model(sent):
        if token.pos_ in ['NOUN', 'PROPN']:
           noun_propn.append(token.text)
           pos_tag.append(token.pos_)
print("No. of tokens in combined dataset with PoS tag of 'NOUN' or 'PROPN': {}".format(len(noun_propn)))

No. of tokens in combined dataset with PoS tag of 'NOUN' or 'PROPN': 24292


In [12]:
# Print the top 25 most common tokens with NOUN or PROPN PoS tags
noun_pos = pd.DataFrame({"NOUN_PROPN":noun_propn,"POS_tag":pos_tag})
print("Top 25 comon tokens with PoS tag of 'NOUN' or 'PROPN' \n")
print(noun_pos["NOUN_PROPN"].value_counts().head(25))

Top 25 comon tokens with PoS tag of 'NOUN' or 'PROPN' 

patients        492
treatment       281
%               247
cancer          200
therapy         175
study           152
disease         141
cell            140
lung            116
group            94
chemotherapy     88
gene             87
effects          85
results          78
women            77
use              74
risk             71
surgery          71
cases            71
analysis         70
rate             67
response         66
survival         65
children         64
effect           63
Name: NOUN_PROPN, dtype: int64


# Defining features for CRF





In [13]:
# Analysis of PoS tags - Independent assignment for words vs Contextual assignment in a sentence.
sentence = train_sentences[1]   
sent_list = sentence.split()      # Splitting the sentence into its constituent words.
position = 2                      # Choosing position of word within sentence. Index starts at 0.

word = sent_list[position]        # Extracting word for PoS tag analysis.

print(sentence)

# Independent assignment of PoS tag (No contextual info)
print("\nPoS tag of word in isolation\nWord:",word,"--",model(word)[0].pos_,"\n")

# Contextual assignment of PoS tag based on other words in the sentence.
print("PoS tag of all words in sentence with context intact.")
for token in model(sentence):
    print(token.text, "--", token.pos_)

# Modified workflow to obtain PoS tag of specific word in question while keeping sentence context intact.
print("\nResult of modified workflow to obtain PoS tag of word at a specific position while keeping context within sentence in-tact.")
cnt = 0                           # Count of the word position within sentence.
for token in model(sentence):
      postag = token.pos_
      if (token.text == word) and (cnt == position):
          break
      cnt += 1
print("Word:", word,"POSTAG:",postag)


The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )

PoS tag of word in isolation
Word: cesarean -- PROPN 

PoS tag of all words in sentence with context intact.
The -- DET
total -- ADJ
cesarean -- ADJ
rate -- NOUN
was -- AUX
14.4 -- NUM
% -- NOUN
( -- PUNCT
344 -- NUM
of -- ADP
2395 -- NUM
) -- PUNCT
, -- PUNCT
and -- CCONJ
the -- DET
primary -- ADJ
rate -- NOUN
was -- AUX
11.4 -- NUM
% -- NOUN
( -- PUNCT
244 -- NUM
of -- ADP
2144 -- NUM
) -- PUNCT

Result of modified workflow to obtain PoS tag of word at a specific position while keeping context within sentence in-tact.
Word: cesarean POSTAG: ADJ


In [14]:
# Function to obtain contextual PoS tagger.
def contextual_pos_tagger(sent_list,position):
   

    sentence = " ".join(sent_list)          # Sentence needs to be in string format to process it with spacy model. List of words won't work.
    posit = 0                               # Initialising variable to record position of word in joined sentence to compare with the position of the word under considertion.
    for token in model(sentence):
        postag = token.pos_
        if (token.text == word) and (posit == position):
            break
        posit += 1
    return postag

In [None]:
# Previous word features
def getFeaturesForOneWord(sent_list, position):
  word = sent_list[position]
    
  # Obtaining features for current word
  features = [
    'word.lower=' + word.lower(),                                   # serves as word id
    'word.postag=' + contextual_pos_tagger(sent_list, position),    # PoS tag of current word
    'word[-3:]=' + word[-3:],                                       # last three characters
    'word[-2:]=' + word[-2:],                                       # last two characters
    'word.isupper=%s' % word.isupper(),                             # is the word in all uppercase
    'word.isdigit=%s' % word.isdigit(),                             # is the word a number
    'words.startsWithCapital=%s' % word[0].isupper()                # is the word starting with a capital letter
  ]
 
  if(position > 0):
    prev_word = sent_list[position-1]
    features.extend([
    'prev_word.lower=' + prev_word.lower(),                               # previous word
    'prev_word.postag=' + contextual_pos_tagger(sent_list, position - 1), # PoS tag of previous word
    'prev_word.isupper=%s' % prev_word.isupper(),                         # is the previous word in all uppercase
    'prev_word.isdigit=%s' % prev_word.isdigit(),                         # is the previous word a number
    'prev_words.startsWithCapital=%s' % prev_word[0].isupper()            # is the previous word starting with a capital letter
  ])
  else:
    features.append('BEG')                                                # feature to track begin of sentence 
 
  if(position == len(sent_list)-1):
    features.append('END')                                                # feature to track end of sentence
 
  return features

In [15]:
# Previous and next word features
def getFeaturesForOneWord(sent_list, position):
  word = sent_list[position]
    
  # Obtaining features for current word
  features = [
    'word.lower=' + word.lower(),                                   # serves as word id
    'word.postag=' + contextual_pos_tagger(sent_list, position),    # PoS tag of current word
    'word[-3:]=' + word[-3:],                                       # last three characters
    'word[-2:]=' + word[-2:],                                       # last two characters
    'word.isupper=%s' % word.isupper(),                             # is the word in all uppercase
    'word.isdigit=%s' % word.isdigit(),                             # is the word a number
    'words.startsWithCapital=%s' % word[0].isupper()                # is the word starting with a capital letter
  ]
 
  if(position > 0):
    prev_word = sent_list[position-1]
    features.extend([
    'prev_word.lower=' + prev_word.lower(),                               # previous word
    'prev_word.postag=' + contextual_pos_tagger(sent_list, position - 1), # PoS tag of previous word
    'prev_word.isupper=%s' % prev_word.isupper(),                         # is the previous word in all uppercase
    'prev_word.isdigit=%s' % prev_word.isdigit(),                         # is the previous word a number
    'prev_words.startsWithCapital=%s' % prev_word[0].isupper()            # is the previous word starting with a capital letter
  ])
  else:
    features.append('BEG')                                                # feature to track begin of sentence 
 
  if(position < len(sent_list) - 1):
    next_word = sent_list[position+1]
    features.extend([
    'next_word.lower=' + next_word.lower(),                               # Next word
    'next_word.postag=' + contextual_pos_tagger(sent_list, position + 1), # PoS tag of previous word
    'next_word.isupper=%s' % next_word.isupper(),                         # is the next word in all uppercase
    'next_word.isdigit=%s' % next_word.isdigit(),                         # is the next word a number
    'next_word.startsWithCapital=%s' % next_word[0].isupper()            # is the next word starting with a capital letter
  ])
  else:
    features.append('END')                                                # feature to track end of sentence

  return features

# Getting the features

In [16]:
# Write a code to get features for a sentence.
def getFeaturesForOneSentence(sentence):
  sentence_list = sentence.split()
  return [getFeaturesForOneWord(sentence_list, position) for position in range(len(sentence_list))]

In [17]:
# Checking feature extraction
example_sentence = train_sentences[5]
print(f'Example sentence: "{example_sentence}"\n')

features = getFeaturesForOneSentence(example_sentence)
print('Total features in the sentence:', len(features))
print('Example of features for the word "rates":')
features[1]

Example sentence: "Cesarean rates at tertiary care hospitals should be compared with rates at community hospitals only after correcting for dissimilar patient groups or gestational age"

Total features in the sentence: 24
Example of features for the word "rates":


['word.lower=rates',
 'word.postag=NOUN',
 'word[-3:]=tes',
 'word[-2:]=es',
 'word.isupper=False',
 'word.isdigit=False',
 'words.startsWithCapital=False',
 'prev_word.lower=cesarean',
 'prev_word.postag=NOUN',
 'prev_word.isupper=False',
 'prev_word.isdigit=False',
 'prev_words.startsWithCapital=True',
 'next_word.lower=at',
 'next_word.postag=NOUN',
 'next_word.isupper=False',
 'next_word.isdigit=False',
 'next_word.startsWithCapital=False']

## Write a code/function to get the labels of a sentence

In [18]:
# Write a code to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
  return labels.split()

# Define input and target variables


In [20]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]

### Define the labels as the target variable for test and the train dataset

In [21]:
Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]

# Build the CRF Classifier

In [22]:
crf = sklearn_crfsuite.CRF(max_iterations=300)

try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass
    
predictions = crf.predict(X_test)

# Evaluation

In [23]:
Y_pred = crf.predict(X_test)

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [24]:
expended_list = [Y_sentence for Y_sentences in Y_test for Y_sentence in Y_sentences]

df = pd.DataFrame()
df['Test annotations'] = expended_list
df.value_counts()

Test annotations
O                   16127
D                    1450
T                    1041
dtype: int64

In [29]:
print(metrics.flat_f1_score(Y_test, Y_pred, average='weighted'))
print(metrics.flat_f1_score(Y_test, Y_pred, average='macro'))

0.9157843882472863
0.7476424017612026


In [58]:
metrics.flat_f1_score(Y_test, Y_pred, average='weighted')

0.9083079133720954

In [65]:
metrics.flat_f1_score(Y_test, Y_pred, average='macro')

0.7211435117446926

In [59]:
# Example test sentence and corresponding actual and predicted labels 
print("Sentence: ",test_sentences[13])
print("Actual labels:    ", Y_test[13])
print("Predicted labels: ", Y_pred[13])

Sentence:  The objective of this study was to determine if the rate of preeclampsia is increased in triplet as compared to twin gestations
Actual labels:     ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'D', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted labels:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'D', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [60]:
# Feature list of sentence above
print(X_test[13])

[['word.lower=the', 'word.postag=NOUN', 'word[-3:]=The', 'word[-2:]=he', 'word.isupper=False', 'word.isdigit=False', 'words.startsWithCapital=True', 'BEG'], ['word.lower=objective', 'word.postag=NOUN', 'word[-3:]=ive', 'word[-2:]=ve', 'word.isupper=False', 'word.isdigit=False', 'words.startsWithCapital=False', 'prev_word.lower=the', 'prev_word.postag=NOUN', 'prev_word.isupper=False', 'prev_word.isdigit=False', 'prev_words.startsWithCapital=True'], ['word.lower=of', 'word.postag=NOUN', 'word[-3:]=of', 'word[-2:]=of', 'word.isupper=False', 'word.isdigit=False', 'words.startsWithCapital=False', 'prev_word.lower=objective', 'prev_word.postag=NOUN', 'prev_word.isupper=False', 'prev_word.isdigit=False', 'prev_words.startsWithCapital=False'], ['word.lower=this', 'word.postag=NOUN', 'word[-3:]=his', 'word[-2:]=is', 'word.isupper=False', 'word.isdigit=False', 'words.startsWithCapital=False', 'prev_word.lower=of', 'prev_word.postag=NOUN', 'prev_word.isupper=False', 'prev_word.isdigit=False', 'pr

# Medical NER and REL



In [61]:
disease_treatment = {}            # Initializing an empty dictionary
numsent_evidence = {}             # Initializing an empty dictionary
for i in range(len(Y_pred)):
    #print(test_sentences[i])
    cnt_disease = 0           # Count of number of diseases mentioned in the sentence
    cnt_treatment = 0         # Count of the number of treatments mentioned in the sentence
    diseases = [""]           # Initializing a blank list of diseases for current sentence.
    treatment = [""]          # Initializing a blank list of treatments for current sentence.
    sentence_number = [""]
    evidence = [""]
    
    length = len(Y_pred[i])   # Length of current sentence.
    for j in range(length):
        if (Y_pred[i][j] == 'D'):                                                     # Checking for label indicating disease for current word ('D')
            diseases[cnt_disease] += (X_test[i][j][0].split('=')[1] + " ")            # Adding word to diseases list.
            if j < length - 1:
                if (Y_pred[i][j+1] != 'D'):                                           # Check for name of disease extending over multiple words. 
                    # If next word does not have label 'D', then truncate the space added at the end of the last word.
                    diseases[cnt_disease] = diseases[cnt_disease][:-1]
                    cnt_disease += 1
                    diseases.append("")                                               # Adding a placeholder for the next disease in the current sentence.
            else:
                diseases[cnt_disease] = diseases[cnt_disease][:-1]
                cnt_disease += 1
                diseases.append("")
                            
        if (Y_pred[i][j] == 'T'):                                                     # Checking for label indicating treatment for current word ('T')
            treatment[cnt_treatment] += (X_test[i][j][0].split('=')[1] + " ") # Adding word to corresponding treatment list.
            if j < length - 1:
                if (Y_pred[i][j+1] != 'T'):                                           # Check for name of treatment extending over multiple words. 
                    # If next word does not have label 'T', then truncate the space added at the end of the last word.
                    treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                    cnt_treatment += 1
                    treatment.append("")                                              # Adding a placeholder for the next treatment in the current sentence.
            else:
                treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                cnt_treatment += 1
                treatment.append("")

    diseases.pop(-1)    # Getting rid of the last empty placeholder in diseases list
    treatment.pop(-1)   # Getting rid of the last empty placeholder in treatments list

    # To our dictionary, add or append treatments to the diseases identified from the current sentence, if any.
    if len(diseases) > 0:       # Checking if any diseases have been identified for the current sentence.

        sentence_number = i
        evidence = test_sentences[i]   
        numsent_evidence[sentence_number] = evidence
        for disease in diseases:
            if disease in disease_treatment.keys():
                # Extend treatment list if other treatments for the particular disease already exist
                disease_treatment[disease].extend(treatment)
            else:
                # Creating list of treatments for particular disease if it doesn not exist already.
                disease_treatment[disease] = treatment


In [62]:
numsent_evidence

{5: 'This study tested the hypothesis that to reduce the rate of macrosomic infants in gestational diabetes cases , good glycemic control should be initiated before 34 completed gestational weeks',
 13: 'The objective of this study was to determine if the rate of preeclampsia is increased in triplet as compared to twin gestations',
 15: 'The rate of severe preeclampsia was increased significantly in the triplet group 12 of 53 ( 22.6 % ) as compared with the twin group 3 of 53 ( 5.7 % ) ( OR = 4.9 , 95 % CI 1.2-23.5 , p = 0.02 )',
 20: 'Sequelae include severe developmental delay and asymmetric double hemiplegia',
 21: 'A subchorial placental hematoma , which detected as a subchorial placental lucencies by ultrasonography , can be a cause of reversible nonimmune hydrops fetalis',
 27: 'CONTEXT : A mutation in the BRCA1 gene may confer substantial risk for breast and/or ovarian cancer',
 29: 'PARTICIPANTS : Institutions selected 798 persons representing families ( 1 person for each famil

In [63]:
disease_treatment

{'gestational diabetes cases': [],
 'preeclampsia': [],
 'severe preeclampsia': [],
 'asymmetric double hemiplegia': [],
 'reversible nonimmune hydrops fetalis': [],
 'breast and/or ovarian cancer': [],
 'breast cancer': ['hormone replacement therapy',
  'undergone subcutaneous mastectomy'],
 'ovarian cancer': ['hormone replacement therapy',
  'undergone subcutaneous mastectomy'],
 'prostate cancer': ['radical prostatectomy and iodine 125 interstitial radiotherapy'],
 'mutated prostate cancer': ['radical prostatectomy and iodine 125 interstitial radiotherapy'],
 'hereditary prostate cancer': [],
 'multiple sclerosis ( ms )': [],
 'hereditary retinoblastoma': ['radiotherapy'],
 'epilepsy': [],
 'unstable angina or non-q-wave myocardial infarction': ['roxithromycin'],
 'coronary-artery disease': ['antichlamydial antibiotics'],
 'early-stage cervical carcinoma': [],
 'advanced disease': [],
 'cerebral palsy': ['hyperbaric oxygen therapy'],
 'severe pain': [],
 'myofascial trigger point pa

In [64]:
# Obtaining a cleaned version of our "disease_treatment" dictionary

cleaned_dict = {"sentence_number_in_test_set" : [], "disease" : [], "treatment" : [], "evidence" : []}
for sentence_number, disease in zip(numsent_evidence.keys(),disease_treatment.keys()):
  if disease_treatment[disease] != []:

    cleaned_dict["sentence_number_in_test_set"].append(sentence_number)
    cleaned_dict["disease"].append(disease)
    cleaned_dict["treatment"].append(disease_treatment[disease])
    cleaned_dict["evidence"].append(numsent_evidence[sentence_number])


# Import pandas library
import pandas as pd

  
# Create the pandas DataFrame
df = pd.DataFrame(cleaned_dict)
  
# print dataframe.
df



Unnamed: 0,sentence_number_in_test_set,disease,treatment,evidence
0,29,breast cancer,"[hormone replacement therapy, undergone subcut...",PARTICIPANTS : Institutions selected 798 perso...
1,33,ovarian cancer,"[hormone replacement therapy, undergone subcut...",CONTEXT : Approximately 9 % of prostate cancer...
2,36,prostate cancer,[radical prostatectomy and iodine 125 intersti...,FAMILIES : A total of 74 North American famili...
3,37,mutated prostate cancer,[radical prostatectomy and iodine 125 intersti...,CONCLUSIONS : Families that provide evidence f...
4,61,hereditary retinoblastoma,[radiotherapy],CONCLUSION : Methylphenidate is effective in t...
...,...,...,...,...
107,993,postvitrectomy diabetic vitreous hemorrhage,[peripheral retinal cryotherapy],Transabdominal extensive esophagogastric devas...
108,995,hepatitis b,[vaccine containing mf59],Thoracoscopy for empyema in children
109,996,temporomandibular joint arthropathy,[arthroscopic treatment],Conventional treatments for non-Hodgkin 's lym...
110,998,severe secondary peritonitis,[surgical management],Excimer laser assisted in situ keratomileusis ...
