# Healthcare Data Entities Identification

In [1]:
# Installing and importing relevant libraries
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
import pathlib
import os
from time import time
from spacy.tokenizer import Tokenizer
SENT_RANGE = 10 # Range of word to consider as features

# load the model
model = spacy.load("en_core_web_sm")
model.tokenizer = Tokenizer(model.vocab)

# Dataset prepration and overview

In [2]:
none_all = []
medical_sample = []
with open('data.txt', 'r') as data_file:
    for line in data_file:
        if line.endswith('||NONE\n'):
            none_all.append(line.replace('||NONE\n', ''))
        elif line.endswith('||TREAT_FOR_DIS\n'):
            medical_sample.append(line.replace('||TREAT_FOR_DIS\n', ''))

none_sample = none_all[:len(medical_sample)]
print('Number of positive samples: ', len(none_sample), ' Number of negative samples: ',len(medical_sample))

Number of positive samples:  830  Number of negative samples:  830


In [3]:
train_len  = int(len(medical_sample) * 0.7)
train1, test1 = medical_sample[:train_len], medical_sample[train_len:] 
train2, test2 = none_sample[:train_len], none_sample[train_len:] 
train = train1 + train2
test = test1 + test2

In [4]:
def format_sentences(org_sentences):
    sentences_formatted = []
    annotations = []
    single_annot = 'O'
    for sentence in org_sentences:
        sentence_formatted = ''
        annotation = ''
        for word in sentence.split():
            if word == '<DIS>':
                single_annot = 'D'

            elif word == '</DIS>':
                single_annot = 'O'

            elif word == '<TREAT>':
                single_annot = 'T'

            elif word == '</TREAT>':
                single_annot = 'O'
            else:
                sentence_formatted += word + ' '
                annotation += single_annot + ' '

        sentences_formatted.append(sentence_formatted.strip())
        annotations.append(annotation.strip())
        
    return sentences_formatted, annotations

In [5]:
train_sentences, train_labels = format_sentences(train)
test_sentences, test_labels = format_sentences(test)

In [6]:
train_count_total = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
train_count_o = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(train_count_o)/len(train_count_total),3)
ner_percent = 1 - O_percent
df = pd.DataFrame()
df['Train annotations counts'] = train_count_total
print('-------------------- Datasets statistics: --------------------\n')
print("'T' - Treatment")
print("'D' - Disease")
print("'O' - Other")
print('------------- Train dataset -------------')
print(df.value_counts())
print("NER tokens('D'/'T'): {}%, 'O' tokens: {}%".format(round(ner_percent,4), O_percent))
print("Number of sentences: {}".format(len(train_sentences)))


test_count_total = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
test_count_o = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(test_count_o)/len(test_count_total),3)
ner_percent = 1 - O_percent
df = pd.DataFrame()
df['Test annotations counts'] = test_count_total

print('\n------------- Test dataset -------------')
print(df.value_counts())
print("NER tokens('D'/'T'): {}%, 'O' tokens: {}%".format(round(ner_percent,4), O_percent))
print("Number of sentences: {}".format(len(test_sentences)))

-------------------- Datasets statistics: --------------------

'T' - Treatment
'D' - Disease
'O' - Other
------------- Train dataset -------------
Train annotations counts
O                           25099
T                            2159
D                            1956
dtype: int64
NER tokens('D'/'T'): 0.141%, 'O' tokens: 0.859%
Number of sentences: 1162

------------- Test dataset -------------
Test annotations counts
O                          8336
T                           777
D                           683
dtype: int64
NER tokens('D'/'T'): 0.149%, 'O' tokens: 0.851%
Number of sentences: 498


In [7]:
# 'T' - Treatment
# 'D' - Disease
# 'O' - Other
test_count_total = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
df = pd.DataFrame()
df['Test annotations'] = test_count_total
print(df.value_counts())

Test annotations
O                   8336
T                    777
D                    683
dtype: int64


# Defining features for CRF

## Select one out of two options:
1) 10 range word feature from left and right

2) Previous and next word features

3) Previous word features

In [30]:
# 1) Range of "SENT_RANGE" range of features
def getFeaturesForOneWord(cur_loc, sentence):
    end_loc = len(sentence) - 1

    # Obtaining features for words
    features = []
    left_range = max(0, cur_loc - SENT_RANGE)
    right_range = min(end_loc, cur_loc + 10)

    for i_loc in range(left_range, right_range):
        word = sentence[i_loc]
        i = i_loc - cur_loc
        features.extend([
        f'word{i}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{i}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{i}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{i}.dep=' + word.dep_,                                             # dependency dependent
        f'word{i}.head=' + word.head.orth_,                                      # dependency head
        f'word{i}.isupper={word.orth_.isupper()}',                            # is the word in all uppercase
        f'word{i}.isdigit={word.orth_.isdigit()}',                            # is the word a number
        f'word{i}.startsWithCapital={word.orth_[0].isupper()}'])               # is the word starting with a capital letter
        
    if(cur_loc == 0):
        features.append('BEG')                                                # feature to track begin of sentence 
 
    elif(cur_loc == end_loc - 1):
        features.append('END')                                                # feature to track end of sentence
 
    return features

In [15]:
# 2) Previous, current and next word features
def getFeaturesForOneWord(cur_loc, sentence):
    end_loc = len(sentence) - 1
    # Obtaining features for current word
    word = sentence[cur_loc]
 
    features = [
    f'word{0}.lower=' + word.orth_.lower(),                                  # serves as word id
    f'word{0}.postag=' + word.pos_,                                          # PoS tag of current word
    f'word{0}[-3:]=' + word.orth_[-3:],                                      # last three characters
    f'word{0}.dep=' + word.dep_,                                             # dependency dependent
    f'word{0}.head=' + word.head.orth_,                                      # dependency head
    f'word{0}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
    f'word{0}.isdigit={word.orth_.isdigit()}',                               # is the word a number
    f'word{0}.startsWithCapital={word.orth_[0].isupper()}']                  # is the word starting with a capital letter

        
    if(cur_loc > 0):
        word = sentence[cur_loc - 1]
        features.extend([
        f'word{-1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{-1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{-1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{-1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{-1}.head=' + word.head.orth_,                                      # dependency head
        f'word{-1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{-1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{-1}.startsWithCapital={word.orth_[0].isupper()}'])                # is the word starting with a capital letter
        
    
    else:
        features.append('BEG')                                          # feature to track begin of sentence 
 
    if(cur_loc + 1 < end_loc):
        word = sentence[cur_loc + 1]
        features.extend([
        f'word{1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{1}.head=' + word.head.orth_,                                      # dependency head
        f'word{1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{1}.startsWithCapital={word.orth_[0].isupper()}'])                  # is the word starting with a capital letter

    else:
        features.append('END')                                                # feature to track end of sentence

    return features

In [22]:
# 3) Previous and current word features
def getFeaturesForOneWord(cur_loc, sentence):
    end_loc = len(sentence) - 1
    # Obtaining features for current word
    word = sentence[cur_loc]

    features = [
    f'word{0}.lower=' + word.orth_.lower(),                                  # serves as word id
    f'word{0}.postag=' + word.pos_,                                          # PoS tag of current word
    f'word{0}[-3:]=' + word.orth_[-3:],                                      # last three characters
    f'word{0}.dep=' + word.dep_,                                             # dependency dependent
    f'word{0}.head=' + word.head.orth_,                                      # dependency head
    f'word{0}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
    f'word{0}.isdigit={word.orth_.isdigit()}',                               # is the word a number
    f'word{0}.startsWithCapital={word.orth_[0].isupper()}']                  # is the word starting with a capital letter

        
    if(cur_loc > 0):
        word = sentence[cur_loc - 1]
        features.extend([
        f'word{-1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{-1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{-1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{-1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{-1}.head=' + word.head.orth_,                                      # dependency head
        f'word{-1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{-1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{-1}.startsWithCapital={word.orth_[0].isupper()}'])                # is the word starting with a capital letter
        
    
    else:
        features.append('BEG')                                                # feature to track begin of sentence 

    if(cur_loc == end_loc):
        features.append('END')                                                # feature to track end of sentence

    return features

# Prepare data 

In [31]:
# Get features for a sentence.
def getFeaturesForOneSentence(sentence):
    sentence_parsing = model(sentence)
    return [getFeaturesForOneWord(ii, sentence_parsing) for ii,token in enumerate(sentence_parsing)]

# code to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
    return labels.split()

In [37]:
train_labels[1]

'O O T O O O O O O D O D O O O O O O O O O'

In [39]:
# Checking feature extraction
example_sentence = train_sentences[1]
print(f'Example sentence: "{example_sentence}"\n')

features = getFeaturesForOneSentence(example_sentence)
print('Total features in the sentence:', len(features))
print('Example of features for the word "rates":')
features[2]

Example sentence: "CONCLUSION : Methylphenidate is effective in treating children with epilepsy and ADHD and safe in children who are seizure free ."

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.
Total features in the sentence: 21
Example of features for the word "rates":


['word-2.lower=conclusion',
 'word-2.postag=NOUN',
 'word-2[-3:]=ION',
 'word-2.dep=dep',
 'word-2.head=is',
 'word-2.isupper=True',
 'word-2.isdigit=False',
 'word-2.startsWithCapital=True',
 'word-1.lower=:',
 'word-1.postag=PUNCT',
 'word-1[-3:]=:',
 'word-1.dep=punct',
 'word-1.head=CONCLUSION',
 'word-1.isupper=False',
 'word-1.isdigit=False',
 'word-1.startsWithCapital=False',
 'word0.lower=methylphenidate',
 'word0.postag=NOUN',
 'word0[-3:]=ate',
 'word0.dep=nsubj',
 'word0.head=is',
 'word0.isupper=False',
 'word0.isdigit=False',
 'word0.startsWithCapital=True',
 'word1.lower=is',
 'word1.postag=AUX',
 'word1[-3:]=is',
 'word1.dep=ROOT',
 'word1.head=is',
 'word1.isupper=False',
 'word1.isdigit=False',
 'word1.startsWithCapital=False',
 'word2.lower=effective',
 'word2.postag=ADJ',
 'word2[-3:]=ive',
 'word2.dep=acomp',
 'word2.head=is',
 'word2.isupper=False',
 'word2.isdigit=False',
 'word2.startsWithCapital=False',
 'word3.lower=in',
 'word3.postag=ADP',
 'word3[-3:]=in',
 

In [40]:
p = ''
for i in zip(features[2], train_labels[1]):
    p.

word-2.lower=conclusion
word-2.postag=NOUN
word-2[-3:]=ION
word-2.dep=dep
word-2.head=is
word-2.isupper=True
word-2.isdigit=False
word-2.startsWithCapital=True
word-1.lower=:
word-1.postag=PUNCT
word-1[-3:]=:
word-1.dep=punct
word-1.head=CONCLUSION
word-1.isupper=False
word-1.isdigit=False
word-1.startsWithCapital=False
word0.lower=methylphenidate
word0.postag=NOUN
word0[-3:]=ate
word0.dep=nsubj
word0.head=is
word0.isupper=False
word0.isdigit=False
word0.startsWithCapital=True
word1.lower=is
word1.postag=AUX
word1[-3:]=is
word1.dep=ROOT
word1.head=is
word1.isupper=False
word1.isdigit=False
word1.startsWithCapital=False
word2.lower=effective
word2.postag=ADJ
word2[-3:]=ive
word2.dep=acomp
word2.head=is
word2.isupper=False
word2.isdigit=False
word2.startsWithCapital=False
word3.lower=in
word3.postag=ADP
word3[-3:]=in
word3.dep=prep
word3.head=effective
word3.isupper=False
word3.isdigit=False
word3.startsWithCapital=False
word4.lower=treating
word4.postag=VERB
word4[-3:]=ing
word4.dep=pco

In [25]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]

Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]

# build the CRF Classifier

In [26]:
crf = sklearn_crfsuite.CRF(max_iterations=300)

try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass
    
predictions = crf.predict(X_test)

# Evaluation

In [27]:
Y_pred = crf.predict(X_test)

In [14]:
print('Using oprtion (1) - "Range of SENT_RANGE word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))

Using oprtion (1) - "Range of SENT_RANGE word features":
Weighted F1: 0.9222604008175131
Macro F1: 0.7947574532912421


In [21]:
print('Using oprtion (2) - "Previous, current and next word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))

Using oprtion (2) - "Previous, current and next word features":
Weighted F1: 0.9138709791733991
Macro F1: 0.7690004082496799


In [28]:
print('Using oprtion (3) - "Previous and current word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))

Using oprtion (3) - "Previous and current word features":
Weighted F1: 0.9051708510330074
Macro F1: 0.7440366657060755


# Extract all relations

In [296]:
all_sentences = train_sentences + test_sentences
all_sentences_string = train_sentences + test_sentences
all_sentences = [i.split() for i in all_sentences]
all_labels = train_labels + test_labels
all_labels = [i.split() for i in all_labels]
condition_treatment_evidence = {'condition':[], 'treatment':[], 'evidence':[]}            # Initializing an empty dictionary


for i in range(len(all_labels)):


        #print(test_sentences[i])
        cnt_disease = 0           # Count of number of diseases mentioned in the sentence
        cnt_treatment = 0         # Count of the number of treatments mentioned in the sentence
        diseases = [""]           # Initializing a blank list of diseases for current sentence.
        treatment = [""]          # Initializing a blank list of treatments for current sentence.
        sentence_number = [""]
        evidence = [""]
        
        length = len(all_labels[i])   # Length of current sentence.
        for j in range(length):
            if (all_labels[i][j] == 'D'):                                                     # Checking for label indicating disease for current word ('D')
                diseases[cnt_disease] += (all_sentences[i][j] + " ")            # Adding word to diseases list.
                if j < length - 1:
                    if (all_labels[i][j+1] != 'D'):                                           # Check for name of disease extending over multiple words. 
                        # If next word does not have label 'D', then truncate the space added at the end of the last word.
                        diseases[cnt_disease] = diseases[cnt_disease][:-1]
                        cnt_disease += 1
                        diseases.append("")                                               # Adding a placeholder for the next disease in the current sentence.
                else:
                    diseases[cnt_disease] = diseases[cnt_disease][:-1]
                    cnt_disease += 1
                    diseases.append("")
                                
            if (all_labels[i][j] == 'T'):                                                     # Checking for label indicating treatment for current word ('T')
                treatment[cnt_treatment] += (all_sentences[i][j] + " ") # Adding word to corresponding treatment list.
                if j < length - 1:
                    if (all_labels[i][j+1] != 'T'):                                           # Check for name of treatment extending over multiple words. 
                        # If next word does not have label 'T', then truncate the space added at the end of the last word.
                        treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                        cnt_treatment += 1
                        treatment.append("")                                              # Adding a placeholder for the next treatment in the current sentence.
                else:
                    treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                    cnt_treatment += 1
                    treatment.append("")

        diseases.pop(-1)    # Getting rid of the last empty placeholder in diseases list
        treatment.pop(-1)   # Getting rid of the last empty placeholder in treatments list
        if cnt_disease and cnt_treatment:
            for i_deases in range(cnt_disease):
                for j in range(cnt_treatment):             
                    condition_treatment_evidence['condition'].append(diseases[i_deases])            
                    condition_treatment_evidence['treatment'].append(treatment[j])
                    condition_treatment_evidence['evidence'].append(all_sentences_string[i])
  
# Create the pandas DataFrame
df = pd.DataFrame(condition_treatment_evidence)
  
# # save a .json file
df.to_json(r'relation2.jsonl',orient = 'records', lines = 'True')

# Train on all the data for prediction on new model

In [298]:
tic = time()

all_sentences = train_sentences + test_sentences
all_labels = train_labels + test_labels
all_sentences_f = [getFeaturesForOneSentence(sentence) for sentence in all_sentences]
predictions = crf.predict(all_sentences_f)
toc = time()

print('Process time(minutes): ',round(((toc - tic)/60), 4))

Process time(minutes):  9.956


# Predicting on new data

In [226]:
condition_treatment_evidence = {'condition':[], 'treatment':[], 'evidence':[]}            # Initializing an empty dictionary


for i in range(len(Y_pred)):


        #print(test_sentences[i])
        cnt_disease = 0           # Count of number of diseases mentioned in the sentence
        cnt_treatment = 0         # Count of the number of treatments mentioned in the sentence
        diseases = [""]           # Initializing a blank list of diseases for current sentence.
        treatment = [""]          # Initializing a blank list of treatments for current sentence.
        sentence_number = [""]
        evidence = [""]
        
        length = len(Y_pred[i])   # Length of current sentence.
        for j in range(length):
            if (Y_pred[i][j] == 'D'):                                                     # Checking for label indicating disease for current word ('D')
                diseases[cnt_disease] += (X_test[i][j][0].split('=')[1] + " ")            # Adding word to diseases list.
                if j < length - 1:
                    if (Y_pred[i][j+1] != 'D'):                                           # Check for name of disease extending over multiple words. 
                        # If next word does not have label 'D', then truncate the space added at the end of the last word.
                        diseases[cnt_disease] = diseases[cnt_disease][:-1]
                        cnt_disease += 1
                        diseases.append("")                                               # Adding a placeholder for the next disease in the current sentence.
                else:
                    diseases[cnt_disease] = diseases[cnt_disease][:-1]
                    cnt_disease += 1
                    diseases.append("")
                                
            if (Y_pred[i][j] == 'T'):                                                     # Checking for label indicating treatment for current word ('T')
                treatment[cnt_treatment] += (X_test[i][j][0].split('=')[1] + " ") # Adding word to corresponding treatment list.
                if j < length - 1:
                    if (Y_pred[i][j+1] != 'T'):                                           # Check for name of treatment extending over multiple words. 
                        # If next word does not have label 'T', then truncate the space added at the end of the last word.
                        treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                        cnt_treatment += 1
                        treatment.append("")                                              # Adding a placeholder for the next treatment in the current sentence.
                else:
                    treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                    cnt_treatment += 1
                    treatment.append("")

        diseases.pop(-1)    # Getting rid of the last empty placeholder in diseases list
        treatment.pop(-1)   # Getting rid of the last empty placeholder in treatments list
        if cnt_disease and cnt_treatment:
            for i_deases in range(cnt_disease):
                for j in range(cnt_treatment):             
                    condition_treatment_evidence['condition'].append(diseases[i_deases])            
                    condition_treatment_evidence['treatment'].append(treatment[j])
                    condition_treatment_evidence['evidence'].append(test_sentences[i])


# cleaned_condition_treatment_evidence = {'condition':[], 'treatment':[], 'evidence':[]}
# print('the number of detected medical pairs plus evidence is:', len(cleaned_condition_treatment_evidence['evidence']))

# Import pandas library
import pandas as pd

  
# Create the pandas DataFrame
df = pd.DataFrame(condition_treatment_evidence)
  
# print dataframe.
df

# save a .json file
#df.to_json(r'/content/drive/MyDrive/REL_Medical/relation2.jsonl',orient = 'records', lines = 'True')


Unnamed: 0,condition,treatment,evidence
0,advanced renal cell carcinoma,various interferon alpha preparations,Studies with various interferon alpha preparat...
1,advanced renal cell carcinoma,"interferon alfa-n1 , interferon alfa-2a",Studies with various interferon alpha preparat...
2,low-grade non-hodgkin 's lymphoma,interferon alpha,Recombinant and natural forms of interferon al...
3,low-grade non-hodgkin 's lymphomas,interferon and various cytotoxic drugs,This approach is being extended to the clinic ...
4,locally advanced squamous cell carcinoma of th...,"docetaxel , cisplatin , fluorouracil ( 5-fu ) ...","PURPOSE : A phase I/II trial of docetaxel , ci..."
...,...,...,...
83,gangrenous and perforated appendicitis,imipenem/cilistatin,Ticarcillin/clavulanate versus imipenem/cilist...
84,acute coronary syndromes,antithrombotic therapy,Issues and challenges with antithrombotic ther...
85,laser-thermal angioplasty,balloon angioplasty,Reduction of vasoreactivity and thrombogenicit...
86,epithelial ovarian cancer,high-dose chemotherapy with autologous stem-ce...,High-dose chemotherapy with autologous stem-ce...


# Extracting the medical relations

In [57]:
# determine the number of test sentence with T or D or both T and D label.
num_sent_with_D_T = 0
for i in range(len(Y_test)):

  string = ''.join([str(item) for item in Y_test[i]])
  num_T = string.count('T')
  num_D = string.count('D')

  if num_T >= 1 :
    if num_D >= 1 :

      num_sent_with_D_T = num_sent_with_D_T + 1
      #print(i)

print('the number of test sentence with both T and D actual label is :', num_sent_with_D_T)

the number of test sentence with both T and D actual label is : 297


In [218]:
condition_treatment_evidence = {'condition':[], 'treatment':[], 'evidence':[]}            # Initializing an empty dictionary


for i in range(len(Y_pred)):


        #print(test_sentences[i])
        cnt_disease = 0           # Count of number of diseases mentioned in the sentence
        cnt_treatment = 0         # Count of the number of treatments mentioned in the sentence
        diseases = [""]           # Initializing a blank list of diseases for current sentence.
        treatment = [""]          # Initializing a blank list of treatments for current sentence.
        sentence_number = [""]
        evidence = [""]
        
        length = len(Y_pred[i])   # Length of current sentence.
        for j in range(length):
            if (Y_pred[i][j] == 'D'):                                                     # Checking for label indicating disease for current word ('D')
                diseases[cnt_disease] += (X_test[i][j][0].split('=')[1] + " ")            # Adding word to diseases list.
                if j < length - 1:
                    if (Y_pred[i][j+1] != 'D'):                                           # Check for name of disease extending over multiple words. 
                        # If next word does not have label 'D', then truncate the space added at the end of the last word.
                        diseases[cnt_disease] = diseases[cnt_disease][:-1]
                        cnt_disease += 1
                        diseases.append("")                                               # Adding a placeholder for the next disease in the current sentence.
                else:
                    diseases[cnt_disease] = diseases[cnt_disease][:-1]
                    cnt_disease += 1
                    diseases.append("")
                                
            if (Y_pred[i][j] == 'T'):                                                     # Checking for label indicating treatment for current word ('T')
                treatment[cnt_treatment] += (X_test[i][j][0].split('=')[1] + " ") # Adding word to corresponding treatment list.
                if j < length - 1:
                    if (Y_pred[i][j+1] != 'T'):                                           # Check for name of treatment extending over multiple words. 
                        # If next word does not have label 'T', then truncate the space added at the end of the last word.
                        treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                        cnt_treatment += 1
                        treatment.append("")                                              # Adding a placeholder for the next treatment in the current sentence.
                else:
                    treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                    cnt_treatment += 1
                    treatment.append("")

        diseases.pop(-1)    # Getting rid of the last empty placeholder in diseases list
        treatment.pop(-1)   # Getting rid of the last empty placeholder in treatments list
        condition_treatment_evidence['condition'].append(diseases)
        condition_treatment_evidence['treatment'].append(treatment)
        condition_treatment_evidence['evidence'].append(test_sentences[i])


cleaned_condition_treatment_evidence = {'condition':[], 'treatment':[], 'evidence':[]}

for i in range(len(test_sentences)):
    conditions = condition_treatment_evidence['condition'][i]
    treatments = condition_treatment_evidence['treatment'][i]
    evidence = condition_treatment_evidence['evidence'][i]
    both_list_ind = False
    single_value_list_ind = False

    if conditions != [] and treatments != []:
        if 1 < len(treatments[0]) and 1 == len(conditions[0]):
            list1 = treatments
            list_name = 'treatment'
            value = conditions
            value_name = 'condition' 
        elif 1 < len(conditions[0]) and 1 == len(treatments[0]):
            list1 = conditions
            list_name = 'condition' 
            value = treatments
            value_name = 'treatment'

        elif 1 == len(conditions[0]) and 1 == len(treatments[0]):
            single_value_list_ind = True
        else:
            both_list_ind = True

        if both_list_ind:
            print(conditions)
            print(len(conditions[0]),conditions,conditions[0])
            break
        elif single_value_list_ind:
              cleaned_condition_treatment_evidence['condition'].append(conditions)
              cleaned_condition_treatment_evidence['treatment'].append(treatments)
              cleaned_condition_treatment_evidence['evidence'].append(evidence)            
        else:
            for list_val in list1:
              cleaned_condition_treatment_evidence[list_name].append(list_val)
              cleaned_condition_treatment_evidence[value_name].append(value)
              cleaned_condition_treatment_evidence['evidence'].append(evidence)

print('the number of detected medical pairs plus evidence is:', len(cleaned_condition_treatment_evidence['evidence']))

# Import pandas library
import pandas as pd

  
# Create the pandas DataFrame
df = pd.DataFrame(cleaned_condition_treatment_evidence)
  
# print dataframe.
df

# save a .json file


['advanced renal cell carcinoma']
29 ['advanced renal cell carcinoma'] advanced renal cell carcinoma
the number of detected medical pairs plus evidence is: 0


Unnamed: 0,condition,treatment,evidence


In [31]:
disease_treatment = {}            # Initializing an empty dictionary
numsent_evidence = {}             # Initializing an empty dictionary
for i in range(len(Y_pred)):
    #print(test_sentences[i])
    cnt_disease = 0           # Count of number of diseases mentioned in the sentence
    cnt_treatment = 0         # Count of the number of treatments mentioned in the sentence
    diseases = [""]           # Initializing a blank list of diseases for current sentence.
    treatment = [""]          # Initializing a blank list of treatments for current sentence.
    sentence_number = [""]
    evidence = [""]
    
    length = len(Y_pred[i])   # Length of current sentence.
    for j in range(length):
        if (Y_pred[i][j] == 'D'):                                                     # Checking for label indicating disease for current word ('D')
            diseases[cnt_disease] += (X_test[i][j][0].split('=')[1] + " ")            # Adding word to diseases list.
            if j < length - 1:
                if (Y_pred[i][j+1] != 'D'):                                           # Check for name of disease extending over multiple words. 
                    # If next word does not have label 'D', then truncate the space added at the end of the last word.
                    diseases[cnt_disease] = diseases[cnt_disease][:-1]
                    cnt_disease += 1
                    diseases.append("")                                               # Adding a placeholder for the next disease in the current sentence.
            else:
                diseases[cnt_disease] = diseases[cnt_disease][:-1]
                cnt_disease += 1
                diseases.append("")
                            
        if (Y_pred[i][j] == 'T'):                                                     # Checking for label indicating treatment for current word ('T')
            treatment[cnt_treatment] += (X_test[i][j][0].split('=')[1] + " ") # Adding word to corresponding treatment list.
            if j < length - 1:
                if (Y_pred[i][j+1] != 'T'):                                           # Check for name of treatment extending over multiple words. 
                    # If next word does not have label 'T', then truncate the space added at the end of the last word.
                    treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                    cnt_treatment += 1
                    treatment.append("")                                              # Adding a placeholder for the next treatment in the current sentence.
            else:
                treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                cnt_treatment += 1
                treatment.append("")

    diseases.pop(-1)    # Getting rid of the last empty placeholder in diseases list
    treatment.pop(-1)   # Getting rid of the last empty placeholder in treatments list

    # To our dictionary, add or append treatments to the diseases identified from the current sentence, if any.
    if len(diseases) > 0:       # Checking if any diseases have been identified for the current sentence.

        sentence_number = i
        evidence = test_sentences[i]   
        numsent_evidence[sentence_number] = evidence
        for disease in diseases:
            if disease in disease_treatment.keys():
                # Extend treatment list if other treatments for the particular disease already exist
                disease_treatment[disease].extend(treatment)
            else:
                # Creating list of treatments for particular disease if it doesn not exist already.
                disease_treatment[disease] = treatment


In [None]:
# Obtaining a cleaned version of our "disease_treatment" dictionary

cleaned_dict = {"sentence_number_in_test_set" : [], "disease" : [], "treatment" : [], "evidence" : []}
for sentence_number, disease in zip(numsent_evidence.keys(),disease_treatment.keys()):
    if disease_treatment[disease] != []:
        treatments = disease_treatment[disease]
        if 1 < len(treatments[0]):
            for treatment in treatments:
                cleaned_dict["sentence_number_in_test_set"].append(sentence_number)
                cleaned_dict["disease"].append(disease)
                cleaned_dict["treatment"].append(treatment)
                cleaned_dict["evidence"].append(numsent_evidence[sentence_number])
        else:
                cleaned_dict["sentence_number_in_test_set"].append(sentence_number)
                cleaned_dict["disease"].append(disease)
                cleaned_dict["treatment"].append(treatments)
                cleaned_dict["evidence"].append(numsent_evidence[sentence_number])

# Import pandas library
import pandas as pd

  
# Create the pandas DataFrame
df = pd.DataFrame(cleaned_dict)
  
# print dataframe.
df

In [52]:
df.drop(['sentence_number_in_test_set'], axis=1, inplace=True)

In [55]:
df.rename(columns={'disease':'condition'}, inplace=True)

In [56]:
f = open('relation22.jsonl', 'w')
print(df.to_json(orient='records', lines=True), file=f, flush=False)
f.close()

In [54]:
df

Unnamed: 0,disease,treatment,evidence
0,macrosomic infants in gestational diabetes cases,good glycemic control,This study tested the hypothesis that to reduc...
1,cancer,organ transplantation and chemotherapy,CONTEXT : A mutation in the BRCA1 gene may con...
2,cancer,oral drugs,CONTEXT : A mutation in the BRCA1 gene may con...
3,cancer,chemotherapy,CONTEXT : A mutation in the BRCA1 gene may con...
4,cancer,matrix metalloproteinase inhibitors,CONTEXT : A mutation in the BRCA1 gene may con...
...,...,...,...
178,pertussis,vaccines,Thoracoscopy for empyema in children
179,temporomandibular joint arthropathy,arthroscopic treatment,Conventional treatments for non-Hodgkin 's lym...
180,acute colonic pseudo-obstruction,neostigmine,Antiplatelet therapy in acute cerebral ischemia
181,severe secondary peritonitis,surgical management,Interferon treatment of renal cell carcinoma
