# Name Entity Recognition Assignment

# Workspace set up: Import and Install useful packages.

In [14]:
#!pip install pycrf
#!pip install sklearn-crfsuite

import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
from tqdm import tqdm
from spacy import displacy

model = spacy.load("en_core_web_sm", disable=['ner'])

In [15]:
def form_sentence(wordlist, seperator):
    sentence = ""
    #print(wordlist)
    for count in range(len(wordlist)):
        if count == 0:
            sentence = wordlist[count]
        else:
            sentence = sentence + seperator +wordlist[count]
    return sentence

In [16]:
def load_input_data(filepath, seperator):
    with open(filepath, 'r') as file_handler:
        input_data = file_handler.readlines()
    wordlist = []
    sentences = []
    for word in input_data:
        #print(word)
        word = word.strip(seperator)
        #print(word)
        if word == '':
            #print(wordlist)
            if wordlist:
                sentences.append(form_sentence(wordlist=wordlist, seperator=' '))
                del wordlist
                wordlist = []
        else:
            #print(wordlist)
            wordlist.append(word)
    return sentences

In [67]:
def get_sent_pos_df(sentences, labels):
    sent_pos_df = pd.DataFrame(columns=['sent_idx', 'word', 'pos', 'lemma', 'label'])
    sent_idx = 0
    row = 0
    for sentence, label in tqdm(zip(sentences, labels)):
        #processed_doc = model(sentence)
        sent_idx+=1
        for word, char_lbl in zip(sentence.split(), label.split()):
            doc = model(word)
            for token in doc:
                sent_pos_df.loc[row, ['sent_idx', 'word', 'pos', 'lemma', 'label']] = [sent_idx, token.text, token.pos_, token.lemma_, char_lbl]
                row+=1
    return sent_pos_df

# Identifying Entities in Healthcare Data

### Construct the proper sentences from individual words and print the 5 sentences.

In [18]:
# Load the train sentences data and print the total sentneces and sample sentences
train_sents = load_input_data(filepath = 'train_sent',seperator = '\n')

In [19]:
print(len(train_sents))
print(train_sents[:5])

2599
['All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )', 'The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )', 'Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )', "The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )", "Arrest of dilation was the most common indication in both `` corrected '' subgroups ( 23.4 and 24.6 % , respectively )"]


In [20]:
# Load the train labels data and print the total labels and sample labels
train_labels = load_input_data(filepath = 'train_label',seperator = '\n')

In [21]:
print(len(train_labels))
print(train_labels[:5])

2599
['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O']


In [22]:
# Load the test sentences data and print the total sentneces and sample sentences
test_sents = load_input_data(filepath = 'test_sent', seperator = '\n')

In [23]:
print(len(test_sents))
print(test_sents[:5])

1056
['Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )', 'As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration', 'The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period', 'There was a significant correlation between the 2- , 3- , and 4-day mean temperature and AFI , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 )', 'Fluctuations in ambient temperature are inversely correlated to changes in AFI']


In [24]:
# Load the test labels data and print the total labels and sample labels
test_labels = load_input_data(filepath = 'test_label', seperator = '\n')

In [25]:
print(len(test_labels))
print(test_labels[:5])

1056
['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O']


In [26]:
#Check the POS tags for one of the sentence. 
doc = model(test_sents[46])
for token in doc:
    print(token.text, '--', token.dep_, '--', token.pos_)

< -- dep -- X
TO_SEE -- dep -- PROPN
> -- punct -- X
CONTEXT -- appos -- PROPN
: -- punct -- PUNCT
There -- expl -- PRON
is -- ROOT -- VERB
a -- det -- DET
substantial -- amod -- ADJ
risk -- attr -- NOUN
of -- prep -- ADP
a -- det -- DET
second -- amod -- ADJ
cancer -- pobj -- NOUN
for -- prep -- ADP
persons -- pobj -- NOUN
with -- prep -- ADP
hereditary -- amod -- ADJ
retinoblastoma -- pobj -- NOUN
, -- punct -- PUNCT
which -- nsubjpass -- PRON
is -- auxpass -- AUX
enhanced -- relcl -- VERB
by -- agent -- ADP
radiotherapy -- pobj -- NOUN


In [27]:
#Checking the tokenization to identify the rigt strategy to club the label, POS and sentence words
model.tokenizer.explain(train_sents[3])

[('TOKEN', 'The'),
 ('PREFIX', '`'),
 ('SUFFIX', '`'),
 ('TOKEN', 'corrected'),
 ('SPECIAL-1', "''"),
 ('TOKEN', 'cesarean'),
 ('TOKEN', 'rate'),
 ('PREFIX', '('),
 ('TOKEN', 'maternal'),
 ('INFIX', '-'),
 ('TOKEN', 'fetal'),
 ('TOKEN', 'medicine'),
 ('TOKEN', 'and'),
 ('TOKEN', 'transported'),
 ('TOKEN', 'patients'),
 ('TOKEN', 'excluded'),
 ('PREFIX', ')'),
 ('TOKEN', 'was'),
 ('TOKEN', '12.4'),
 ('PREFIX', '%'),
 ('PREFIX', '('),
 ('TOKEN', '273'),
 ('TOKEN', 'of'),
 ('TOKEN', '2194'),
 ('PREFIX', ')'),
 ('PREFIX', ','),
 ('TOKEN', 'and'),
 ('TOKEN', 'the'),
 ('PREFIX', '`'),
 ('SUFFIX', '`'),
 ('TOKEN', 'corrected'),
 ('SPECIAL-1', "''"),
 ('TOKEN', 'primary'),
 ('TOKEN', 'rate'),
 ('TOKEN', 'was'),
 ('TOKEN', '9.6'),
 ('PREFIX', '%'),
 ('PREFIX', '('),
 ('TOKEN', '190'),
 ('TOKEN', 'of'),
 ('TOKEN', '1975'),
 ('PREFIX', ')')]

### Count the number of sentences in the processed train and test dataset 

In [28]:
print('Total Train sentences are: ',len(train_sents))
print('Total Test sentences are: ',len(test_sents))

Total Train sentences are:  2599
Total Test sentences are:  1056


### Count the number of lines of labels in the processed train and test dataset.

In [29]:
print('Total Train labels are: ',len(train_labels))
print('Total Test labels are: ',len(test_labels))

Total Train labels are:  2599
Total Test labels are:  1056


#### Observations:


- <font color='blue'>There are total of 2599 sentences of medical records and corresponding labels in the train data set.</font>
- <font color='blue'>There are total of 1056 sentences of medical records and corresponding labels in the test data set.</font>
- <font color='blue'>The observation is that the double quaotes are processed in a strange way, this will impact the further relation of texts with corresponding labels.</font>


# Step 2: Concept Identification





## Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency



In [68]:
train_sp_df = get_sent_pos_df(train_sents, train_labels)
train_sp_df.head(5)

2599it [04:43,  9.15it/s]


Unnamed: 0,sent_idx,word,pos,lemma,label
0,1,All,ADV,all,O
1,1,live,VERB,live,O
2,1,births,NOUN,birth,O
3,1,>,PUNCT,>,O
4,1,or,CCONJ,or,O


In [69]:
test_sp_df = get_sent_pos_df(test_sents, test_labels)
test_sp_df.head(5)

1056it [01:22, 12.80it/s]


Unnamed: 0,sent_idx,word,pos,lemma,label
0,1,Furthermore,ADV,furthermore,O
1,1,",",PUNCT,",",O
2,1,when,SCONJ,when,O
3,1,all,PRON,all,O
4,1,deliveries,NOUN,delivery,O


### Print the top 25 most common tokens with NOUN or PROPN PoS tags

In [70]:
nound_propn_freq_df = pd.DataFrame()
nound_propn_freq_df = pd.concat((train_sp_df, test_sp_df), axis=0)
nound_propn_freq_df.reset_index(inplace = True, drop = True)
nound_propn_freq_df.head(5)

Unnamed: 0,sent_idx,word,pos,lemma,label
0,1,All,ADV,all,O
1,1,live,VERB,live,O
2,1,births,NOUN,birth,O
3,1,>,PUNCT,>,O
4,1,or,CCONJ,or,O


In [71]:
nound_propn_freq_df[(nound_propn_freq_df['pos'] == 'NOUN') | ((nound_propn_freq_df['pos'] == 'PROPN'))]['word'].value_counts()[:25]

patients        492
treatment       281
%               247
cancer          200
therapy         175
study           162
disease         143
cell            140
lung            116
group            94
gene             88
chemotherapy     88
effects          85
use              78
women            77
patient          75
risk             71
cases            71
surgery          71
analysis         70
rate             67
response         66
dose             66
survival         65
children         64
Name: word, dtype: int64

In [72]:
nound_propn_freq_df[(nound_propn_freq_df['pos'] == 'NOUN') | ((nound_propn_freq_df['pos'] == 'PROPN'))]['lemma'].value_counts()[:25]

patient         587
treatment       316
%               247
study           227
cancer          226
cell            202
therapy         182
disease         172
effect          163
group           145
case            132
lung            120
gene            112
year            105
rate            103
trial            91
chemotherapy     91
woman            89
analysis         86
dose             83
protein          82
response         81
risk             78
use              78
child            78
Name: lemma, dtype: int64

#### Observations

- <font color='blue'>The combined (Train + Test) dataset has above frequency for the Noun and Proper noun PoS tags.

# Step 3: Defining features for CRF



In [1]:
# Let's define the features to get the feature value for one word.
def getFeaturesForOneWord(sentence, i):
    #print('****SENT: ', sentence)
    
    word = sentence[i][0]
    postag = sentence[i][1]
    
    #print('****WORD: ', word)
    #print('****VALUE_I: ', i)
    #print('####LEN(SENT) -- ', len(sentence))
    features = [
        'bias=1.0',
        'word.lower=' + word.lower(), # serves as word id
        'word[-3:]=' + word[-3:],     # last three characters
        'word[-2:]=' + word[-2:],     # last two characters
        'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
        'word.isdigit=%s' % word.isdigit(),  # is the word a number
        'words.startsWithCapital=%s' % word[0].isupper(), # is the word starting with a capital letter
        'postag=' + postag,
        'postag_isnounpronoun=%s' % (postag in ['NOUN','PROPN']),
        'postag[:2]=' + postag[:2]]
    
    if(i > 0):
        prev_word = sentence[i-1][0]
        prev_postag = sentence[i-1][1]
        features.extend([
            'prev_word.lower=' + prev_word.lower(), 
            'prev_word.isupper=%s' % prev_word.isupper(),
            'prev_word[-3:]=' + prev_word[-3:],     # last three characters
            'prev_word[-2:]=' + prev_word[-2:],     # last two characters
            'prev_word.isdigit=%s' % prev_word.isdigit(),
            'prev_words.startsWithCapital=%s' % prev_word[0].isupper(),
            'prev_postag=' + prev_postag,
            'prev_postag_isnounpronoun=%s' % (prev_postag in ['NOUN','PROPN']),
            'prev_postag[:2]=' + prev_postag[:2]])
    else:
        features.append('BEG') # feature to track begin of sentence 

    if i == len(sentence)-1:
        features.append('END')
#     if i < len(sentence)-1:
#         nxt_word = sentence[i+1][0]
#         nxt_postag = sentence[i+1][1]
#         features.extend([
#             'nxt_word.lower=' + nxt_word.lower(), 
#             'nxt_word.isupper=%s' % nxt_word.isupper(),
#             'nxt_word[-3:]=' + nxt_word[-3:],     # last three characters
#             'nxt_word[-2:]=' + nxt_word[-2:],     # last two characters
#             'nxt_word.isdigit=%s' % nxt_word.isdigit(),
#             'nxt_word.startsWithCapital=%s' % nxt_word[0].isupper(),
#             'nxt_postag=' + nxt_postag,
#             'nxt_postag_isnounpronoun=%s' % (nxt_postag in ['NOUN','PROPN']),
#             'nxt_postag[:2]=' + nxt_postag[:2]])
#     else:
#         print("END")
#         features.append('END') # feature to track end of sentence
# if i < len(sent)-1:
#         word1 = sent[i+1][0]
#         postag1 = sent[i+1][1]
#         features.update({
#             '+1:word.lower()': word1.lower(),
#             '+1:word.istitle()': word1.istitle(),
#             '+1:word.isupper()': word1.isupper(),
#             '+1:postag': postag1,
#             '+1:postag[:2]': postag1[:2],
#         })
#     else:
#         features['EOS'] = True
    
#     features = {
#         'bias': 1.0,
#         'word.lower()': word.lower(),
#         'word[0]': word[0],
#         'word[-1]': word[-1],
#         'word[-2:]': word[-2:],
#         'word.isupper()': word.isupper(),
#         'word.istitle()': word.istitle(),
#         'word.isdigit()': word.isdigit(),
#         'postag':postag,
#         'postag_isnounpronoun': postag in ['NOUN','PROPN'],
#         'postag[:2]': postag[:2],
#     }
#     if i > 0:
#         word1 = sentence[i-1][0]
#         postag1 = sentence[i-1][1]
#         features.update({
#             '-1:word.lower()': word1.lower(),
#             '-1:word[0]': word1[0],
#             '-1:word[-1]': word1[-1],
#             '-1:word.istitle()': word1.istitle(),
#             '-1:word.isupper()': word1.isupper(),
#             '-1:postag': postag1,
#             '-1:postag[:2]': postag1[:2],
#             '-1:postag_isnounpronoun': postag1 in ['NOUN','PROPN']
#         })
#     else:
#         features['BEG'] = True

#     if i < len(sentence)-1:
#         word1 = sentence[i+1][0]
#         postag1 = sentence[i+1][1]
#         features.update({
#             '+1:word.lower()': word1.lower(),
#             '+1:word.istitle()': word1.istitle(),
#             '+1:word.isupper()': word1.isupper(),
#             '+1:postag': postag1,
#             '+1:postag[:2]': postag1[:2],
#             '+1:postag_isnounpronoun': postag1 in ['NOUN','PROPN']
#         })
#     else:
#         features['END'] = True

    return features

#### Sentence formation Strategy 
- Grouping the data frame rows based on sentence and forming the tuples.
- The tuples will help to get the text and relative POS tags while mapping the features.

In [40]:
def grouped_sentences(df):
    agg_func = lambda s: [(w, p, l) for w, p, l in zip(s["word"].values.tolist(), s["pos"].values.tolist(),s["label"].values.tolist())]
    grouped_sents = df.groupby("sent_idx").apply(agg_func)
    return [s for s in grouped_sents]

In [41]:
train_grp_sents = grouped_sentences(train_sp_df)
train_grp_sents[18]

[('We', 'PRON', 'O'),
 ('conclude', 'VERB', 'O'),
 ('that', 'PRON', 'O'),
 ('to', 'ADP', 'O'),
 ('reduce', 'VERB', 'O'),
 ('the', 'PRON', 'O'),
 ('rate', 'NOUN', 'O'),
 ('of', 'ADP', 'O'),
 ('macrosomic', 'NOUN', 'D'),
 ('infants', 'NOUN', 'D'),
 ('in', 'ADP', 'D'),
 ('gestational', 'ADJ', 'D'),
 ('diabetes', 'NOUN', 'D'),
 ('cases', 'NOUN', 'D'),
 (',', 'PUNCT', 'O'),
 ('good', 'ADJ', 'T'),
 ('glycemic', 'ADJ', 'T'),
 ('control', 'NOUN', 'T'),
 ('should', 'AUX', 'O'),
 ('be', 'AUX', 'O'),
 ('initiated', 'VERB', 'O'),
 ('before', 'ADP', 'O'),
 ('34', 'NUM', 'O'),
 ('completed', 'VERB', 'O'),
 ('gestational', 'ADJ', 'O'),
 ('weeks', 'NOUN', 'O')]

In [42]:
test_grp_sents = grouped_sentences(test_sp_df)
test_grp_sents[68]

[('At', 'ADP', 'O'),
 ('293', 'NUM', 'O'),
 ('sites', 'NOUN', 'O'),
 (',', 'PUNCT', 'O'),
 ('we', 'PRON', 'O'),
 ('randomly', 'ADJ', 'O'),
 ('assigned', 'VERB', 'O'),
 ('8803', 'NUM', 'O'),
 ('patients', 'NOUN', 'O'),
 ('who', 'PRON', 'O'),
 ('had', 'VERB', 'O'),
 ('had', 'VERB', 'O'),
 ('myocardial', 'ADJ', 'D'),
 ('infarction', 'NOUN', 'D'),
 (',', 'PUNCT', 'O'),
 ('treatment', 'NOUN', 'O'),
 ('with', 'ADP', 'O'),
 ('160', 'NUM', 'T'),
 ('mg', 'PUNCT', 'T'),
 ('aspirin', 'NOUN', 'T'),
 (',', 'PUNCT', 'O'),
 ('3', 'X', 'T'),
 ('mg', 'PUNCT', 'T'),
 ('warfarin', 'PROPN', 'T'),
 ('with', 'ADP', 'T'),
 ('80', 'NUM', 'T'),
 ('mg', 'PUNCT', 'T'),
 ('aspirin', 'NOUN', 'T'),
 (',', 'PUNCT', 'O'),
 ('or', 'CCONJ', 'O'),
 ('1', 'X', 'T'),
 ('mg', 'PUNCT', 'T'),
 ('warfarin', 'PROPN', 'T'),
 ('with', 'ADP', 'T'),
 ('80', 'NUM', 'T'),
 ('mg', 'PUNCT', 'T'),
 ('aspirin', 'NOUN', 'T')]

# Step 4: Getting the features

#### Get the features

In [43]:
# Write a code to get features for a sentence.
def getFeaturesForOneSentence(sentence):
    return [getFeaturesForOneWord(sentence, i) for i in range(len(sentence))]

In [44]:
# Apply function 'getFeaturesForOneSentence' to get features on a single sentence which is at index value 5 in train_sentences
example_sentence = train_grp_sents[18]
print(example_sentence)

features = getFeaturesForOneSentence(example_sentence)
features[8]

[('We', 'PRON', 'O'), ('conclude', 'VERB', 'O'), ('that', 'PRON', 'O'), ('to', 'ADP', 'O'), ('reduce', 'VERB', 'O'), ('the', 'PRON', 'O'), ('rate', 'NOUN', 'O'), ('of', 'ADP', 'O'), ('macrosomic', 'NOUN', 'D'), ('infants', 'NOUN', 'D'), ('in', 'ADP', 'D'), ('gestational', 'ADJ', 'D'), ('diabetes', 'NOUN', 'D'), ('cases', 'NOUN', 'D'), (',', 'PUNCT', 'O'), ('good', 'ADJ', 'T'), ('glycemic', 'ADJ', 'T'), ('control', 'NOUN', 'T'), ('should', 'AUX', 'O'), ('be', 'AUX', 'O'), ('initiated', 'VERB', 'O'), ('before', 'ADP', 'O'), ('34', 'NUM', 'O'), ('completed', 'VERB', 'O'), ('gestational', 'ADJ', 'O'), ('weeks', 'NOUN', 'O')]


['bias=1.0',
 'word.lower=macrosomic',
 'word[-3:]=mic',
 'word[-2:]=ic',
 'word.isupper=False',
 'word.isdigit=False',
 'words.startsWithCapital=False',
 'postag=NOUN',
 'postag_isnounpronoun=True',
 'postag[:2]=NO',
 'prev_word.lower=of',
 'prev_word.isupper=False',
 'prev_word[-3:]=of',
 'prev_word[-2:]=of',
 'prev_word.isdigit=False',
 'prev_words.startsWithCapital=False',
 'prev_postag=ADP',
 'prev_postag_isnounpronoun=False',
 'prev_postag[:2]=AD']

#### Get the labels

In [45]:
# Write a code to get the labels for a sentence.
def getLabelsForOneSentence(sentence):
    return [label for word, postag, label in sentence]

In [46]:
getLabelsForOneSentence(example_sentence)

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'D',
 'D',
 'D',
 'D',
 'D',
 'D',
 'O',
 'T',
 'T',
 'T',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

# Step 5: Define input and target variables


### Define the features' values for each sentence as input variable  for CRF model in test and the train dataset 

In [47]:
X_train = [getFeaturesForOneSentence(s) for s in train_grp_sents]
X_test = [getFeaturesForOneSentence(s) for s in test_grp_sents]

### Define the labels as the target variable for test and the train dataset

In [48]:
Y_train = [getLabelsForOneSentence(s) for s in train_grp_sents]
Y_test = [getLabelsForOneSentence(s) for s in test_grp_sents]

# Step 6: Build the CRF Model

In [49]:
# Build the CRF model.
# Build the CRF model.
crf = sklearn_crfsuite.CRF(max_iterations=100)
#crf.fit(X_train, Y_train)

try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass

# Step 7: Evaluation

### Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier.

In [50]:
# Calculate the f1 score using the test data
Y_pred = crf.predict(X_test)
print(Y_pred)

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'D', 'D', 'D', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [51]:
metrics.flat_f1_score(Y_test, Y_pred, average='weighted')

0.9002243497879628

In [52]:
pred_label=[]
for i in Y_pred:
    pred_label.extend(i)

In [64]:
test_sp_df['pred_lbl'] = pred_label
test_sp_df[test_sp_df.sent_idx==21]

Unnamed: 0,sent_idx,word,pos,label,pred_lbl
547,21,Sequelae,PROPN,O,O
548,21,include,VERB,O,O
549,21,severe,ADJ,O,O
550,21,developmental,ADJ,O,O
551,21,delay,NOUN,O,O
552,21,and,CCONJ,O,O
553,21,asymmetric,PROPN,O,D
554,21,double,ADJ,O,D
555,21,hemiplegia,PROPN,D,D


#### Observations

- With the custom NER model that was built, achieved <font color='blue'><b>F1 score is 0.90</b></font>

# Step 8: Identifying Diseases and Treatments using Custom NER



In [54]:
dis_treat_df =test_sp_df[(test_sp_df['pred_lbl'] != 'O')]
dis_treat_df.head(10)

Unnamed: 0,sent_idx,word,pos,label,pred_lbl
154,6,gestational,ADJ,O,D
155,6,diabetes,NOUN,O,D
156,6,cases,NOUN,O,D
360,13,Trisomy,NOUN,D,T
376,13,nonimmune,PROPN,D,D
377,13,hydrops,NOUN,D,D
378,13,fetalis,PROPN,D,D
391,14,preeclampsia,NOUN,D,D
427,16,severe,ADJ,O,D
428,16,preeclampsia,NOUN,D,D


In [55]:
dis_treat_df.set_index('sent_idx',inplace=True)
dis_treat_df.head(10)

Unnamed: 0_level_0,word,pos,label,pred_lbl
sent_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,gestational,ADJ,O,D
6,diabetes,NOUN,O,D
6,cases,NOUN,O,D
13,Trisomy,NOUN,D,T
13,nonimmune,PROPN,D,D
13,hydrops,NOUN,D,D
13,fetalis,PROPN,D,D
14,preeclampsia,NOUN,D,D
16,severe,ADJ,O,D
16,preeclampsia,NOUN,D,D


In [56]:
disease=[]
treatment=[]
sentence=[]
med_dict = {}
for i in dis_treat_df.index.unique():
    try:
        val = dis_treat_df.loc[i,'pred_lbl'].unique()
        #print('#### VAl: ', val)
        #print('#### I: ', i)
        if len(val) >= 2:
            disease_val = dis_treat_df[dis_treat_df['pred_lbl'] == 'D'].loc[i,'word']
            treatment_val = dis_treat_df[dis_treat_df['pred_lbl'] == 'T'].loc[i,'word']
            disease_single = disease_val if type(disease_val) == str else " ".join(disease_val)
            treatment_single = treatment_val if type(treatment_val) == str else " ".join(treatment_val)
            if disease_single not in disease:
                med_dict[disease_single] = treatment_single
            else:
                print('Entered')
                med_dict[disease_single] = med_dict.get(disease_single)+'/'+treatment_single
    except AttributeError:
        pass

In [57]:
display(med_dict)

{'nonimmune hydrops fetalis': 'Trisomy',
 'hereditary retinoblastoma': 'radiotherapy',
 'unstable angina or non - Q - wave myocardial infarction': 'roxithromycin',
 'cellulitis': 'G - CSF therapy intravenous antibiotic treatment',
 'foot infection': 'G - CSF treatment',
 'cardiac disease': 'fenfluramine - phentermine',
 "early Parkinson 's disease": 'Ropinirole monotherapy',
 "exclude abdominal tuberculosis Crohn 's disease": 'steroids',
 'female stress urinary incontinence': 'surgical treatment',
 'stress urinary incontinence': 'therapy',
 'preeclampsia': 'intrauterine insemination with donor sperm versus intrauterine insemination',
 'nuclear genoma': 'mutagenic agents',
 'intra - abdominal injury': 'senior surgery celiotomy',
 'severe acquired hyperammonemia cancer': 'organ transplantation and chemotherapy',
 'major pulmonary embolism': 'Thrombolytic treatment',
 'malignant pleural mesothelioma': 'thoracotomy , radiotherapy , and chemotherapy',
 'tumor markers pulmonary symptoms': 'c

In [58]:
from IPython.display import display, HTML
pd.set_option("display.max_columns", None)
medi_df = pd.DataFrame(list(zip(list(med_dict.keys()), list(med_dict.values()))),
               columns =['Disease', 'Treatments'])
display(medi_df.style.background_gradient(cmap='PuBu'))

Unnamed: 0,Disease,Treatments
0,nonimmune hydrops fetalis,Trisomy
1,hereditary retinoblastoma,radiotherapy
2,unstable angina or non - Q - wave myocardial infarction,roxithromycin
3,cellulitis,G - CSF therapy intravenous antibiotic treatment
4,foot infection,G - CSF treatment
5,cardiac disease,fenfluramine - phentermine
6,early Parkinson 's disease,Ropinirole monotherapy
7,exclude abdominal tuberculosis Crohn 's disease,steroids
8,female stress urinary incontinence,surgical treatment
9,stress urinary incontinence,therapy


### Predict the treatment for the disease name: 'hereditary retinoblastoma'

In [59]:
#Predict treatment withthe help of dictionary
sent_pos_lbl=[]
disease=''
test_sent=[]
treatment=''

input_sent = 'hereditary retinoblastoma'
m = spacy.load('en_core_web_sm')
doc = m(input_sent)
for i in doc:
    sent_pos_lbl.append((i.text,i.pos_,'D'))
test_sent.append(getFeaturesForOneSentence(sent_pos_lbl))
counter = 0
for i,tag in enumerate(crf.predict(test_sent)[0]):
    #print(tag)
    if tag == 'D':
        counter+=1
        tr = input_sent.split()[i]
        if counter > 1:
            disease += ' '
        disease += tr
        #print('TR ', tr)
        if tr in med_dict:
            treatment += ''+med_dict.get(tr)

if len(treatment) == 0:
        treatment = med_dict[disease]

if len(treatment) == 0:        
        treatment='None'
        
print('Identified Disease: ',disease)
print('Identified Treatment: ', treatment)

Identified Disease:  hereditary retinoblastoma
Identified Treatment:  radiotherapy


#### Observation

- <font color='blue'><b>The correct treatment was successfully identified for the disease.</b></font>

# Epilogue

- The feature selection plays a vital role in the custom NER.
- The custom NER exercise has several loose ends like the correct mapping of diseases to treatment. The manual labelling of the string which is prone to human errors.