In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
!pip install pycrf
!pip install sklearn-crfsuite

In [5]:
import spacy
import textwrap
import warnings
import json
import sklearn_crfsuite

In [6]:
from tqdm import tqdm
from sklearn_crfsuite import metrics

# Silence every Python warning for the rest of the session (this hides deprecation notices as well).
warnings.filterwarnings("ignore")
# Load spaCy's small English pipeline; the notebook uses it below to obtain POS tags and lemmas.
model = spacy.load("en_core_web_sm")

# **Data Preprocessing**
The dataset provided is in the form of one word per line. Let's understand the format of data below:

Suppose there are x words in a sentence, then there will be x continuous lines with one word in each line.
Consecutive sentences are separated by an empty line. The labels for the data follow the same format.
We need to pre-process the data to recover the complete sentences and their labels.

In [7]:
# Function to read the file if given filename
def read_file(file_name):
    """Rebuild whole sentences from a one-token-per-line file.

    Each non-empty line holds one token; an empty line closes the current
    sentence. The same format is used for the word files and the label files,
    so a "sentence" is either a space-joined run of words or of labels.

    Parameters
    ----------
    file_name : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    list of str
        One space-joined string per sentence, in file order.

    Notes
    -----
    Prints the number of lines, tokens and sentences, plus the first sentence
    (when there is one) as a quick sanity check.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        content = file.readlines()

    sentences = []
    current_words = []
    word_count = 0

    for line in content:
        word = line.strip('\n')
        if word == "":
            # An empty line terminates the sentence collected so far.
            sentences.append(" ".join(current_words).rstrip(" "))
            current_words = []
        else:
            word_count += 1
            current_words.append(word)

    # A file that does not end with an empty line would otherwise lose its last sentence.
    if current_words:
        sentences.append(" ".join(current_words).rstrip(" "))

    print("Items in File       : ", len(content))
    print("Number of Words     : ", word_count)
    print("Number of Sentences : ", len(sentences))

    # Only preview a sentence when the file actually contained one.
    if sentences:
        prefix = "First Sentence      :  "
        wrapper = textwrap.TextWrapper(initial_indent = prefix, width = 150, subsequent_indent = ' '*len(prefix))
        print(wrapper.fill(sentences[0]))

    return sentences

# Count the number of sentences in the processed train and test dataset

In [8]:
# Rebuild the training sentences; read_file prints its own summary under the heading.
print("Training Sentences", "------------------", sep="\n")
train_sentences = read_file("../input/medical-entity-recognition-ner/Dataset/train_sent")

# Two blank lines, then the same for the held-out test sentences.
print("\n", "Testing Sentences", "------------------", sep="\n")
test_sentences = read_file("../input/medical-entity-recognition-ner/Dataset/test_sent")

In [9]:
# Rebuild the per-sentence label strings for the training split.
print("Training Labels", "--------------", sep="\n")
train_labels = read_file("../input/medical-entity-recognition-ner/Dataset/train_label")

# Two blank lines, then the label strings for the test split.
print("\n", "Testing Labels", "--------------", sep="\n")
test_labels = read_file("../input/medical-entity-recognition-ner/Dataset/test_label")

In [10]:
def get_pos_tags(reviews, labels, tag = ""):
    """Expand sentences into parallel per-token lists with POS tag, lemma and label.

    Every whitespace-separated word is passed to the spaCy pipeline on its own.
    If spaCy splits a word into several tokens, each token gets its own row and
    inherits the label of the word it came from.

    Parameters
    ----------
    reviews : iterable of str
        Space-joined sentences.
    labels : iterable of str
        Space-joined label strings, parallel to ``reviews``.
    tag : str, optional
        Prefix for the sentence ids; sentences are numbered from 1
        (``tag + "1"``, ``tag + "2"``, ...).

    Returns
    -------
    tuple of five lists
        ``(sentence, pos, lemma, text, label)``, all of equal length, one entry
        per spaCy token.
    """
    sentence = []
    pos = []
    lemma = []
    text = []
    label = []

    # Words are analysed in isolation, so the result depends only on the word
    # string. Remember it per distinct word instead of re-running the pipeline
    # for every occurrence.
    analysed_words = {}

    for i, (review, review_labels) in enumerate(tqdm(zip(reviews, labels)), start=1):
        sentence_id = tag + str(i)
        # zip stops at the shorter side if words and labels differ in count.
        for word, review_label in zip(review.split(), review_labels.split()):
            tokens = analysed_words.get(word)
            if tokens is None:
                tokens = [(tok.pos_, tok.lemma_, tok.text) for tok in model(word)]
                analysed_words[word] = tokens

            for tok_pos, tok_lemma, tok_text in tokens:
                sentence.append(sentence_id)
                pos.append(tok_pos)
                lemma.append(tok_lemma)
                text.append(tok_text)
                label.append(review_label)

    return sentence, pos, lemma, text, label

# Build the token-level table for the training split: one row per spaCy token,
# keyed by a sentence id of the form "train_<n>".
print("Training Sentences")
print("------------------")
train_sentence, train_pos, train_lemma, train_text, train_label = get_pos_tags(train_sentences, train_labels, "train_")
train_frequency_df = pd.DataFrame({'sentence':train_sentence, 'text':train_text,'lemma':train_lemma,'pos':train_pos,'label':train_label})

# Same for the test split, with sentence ids of the form "test_<n>".
print("\n")
print("Testing Sentences")
print("------------------")
test_sentence, test_pos, test_lemma, test_text, test_label = get_pos_tags(test_sentences, test_labels, "test_")
test_frequency_df = pd.DataFrame({'sentence':test_sentence, 'text':test_text,'lemma':test_lemma,'pos':test_pos,'label':test_label})

# Stack both splits into one frame. Each split keeps its own row labels, so index
# values repeat across splits; rows are told apart by the 'sentence' column.
frequency_df = pd.concat((train_frequency_df.copy(), test_frequency_df.copy()),axis=0)

In [11]:
# The 25 most common surface forms among tokens tagged NOUN or PROPN
frequency_df.loc[frequency_df['pos'].isin(['NOUN', 'PROPN']), 'text'].value_counts().head(25)

In [12]:
# The 25 most common lemmas among tokens tagged NOUN or PROPN
frequency_df.loc[frequency_df['pos'].isin(['NOUN', 'PROPN']), 'lemma'].value_counts().head(25)

# Defining features for CRF
We have defined the following features for building the CRF model:

f1 = input word is in lower case;
f2 = last 3 characters of word;
f3 = last 2 characters of word;
f4 = 1; if the word is in uppercase; otherwise, 0
f5 = 1; if the word is a number; otherwise, 0
f6 = 1; if the word starts with a capital letter; otherwise, 0
f7 = 1; if PoS Tag of the word is Noun (NOUN) or Proper Noun (PROPN); otherwise, 0
f8 = 1; if PoS Tag of the word is Noun (NOUN) or Proper Noun (PROPN); otherwise, 0
f9 = B; if beginning
f10 = E; if ending

In [13]:
# Let's define the features to get the feature value for one word.
def _token_features(prefix, word, postag):
    """Return the feature strings of one token, each feature name prefixed with `prefix`."""
    return [
        prefix + 'word.lower=' + word.lower(),
        # Suffix features: the LAST three / two characters of the word.
        prefix + 'word[-3]=' + word[-3:],
        prefix + 'word[-2]=' + word[-2:],
        prefix + 'word.islower=%s' % word.islower(),
        prefix + 'word.isupper=%s' % word.isupper(),
        prefix + 'word.istitle=%s' % word.istitle(),
        prefix + 'word.isdigit=%s' % word.isdigit(),
        prefix + 'postag=' + postag,
        prefix + 'postag.isnounpronoun=%s' % (postag in ['NOUN','PROPN']),
    ]


def getFeaturesForOneWord(word_details, pos):
    """Build the CRF feature list for the token at position `pos` of a sentence.

    Parameters
    ----------
    word_details : pandas.Series
        One ``(text, pos_tag)`` tuple per token of the sentence, in order.
        Index labels are irrelevant; tokens are addressed by position and the
        Series is not modified.
    pos : int
        Zero-based position of the token inside the sentence.

    Returns
    -------
    list of str
        A bias term, the features of the token itself, the features of the
        previous token (or the marker ``'BEG'`` for the first token) and the
        features of the next token (or the marker ``'END'`` for the last one).
    """
    word, postag = word_details.iloc[pos][0], word_details.iloc[pos][1]

    features = ['bias=' + "1.0"]
    features.extend(_token_features('', word, postag))

    if (pos > 0):
        prev_word, prev_postag = word_details.iloc[pos-1][0], word_details.iloc[pos-1][1]
        features.extend(_token_features('prev_', prev_word, prev_postag))
    else:
        features.append('BEG')

    if (pos < len(word_details) - 1):
        next_word, next_postag = word_details.iloc[pos+1][0], word_details.iloc[pos+1][1]
        features.extend(_token_features('next_', next_word, next_postag))
    else:
        features.append('END')

    return features

## Getting the features
Write a code/function to get the features for a sentence

In [14]:
# Write a code to get features for a sentence.
def get_word_details(item):
    """Pick the surface text and the POS tag out of one token row."""
    token_text = item["text"]
    token_pos = item["pos"]
    return token_text, token_pos

def getFeaturesForOneSentence(sentence_id):
    """Return one feature list per token of the sentence with the given id.

    The tokens are looked up in the notebook-level ``frequency_df`` frame.
    """
    in_sentence = frequency_df["sentence"] == sentence_id
    words_for_features = frequency_df[in_sentence].apply(get_word_details, axis=1)

    sentence_features = []
    for pos in range(len(words_for_features)):
        sentence_features.append(getFeaturesForOneWord(words_for_features, pos))
    return sentence_features

In [15]:
# Sanity check: show the first training sentence followed by the feature list of each of its tokens.
features = getFeaturesForOneSentence("train_1")
prefix = "01 Sentence : "
# Wrap long output at 150 columns, aligning continuation lines under the text after the prefix.
wrapper = textwrap.TextWrapper(initial_indent = prefix, width = 150, subsequent_indent = ' '*len(prefix))
print(wrapper.fill(train_sentences[0]))
print('\n')

# Print one wrapped block per token, numbered from 01.
i = 1
for feature in features:
    prefix = str('%02d' % i) + " Word     : "
    wrapper = textwrap.TextWrapper(initial_indent = prefix, width = 150, subsequent_indent = ' '*len(prefix))
    print(wrapper.fill(str(feature)))
    i += 1

In [16]:
# Write a code to get the labels for a sentence.
def getLabelsForOneSentence(sentence_id):
    """Return the label column of all token rows that belong to the given sentence id.

    The rows are looked up in the notebook-level ``frequency_df`` frame.
    """
    belongs_to_sentence = frequency_df["sentence"] == sentence_id
    return frequency_df.loc[belongs_to_sentence, "label"]

In [17]:
# Sanity check: print the labels of the first training sentence on one wrapped line.
labels = getLabelsForOneSentence("train_1")

prefix = "01 Labels  : "
wrapper = textwrap.TextWrapper(initial_indent = prefix, width = 150, subsequent_indent = ' '*len(prefix))
print(wrapper.fill(" ".join(labels)))

### Define input and target variables
Correctly computing X and Y sequence matrices for training and test data. Check that both sentences and labels are processed

Define the features' values for each sentence as input variable for CRF model in test and the train dataset

In [18]:
# Feature sequences for every training sentence; sentence ids are 1-based ("train_1", "train_2", ...).
print("Training Sentences", "------------------", sep="\n")
X_train = [
    getFeaturesForOneSentence("train_" + str(number))
    for number in tqdm(range(1, len(train_sentences) + 1))
]

# Feature sequences for every test sentence ("test_1", "test_2", ...).
print("\n", "Testing Sentences", "------------------", sep="\n")
X_test = [
    getFeaturesForOneSentence("test_" + str(number))
    for number in tqdm(range(1, len(test_sentences) + 1))
]

In [19]:
# Label sequences for every training sentence, in the same order as X_train.
print("Training Labels", "------------------", sep="\n")
Y_train = [
    getLabelsForOneSentence("train_" + str(number))
    for number in tqdm(range(1, len(train_labels) + 1))
]

# Label sequences for every test sentence, in the same order as X_test.
print("\n", "Testing Labels", "------------------", sep="\n")
Y_test = [
    getLabelsForOneSentence("test_" + str(number))
    for number in tqdm(range(1, len(test_labels) + 1))
]

In [22]:
# Build the CRF model.
# c1 / c2 are the regularisation coefficients passed to the CRF trainer; training stops after
# at most 100 iterations. all_possible_transitions=True is forwarded to sklearn-crfsuite as is.
crf = sklearn_crfsuite.CRF(c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
try:
    crf.fit(X_train, Y_train)
except AttributeError as fit_error:
    # NOTE(review): presumably this guards against the known sklearn-crfsuite / scikit-learn
    # incompatibility that raises AttributeError around fit -- confirm with the installed versions.
    # Report the error instead of hiding it; if training really failed, predict() below will raise.
    print("crf.fit raised AttributeError (continuing):", fit_error)
predictions = crf.predict(X_test)

## Evaluation
Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier.

In [23]:
# Predict one label sequence per test sentence, in the same order as X_test.
Y_pred = crf.predict(X_test)

## Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [24]:
# F1 with average='weighted', computed by sklearn-crfsuite's flat_f1_score on the true vs. predicted label sequences.
f1_score = metrics.flat_f1_score(Y_test, Y_pred, average='weighted')
# Report the score as a percentage rounded to two decimals.
print('Predicted F1-Score : {0} % '.format(round(f1_score*100,2)))

Identifying Diseases and Treatments using Custom NER
We now use the CRF model's prediction to prepare a record of diseases identified in the corpus and treatments used for the diseases.

In [25]:
def get_labels_as_array(labels):
    """Flatten a sequence of per-sentence label sequences into one flat list."""
    return [token_label for sentence_labels in labels for token_label in sentence_labels]

# Attach the flattened predictions as a new column of the test token table
# (requires exactly one predicted label per row), then preview the first rows.
test_frequency_df["pred_label"] = get_labels_as_array(Y_pred)
test_frequency_df.head(5)

In [26]:
# Keep only the tokens predicted as an entity (anything but 'O'), indexed by sentence id.
# set_index returns a new frame, so the filtered slice of test_frequency_df is never modified in place.
new_df = test_frequency_df[test_frequency_df.pred_label != 'O'].set_index('sentence')

# Maps a predicted disease phrase to the treatment phrase(s) predicted in the same sentence.
med_dict = {}

# sort=False keeps the sentences in order of first appearance, which fixes the order
# in which treatments of a repeated disease are concatenated.
for sentence_id, entity_rows in new_df.groupby(level=0, sort=False):
    disease_tokens = entity_rows.loc[entity_rows.pred_label == 'D', 'text']
    treatment_tokens = entity_rows.loc[entity_rows.pred_label == 'T', 'text']

    # Only sentences that mention both a disease and a treatment yield a pair.
    # NOTE(review): assumes 'D' and 'T' are the only non-'O' labels -- confirm against the label files.
    if disease_tokens.empty or treatment_tokens.empty:
        continue

    disease_single = " ".join(disease_tokens)
    treatment_single = " ".join(treatment_tokens)

    if disease_single not in med_dict:
        med_dict[disease_single] = treatment_single
    else:
        # The same disease phrase showed up in an earlier sentence: keep every treatment, '/'-separated.
        med_dict[disease_single] = med_dict[disease_single] + '/' + treatment_single

print(json.dumps(dict(sorted(med_dict.items())), indent = 4))

In [27]:
input_sent = 'hereditary retinoblastoma'

# Tokenise and POS-tag the query with the spaCy pipeline.
input_model = model(input_sent)

input_text = [word.text for word in input_model]
input_pos = [word.pos_ for word in input_model]
# get_word_details only reads 'text' and 'pos'; the label column is a placeholder.
input_label = ['D'] * len(input_text)

details_sent = pd.DataFrame({'text':input_text, 'pos':input_pos,'label':input_label})
words_for_features = details_sent.apply(get_word_details, axis=1)

# One feature list per spaCy token (not per whitespace-separated word), so that
# features, tokens and predicted tags always line up.
test_sent = [getFeaturesForOneWord(words_for_features, i) for i in range(len(words_for_features))]
predicted_tags = crf.predict([test_sent])[0]

disease_tokens = []
treatments = []

for token, tag in zip(input_text, predicted_tags):
    if tag != 'D':
        continue

    disease_tokens.append(token)

    # Look up both the single token and the disease phrase collected so far;
    # each treatment is listed once even when both keys hit the same entry.
    for key in (token, " ".join(disease_tokens)):
        known_treatment = med_dict.get(key)
        if known_treatment is not None and known_treatment not in treatments:
            treatments.append(known_treatment)

disease = " ".join(disease_tokens)
treatment = ", ".join(treatments) if treatments else 'Not Available'

print('Identified Disease   :', disease)
print('Identified Treatment :', treatment)