# Named Entity Recognition

# Bio-Entity Recognition Task at BioNLP/NLPBA 2004:
### Task Definition:
The task aims to identify and classify technical terms in the domain of molecular biology that correspond to instances of concepts that are of interest to biologists.
### Data:
The training data used in the task came from the GENIA version 3.02 corpus. This was formed from a controlled search on MEDLINE using the MeSH terms 'human', 'blood cells' and 'transcription factors'. From this search, 2,000 abstracts were selected and hand-annotated according to a small taxonomy of 48 classes based on a chemical classification. Among these classes, 36 terminal classes were used to annotate the GENIA corpus. For the shared task, however, the 36 classes were simplified to only five: protein, DNA, RNA, cell line and cell type.

### File Visualization:


In [None]:
import pandas as pd
from IPython.display import display_html

# Index of the sentence to display; try changing this value.
sentence_to_visualize = 0

# Read and decode the whole file, then release the handle before processing.
with open("./BetterDataset/data.train", 'rb') as file_handle:
    file_content = file_handle.read().decode('utf-8').strip()

# Normalise Windows line endings first so the blank-line sentence separator
# is found whether the file uses CRLF or LF.
annotated_sentences = file_content.replace('\r\n', '\n').split('\n\n')
sentence = annotated_sentences[sentence_to_visualize]
# Flatten the sentence on whitespace, then regroup the fields in pairs so
# each row holds one token and its entity tag.
sentence = sentence.split()
sentence = [sentence[i:i + 2] for i in range(0, len(sentence), 2)]
cols = ['Tokens', 'Entity']
df2 = pd.DataFrame(sentence, columns=cols)
df2_styler = df2.reset_index(drop=True).style.set_table_attributes("style='display:inline'").set_caption('Entities Distribution')
display_html(df2_styler._repr_html_(), raw=True)

# Task 1: Prepare the dataset for the model
### Read data from a CoNLL file:

In [None]:
import os
def read_conll(filename_end, corpus_root="./BetterDataset", sep='\t'):
    """Yield the sentences of every matching CoNLL-style file under a directory.

    The tree rooted at ``corpus_root`` is walked and every file whose name
    ends with ``filename_end`` is read as UTF-8. Sentences are separated by
    blank lines; each non-blank line is one token annotation whose columns
    are separated by ``sep``. Both CRLF and LF line endings are accepted, and
    empty files or repeated blank lines produce no empty sentences.

    Args:
        filename_end: Suffix a file name must have to be read (e.g. '.train').
        corpus_root: Directory that is searched recursively.
        sep: Column separator used inside each annotation line.

    Yields:
        One list per sentence; each item of that list is the list of column
        strings of one token line.

    Raises:
        ValueError: If a matching file cannot be read or is not valid UTF-8.
    """
    for root, _dirs, files in os.walk(corpus_root):
        for filename in files:
            if not filename.endswith(filename_end):
                continue

            # Read everything up front so the handle is closed before the
            # first yield; a partially consumed generator then leaks nothing.
            with open(os.path.join(root, filename), 'rb') as file_handle:
                try:
                    file_content = file_handle.read().decode('utf-8').strip()
                except (OSError, UnicodeDecodeError) as err:
                    raise ValueError("Can't process!") from err

            all_tokens = []
            for line in file_content.replace('\r\n', '\n').split('\n'):
                if line.strip():
                    all_tokens.append(line.split(sep))
                elif all_tokens:
                    # A blank line closes the current sentence.
                    yield all_tokens
                    all_tokens = []
            if all_tokens:
                yield all_tokens
                            
                            
# Materialise the generators into lists so the sentences can be indexed and
# iterated more than once by the cells below.
data_train = list(read_conll('.train'))
data_test = list(read_conll('.test'))

# Show the parsed form of the first training sentence (a list of column lists):
print(data_train[0])

### Dataset Statistics:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
def _plot_label_counts(labels, counts):
    """Draw and show one bar chart of ``counts`` against ``labels``."""
    plt.subplots(figsize=(18, 5))
    plt.bar(labels, counts, width=0.5, color='C0')
    plt.title("Number of instances per label")
    plt.ylabel('# of Occurrences', fontsize=12)
    plt.xlabel('Label', fontsize=12)
    plt.show()


def visualizeClassImbalance(data):
    """Plot three bar charts showing how often each label occurs in ``data``.

    The charts are, in order: every IOB label including "O"; the IOB labels
    without "O"; and the five entity types, obtained by dropping the first
    two characters (the "B-" / "I-" prefix) of each label. Labels that are
    not in the known label set are ignored in all three charts.

    Args:
        data: Iterable of sentences; each sentence is a sequence of token
            annotations whose element at index 1 is the label string.
    """
    from collections import Counter

    iob_labels = ["B-protein", "I-protein", "B-DNA", "I-DNA", "B-RNA", "I-RNA",
                  "B-cell_type", "I-cell_type", "B-cell_line", "I-cell_line"]
    entity_types = ["protein", "DNA", "RNA", "cell_type", "cell_line"]

    # Count straight from the nested lists: sentences have different lengths,
    # so wrapping them in np.array would build a ragged array (an error on
    # recent NumPy versions) for no benefit.
    label_counts = Counter(token[1] for sentence in data for token in sentence)

    # Collapse the B-/I- prefix to get one total per entity type.
    type_counts = Counter()
    for label, occurrences in label_counts.items():
        type_counts[label[2:]] += occurrences

    all_labels = ["O"] + iob_labels
    _plot_label_counts(all_labels, [label_counts[label] for label in all_labels])
    _plot_label_counts(iob_labels, [label_counts[label] for label in iob_labels])
    _plot_label_counts(entity_types, [type_counts[name] for name in entity_types])



# Re-read both splits from disk (the same calls as in the loading cell) so
# this cell can be run on its own.
data_train = list(read_conll('.train'))
data_test = list(read_conll('.test'))

# Plot the label distribution of the training split only.
visualizeClassImbalance(data_train)


### Merge sentence and label vectors:

In [None]:
def transform(data):
    """Split annotated sentences into parallel token and label lists.

    Args:
        data: Iterable of sentences; each sentence is an iterable of entries
            whose element 0 is the token and element 1 is its label.

    Returns:
        A tuple ``(sentences, labels)`` of two lists with one inner list per
        sentence, holding the tokens and the labels respectively.
    """
    token_rows = []
    label_rows = []
    for annotated in data:
        pairs = [(entry[0], entry[1]) for entry in annotated]
        token_rows.append([token for token, _ in pairs])
        label_rows.append([label for _, label in pairs])
    return token_rows, label_rows

# Separate tokens from labels for both splits.
sentences_train, labels_train = transform(data_train)
sentences_test, labels_test = transform(data_test)

# Show the tokens and the labels of the first training sentence:
print(sentences_train[0])
print(labels_train[0])

### Convert Labels to Numeric Values:

In [None]:
import numpy as np
def convert(labels_array):
    """Map IOB2 label strings to integer class ids without touching the input.

    Args:
        labels_array: Sequence of sentences, each a sequence of label strings
            ("O", "B-protein", "I-protein", "B-DNA", "I-DNA", "B-RNA",
            "I-RNA", "B-cell_type", "I-cell_type", "B-cell_line",
            "I-cell_line").

    Returns:
        A 1-D NumPy object array with one list of ints per sentence, using
        the ids 0 ("O") through 10 ("I-cell_line").

    Raises:
        ValueError: If a label is not one of the eleven known labels.
    """
    label_to_id = {
        "O": 0,
        "B-protein": 1, "I-protein": 2,
        "B-DNA": 3, "I-DNA": 4,
        "B-RNA": 5, "I-RNA": 6,
        "B-cell_type": 7, "I-cell_type": 8,
        "B-cell_line": 9, "I-cell_line": 10,
    }
    # Sentences differ in length, so the container is an explicit object
    # array filled with freshly built lists. Copying the input with np.copy
    # would only copy references (the caller's lists would be overwritten)
    # and recent NumPy versions refuse to build a ragged array implicitly.
    converted = np.empty(len(labels_array), dtype=object)
    for position, label_vec in enumerate(labels_array):
        try:
            converted[position] = [label_to_id[label] for label in label_vec]
        except KeyError as err:
            raise ValueError("Unknown label: {!r}".format(err.args[0])) from err
    return converted

# Convert the label strings of both splits to integer ids.
labels_train_transformed = convert(labels_train)
labels_test_transformed = convert(labels_test)
# Inspect the container shapes and the first converted sentence of each split.
print(labels_train_transformed.shape)
print(labels_test_transformed.shape)
print(labels_train_transformed[0])
print(labels_test_transformed[0])

### Optional: convert to one-hot encoding:

In [None]:
import keras
def to_categorical(labels_array):
    """Return a copy of ``labels_array`` with every sequence one-hot encoded.

    The shape of the input is printed first. Each element of the copy is then
    replaced by the result of ``keras.utils.to_categorical`` called with 11
    classes and ``float32`` output.

    Args:
        labels_array: NumPy array whose elements are sequences of integer
            class ids.

    Returns:
        The copied array with the encoded sequences as its elements.
    """
    print(labels_array.shape)
    encoded = np.copy(labels_array)
    for position in range(len(encoded)):
        encoded[position] = keras.utils.to_categorical(
            encoded[position], num_classes=11, dtype='float32')
    return encoded

# Compare the integer ids of the first training sentence with their
# one-hot encoded form.
print(labels_train_transformed[0])
example = to_categorical(labels_train_transformed)
print(example[0])

### Visualize Reports length:

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.ticker import FuncFormatter
# Number of tokens in each training sentence.
tlen = [len(x) for x in sentences_train]
fig, ax = plt.subplots()
# Integer bin edges 0 .. max+1 give every length its own unit-wide bin.
# Stopping the edges at max(tlen) - 1 would put the longest sentences beyond
# the last edge (so they are not counted) and merge two lengths in the
# final bin.
plt.hist(tlen, bins=np.arange(max(tlen) + 2), histtype='barstacked', linewidth=2)
plt.title("Length of reports")
plt.ylabel('# of Instances', fontsize=12)
plt.xlabel('Length of reports', fontsize=12)
plt.show()

### Padding to Input Shape:

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np


# Build the word index from both splits, so tokens that only occur in the
# test sentences are part of the vocabulary as well.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)
tokenizer.fit_on_texts(sentences_test)
# One more than the number of indexed words; presumably this leaves room for
# id 0, which is used as the padding value below (confirm against the Keras
# Tokenizer docs).
voc_size = len(tokenizer.word_index)+1

def convert2(x, y):
    """Turn sentences and label sequences into fixed-length id sequences.

    Tokens are mapped to ids by the module-level ``tokenizer``; both outputs
    come from ``pad_sequences`` with ``maxlen=50`` and ``padding='post'``.
    The label sequences are padded with the value 0.

    Args:
        x: Sentences, each a sequence of tokens.
        y: Label id sequences aligned with ``x``.

    Returns:
        A tuple ``(X_total, Y_total)`` of the padded token ids and label ids.
    """
    fixed_length = dict(maxlen=50, padding='post')
    token_ids = pad_sequences(tokenizer.texts_to_sequences(x), **fixed_length)
    tag_ids = pad_sequences(y, value=0, **fixed_length)
    return token_ids, tag_ids

# Encode and pad both splits.
X_train, y_train = convert2(sentences_train, labels_train_transformed)
X_test, y_test = convert2(sentences_test, labels_test_transformed)

print("Input Shapes:")
print(X_train.shape)
print(X_test.shape)

print("Target Shapes:")
# Append a trailing axis of size 1 to the targets (axis=2), the layout the
# sparse loss and metric used below are fed with.
y_train = np.expand_dims(y_train, axis=2)
print(y_train.shape)
y_test = np.expand_dims(y_test, axis=2)
print(y_test.shape)

### Metric Evaluation:

In [None]:
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend as K
from tensorflow.python.ops import array_ops
def new_sparse_categorical_accuracy(y_true, y_pred):
        """Element-wise accuracy for integer (sparse) class targets.

        The predicted class is the argmax of ``y_pred`` over its last axis.
        When ``y_true`` and ``y_pred`` have the same number of dimensions,
        the last axis of ``y_true`` is squeezed away first so both can be
        compared position by position.

        Args:
            y_true: Tensor of integer class ids.
            y_pred: Tensor of per-class scores with the classes on the last
                axis.

        Returns:
            A tensor of ``K.floatx()`` values that is 1 where prediction and
            target are equal and 0 elsewhere; no averaging is done here.
        """
        y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims
        y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims
        # If y_true has as many dimensions as y_pred (e.g. a trailing axis of
        # size 1), drop its last axis so it lines up with the argmax below.
        if (y_true_rank is not None) and (y_pred_rank is not None) and (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
            y_true = array_ops.squeeze(y_true, [-1])
        y_pred = math_ops.argmax(y_pred, axis=-1)
        # If the predicted output and actual output types don't match, force cast them
        # to match.
        if K.dtype(y_pred) != K.dtype(y_true):
            y_pred = math_ops.cast(y_pred, K.dtype(y_true))
        return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())

# Attempt 1:
### Create Model:

In [None]:
from keras.models import Model
from keras.layers import Dense,TimeDistributed, Input, Embedding,Bidirectional,LSTM,Dropout
from sklearn.utils import class_weight
import numpy as np

def create_model(voc):
    """Build, compile and summarise the BiLSTM tagger with a softmax output.

    Layers, in order: 32-dimensional embedding over inputs of length 50,
    bidirectional LSTM (128 units, sequences returned), Dense(256),
    Dropout(0.2) and a time-distributed 11-way softmax.

    Args:
        voc: Vocabulary size passed to the embedding layer.

    Returns:
        The compiled Keras model (sparse categorical cross-entropy, Adam).
    """
    tokens_in = Input(shape=(50,), dtype='int32')
    hidden = Embedding(voc, 32, input_length=50)(tokens_in)
    recurrent = LSTM(128, dropout=0.2, recurrent_dropout=0.2,
                     return_sequences=True, return_state=False)
    hidden = Bidirectional(recurrent)(hidden)
    hidden = Dense(256)(hidden)
    hidden = Dropout(0.2)(hidden)
    tag_probs = TimeDistributed(Dense(11, activation='softmax'))(hidden)

    tagger = Model(inputs=tokens_in, outputs=tag_probs)
    tagger.compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=[new_sparse_categorical_accuracy])
    tagger.summary()
    return tagger

# Instantiate the softmax tagger for the vocabulary built by the tokenizer.
model = create_model(voc_size)

### Train:

In [None]:
from keras_tqdm import TQDMNotebookCallback
from sklearn_crfsuite import metrics

# Train for 3 epochs with validation_split=0.2; Keras' own logging is off
# (verbose=0) and progress is reported through the TQDM notebook callback.
model.fit(x=X_train, y=y_train,validation_split=0.2,batch_size=32, epochs=3,verbose=0, callbacks=[TQDMNotebookCallback(leave_inner=True)])
# Reduce the per-class scores to class ids and restore the trailing axis so
# the predictions have the same layout as y_test.
y_pred = model.predict(X_test)
y_pred = y_pred.argmax(axis=-1)
y_pred = np.expand_dims(y_pred, axis=2)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))



### Train2:

In [None]:
from sklearn_crfsuite import metrics
from sklearn.utils import class_weight
import numpy as np

def checkifAllOut(sentence):
    """Return True when at least one label in ``sentence`` differs from 0.

    Despite its name, the result is False for a sentence whose labels are all
    0 (and for an empty sentence), and True otherwise.
    """
    return any(output != 0 for output in sentence)

def RemoveOnlyOtherSentences(x_train,y_train):
    """Keep only the samples for which ``checkifAllOut`` returns True.

    Args:
        x_train: Indexable collection of input sequences.
        y_train: Iterable of label sequences aligned with ``x_train``.

    Returns:
        A tuple of two NumPy arrays: the retained inputs and the retained
        label sequences, in their original order.
    """
    kept = [(x_train[row], tags)
            for row, tags in enumerate(y_train)
            if checkifAllOut(tags)]
    kept_inputs = [inputs for inputs, _ in kept]
    kept_targets = [tags for _, tags in kept]
    return np.array(kept_inputs), np.array(kept_targets)

# Label id frequencies before filtering.
unique, counts = np.unique(y_train, return_counts=True)
print(unique)
print(counts)

# Drop the training sentences whose labels are all 0.
X_train_new, y_train_new = RemoveOnlyOtherSentences(X_train,y_train)

# Label id frequencies after filtering.
unique, counts = np.unique(y_train_new, return_counts=True)
print(unique)
print(counts)

print(X_train_new.shape)
print(y_train_new.shape)

# NOTE: `model` is not re-created in this cell, so when the notebook is run
# top to bottom this continues training the model already fitted above
# rather than starting from fresh weights.
model.fit(x=X_train_new, y=y_train_new,validation_split=0.2,batch_size=64,
          epochs=3,verbose=0,
          callbacks=[TQDMNotebookCallback(leave_inner=True)])
# Same post-processing as before: class ids with a trailing axis of size 1.
y_pred = model.predict(X_test)
y_pred = y_pred.argmax(axis=-1)
y_pred = np.expand_dims(y_pred, axis=2)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))


In [None]:
from keras.models import Model
from keras.layers import Dense,TimeDistributed, Input, Embedding,Bidirectional,LSTM,Dropout
from keras_contrib.layers import CRF
from keras_contrib import losses, metrics
from sklearn.utils import class_weight
import numpy as np


def create_model(voc):
    """Build, compile and summarise the BiLSTM tagger with a CRF output layer.

    Layers, in order: 64-dimensional embedding over inputs of length 50,
    bidirectional LSTM (128 units, sequences returned), Dense(256),
    Dropout(0.2), a time-distributed Dense(64, relu) and a CRF with 11 units.

    Args:
        voc: Vocabulary size passed to the embedding layer.

    Returns:
        The compiled Keras model (CRF loss, RMSprop, CRF accuracy metric).
    """
    tokens_in = Input(shape=(50,), dtype='int32')
    hidden = Embedding(voc,64, input_length=50)(tokens_in)
    recurrent = LSTM(128, dropout=0.2, recurrent_dropout=0.2,
                     return_sequences=True, return_state=False)
    hidden = Bidirectional(recurrent)(hidden)
    hidden = Dense(256)(hidden)
    hidden = Dropout(0.2)(hidden)
    features = TimeDistributed(Dense(64, activation='relu'))(hidden)
    crf_layer = CRF(11)
    tags_out = crf_layer(features)

    tagger = Model(inputs=tokens_in, outputs=tags_out)
    tagger.compile(optimizer="rmsprop",
                   loss=losses.crf_loss,
                   metrics=[metrics.crf_accuracy])
    tagger.summary()
    return tagger



# Replace `model` with a freshly built CRF tagger.
model = create_model(voc_size)

In [None]:
# Targets without the trailing size-1 axis. NOTE(review): this variable is
# not used below; the one-hot encoding iterates y_train_new directly.
y_train_input = np.squeeze(y_train_new, axis=2)
from keras.utils import to_categorical
# `metrics` was bound to keras_contrib.metrics by the model-definition cell;
# rebind it to sklearn_crfsuite.metrics for the classification report below.
from sklearn_crfsuite import metrics
# One-Hot encode
y = [to_categorical(i, num_classes=11) for i in y_train_new]  # 11 classes; padding shares id 0 with the "O" label
y = np.array(y)
print(y.shape)

# Train the CRF model on the filtered sentences with one-hot targets.
model.fit(x=X_train_new, y=y,validation_split=0.2,batch_size=64,
          epochs=3,verbose=0,
          callbacks=[TQDMNotebookCallback(leave_inner=True)])
# Convert the per-class outputs to class ids with a trailing axis of size 1,
# the same layout as y_test.
y_pred = model.predict(X_test)
y_pred = y_pred.argmax(axis=-1)
y_pred = np.expand_dims(y_pred, axis=2)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))