# Download dataset

In [1]:
import json
import os


def write_kaggle_credentials(path="kaggle.json", environ=None):
    """Write a Kaggle credentials file from environment variables.

    The username and API key are read from ``KAGGLE_USERNAME`` and
    ``KAGGLE_KEY`` so that no secret is stored in the notebook itself.

    Args:
        path: Destination of the JSON credentials file.
        environ: Mapping to read the two variables from; defaults to
            ``os.environ``.

    Returns:
        True if the file was written, False if either variable is missing
        (in which case nothing is created or overwritten).
    """
    env = os.environ if environ is None else environ
    username = env.get("KAGGLE_USERNAME")
    key = env.get("KAGGLE_KEY")
    if not username or not key:
        return False

    # Create the file with owner-only permissions from the start.
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump({"username": username, "key": key}, f)
    return True


# NOTE(review): an API key used to be embedded in this cell; treat that key as
# compromised and rotate it in the Kaggle account settings.
if not write_kaggle_credentials():
    print("KAGGLE_USERNAME / KAGGLE_KEY are not set; "
          "kaggle.json was not written (an existing file is left untouched).")

In [2]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
'chmod 600 /root/.kaggle/kaggle.json'
!kaggle datasets download -d abhinavwalia95/entity-annotated-corpus
!unzip entity-annotated-corpus.zip
!rm -r ./entity-annotated-corpus.zip

Downloading entity-annotated-corpus.zip to /content
 87% 23.0M/26.4M [00:00<00:00, 54.7MB/s]
100% 26.4M/26.4M [00:00<00:00, 70.5MB/s]
Archive:  entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [3]:
!pip install sklearn-crfsuite
!pip install seqeval
!pip show tensorflow

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 5.1 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.3 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=7c946082812a1c5f8e7f28ae1c3f17e1a9e3e2b3cb3c6e7fbef7efd5aab3cdef
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Name: ten

# Import Libraries

In [4]:
import pandas as pd
import numpy as np
import os 

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional, Input, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping 

from sklearn.model_selection import train_test_split
from sklearn_crfsuite.metrics import flat_classification_report

from seqeval.metrics import precision_score, recall_score, f1_score, classification_report


# Load Dataset

In [85]:
def load_data(filename='./ner_dataset.csv'):
    """Read the NER corpus CSV and forward-fill its empty cells.

    Malformed rows are skipped rather than aborting the read. Every null
    cell is then replaced by the nearest non-null value above it in the same
    column; the intent is that each token row ends up carrying its
    ``Sentence #`` label.

    Args:
        filename: Path to the CSV file, decoded as ISO-8859-1.

    Returns:
        The loaded ``pandas.DataFrame`` with nulls forward-filled.
    """
    read_kwargs = {"encoding": "ISO-8859-1"}
    try:
        # pandas >= 1.3 spelling; `error_bad_lines` no longer exists in pandas 2.
        df = pd.read_csv(filename, on_bad_lines="skip", **read_kwargs)
    except TypeError:
        # Older pandas does not know `on_bad_lines`; fall back to the legacy flag.
        df = pd.read_csv(filename, error_bad_lines=False, **read_kwargs)

    # Same result as fillna(method='ffill'), without the deprecated argument.
    return df.ffill()

# Load the corpus from the default path and display it as the cell output.
df = load_data()
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [86]:
# Frequency of every NER tag; the number of rows is the number of distinct tags.
tag_counts = df['Tag'].value_counts()
print('Number of tag for NER: ', len(tag_counts))
tag_counts

Number of tag for NER:  17


O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

# Preprocess Dataset

In [87]:
class sentence(object):
    """Group a token-level frame into per-sentence lists of triples.

    After construction, ``grouped`` holds one entry per distinct
    ``Sentence #`` value and ``sentences`` is the plain list of those
    entries. Each entry is a list of ``(word, pos, tag)`` tuples built from
    the ``Word``, ``POS`` and ``Tag`` columns.
    """

    def __init__(self, df):
        def to_triples(group):
            # One (word, pos, tag) tuple per row of this sentence's sub-frame.
            return list(zip(group['Word'].values.tolist(),
                            group['POS'].values.tolist(),
                            group['Tag'].values.tolist()))

        self.n_sent = 1
        self.df = df
        self.empty = False

        self.grouped = self.df.groupby("Sentence #").apply(to_triples)
        self.sentences = list(self.grouped)

In [88]:
# Group the token rows of `df` by sentence; `sent.sentences` is the resulting list.
sent = sentence(df)

In [89]:
def process_data(df, sentences, test_size=0.15, random_state=42):
    """Encode sentences as padded index arrays and split them into train/test.

    Args:
        df: Token-level frame; its ``Word`` and ``Tag`` columns define the
            vocabularies.
        sentences: List of sentences, each a list of ``(word, pos, tag)``
            tuples.
        test_size: Fraction of sentences held out for testing.
        random_state: Seed for the split. A fixed seed keeps the held-out
            set identical across kernel restarts; this matters because the
            training cell reuses weights cached in ``model.hdf5``, and a
            different split would evaluate them on sentences they were
            trained on. Pass ``None`` for a fresh random split.

    Returns:
        ``(X_train, X_test, y_train, y_test, word2idx, tag2idx, idx2word,
        idx2tag, num_tag, words, tags)``. ``y_*`` are lists of one-hot
        arrays with ``num_tag + 1`` classes (index 0 is padding).
    """
    max_len = max(map(len, sentences))

    # Build the word and tag vocabularies
    words = list(df['Word'].unique())
    tags = list(df['Tag'].unique())

    # word -> index, reserving 0 for padding and 1 for unknown words
    word2idx = {w : i + 2 for i, w in enumerate(words)}
    word2idx["UNK"] = 1
    word2idx["PAD"] = 0

    # tag -> index, reserving 0 for padding
    tag2idx = {t : i + 1 for i, t in enumerate(tags)}
    tag2idx["PAD"] = 0

    # Reverse lookups: index -> word and index -> tag
    idx2word = {i: w for w, i in word2idx.items()}
    idx2tag = {i: w for w, i in tag2idx.items()}

    # Turn each sentence into a vector of word indices
    X = [[word2idx[w[0]] for w in s] for s in sentences]
    # Right-pad every sentence to max_len
    X = pad_sequences(maxlen = max_len, sequences = X, padding = "post", value = word2idx["PAD"])
    # Turn each sentence's tags into indices
    y = [[tag2idx[w[2]] for w in s] for s in sentences]
    # Right-pad the tag sequences to max_len
    y = pad_sequences(maxlen = max_len, sequences = y, padding = "post", value = tag2idx["PAD"])

    # One-hot encode the tags (+1 class for padding)
    num_tag = df['Tag'].nunique()
    y = [to_categorical(i, num_classes = num_tag + 1) for i in y]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state)

    # Hand everything back to the caller (nothing is written to disk here)
    return X_train, X_test, y_train, y_test, word2idx, tag2idx, idx2word, idx2tag, num_tag, words, tags

In [90]:
# Encode the corpus once; the split arrays, lookup tables and vocabularies are reused by the cells below.
x_train, x_test, y_train, y_test, word2idx, tag2idx, idx2word, idx2tag, num_tag, words, tags = process_data(df, sent.sentences)

# Build Model

In [91]:
# Configuration read by build_model and predict_ner
max_len = max(len(s) for s in sent.sentences)  # length of the longest sentence, in tokens
embedding = 40                                 # size of each word-embedding vector

def build_model(num_tags, hidden_size = 50):
    """Build, compile and summarise the BiLSTM tagger.

    Architecture: embedding -> dropout -> bidirectional LSTM -> per-timestep
    softmax over ``num_tags + 1`` classes. Reads the module-level
    ``max_len``, ``embedding`` and ``words`` for the input length, embedding
    size and vocabulary size.

    Args:
        num_tags: Number of real tags; one extra output class is added.
        hidden_size: Units in each LSTM direction.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    tokens_in = Input(shape=(max_len,))

    # +2 accounts for the PAD and UNK entries added on top of the vocabulary.
    x = Embedding(
        input_dim=len(words) + 2,
        output_dim=embedding,
        input_length=max_len,
        mask_zero=False,
    )(tokens_in)
    x = Dropout(0.1)(x)
    x = Bidirectional(
        LSTM(units=hidden_size, return_sequences=True, recurrent_dropout=0.1)
    )(x)
    tag_probs = TimeDistributed(Dense(num_tags + 1, activation="softmax"))(x)

    net = Model(tokens_in, tag_probs)
    net.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

    net.summary()
    return net

# Train Model

In [99]:
# Single place for the checkpoint location used below.
weights_path = "model.hdf5"

# Stop when validation accuracy has not improved for 2 epochs and go back
# to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=2,
    restore_best_weights=True,
    verbose=0,
)

# Write a checkpoint whenever validation loss improves.
checkpoint = ModelCheckpoint(
    filepath=weights_path,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=0,
)

# The network is built either way; it is trained only when no checkpoint
# exists yet, otherwise the stored weights are loaded.
model = build_model(num_tag)
if os.path.exists(weights_path):
    model.load_weights(weights_path)
else:
    history = model.fit(
        x_train,
        np.array(y_train),
        validation_split=0.1,
        batch_size=64,
        epochs=10,
        callbacks=[early_stopping, checkpoint],
    )

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 104)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 104, 40)           1407200   
                                                                 
 dropout_5 (Dropout)         (None, 104, 40)           0         
                                                                 
 bidirectional_5 (Bidirectio  (None, 104, 100)         36400     
 nal)                                                            
                                                                 
 time_distributed_5 (TimeDis  (None, 104, 18)          1818      
 tributed)                                                       
                                                                 
Total params: 1,445,418
Trainable params: 1,445,418
Non-tra

# Evaluate

In [100]:
# Run the model over the whole test split and take the most likely tag
# index at every position.
pred_ids = np.argmax(model.predict(x_test), axis=-1)
# Recover the gold tag indices from the one-hot labels.
true_ids = np.argmax(y_test, axis=-1)

pad_id = tag2idx["PAD"]

# Keep exactly the positions that hold a real token in the gold labels, so
# gold and predicted sequences stay aligned token by token. Dropping 'PAD'
# by value and truncating would shift labels whenever the model predicts
# PAD inside a sentence or a real tag in the padding.
y_test_true = []
y_pred = []
for true_row, pred_row in zip(true_ids, pred_ids):
    real_token = true_row != pad_id
    y_test_true.append([idx2tag[i] for i in true_row[real_token]])
    # A PAD predicted on a real token is scored as "no entity".
    y_pred.append(['O' if i == pad_id else idx2tag[i] for i in pred_row[real_token]])

# Report precision, recall and F1
k = precision_score(y_test_true, y_pred)
print("Precision-score is : {:.1%}".format(k))
print("Recall-score is : {:.1%}".format(recall_score(y_test_true, y_pred)))
print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))

Precision-score is : 80.3%
Recall-score is : 80.0%
F1-score is : 80.2%


In [101]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer

labels = list(set(df['Tag']))


def _by_entity_then_prefix(name):
    # 'B-geo' -> ('-geo', 'B'): tags of one entity type sort together, B before I.
    return (name[1:], name[0])


# Group the B- and I- rows of each entity type next to each other.
sorted_labels = sorted(labels, key=_by_entity_then_prefix)

# Flatten the per-sentence tag lists into one token-level list each.
flat_y_test_true = [tag for seq in y_test_true for tag in seq]
flat_y_pred = [tag for seq in y_pred for tag in seq]

report = metrics.classification_report(
    flat_y_test_true,
    flat_y_pred,
    labels=sorted_labels,
    digits=3,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

           O      0.987     0.993     0.990    132315
       B-art      0.500     0.018     0.034        56
       I-art      0.000     0.000     0.000        39
       B-eve      0.778     0.149     0.250        47
       I-eve      0.286     0.049     0.083        41
       B-geo      0.849     0.871     0.860      5598
       I-geo      0.813     0.737     0.773      1083
       B-gpe      0.957     0.937     0.947      2311
       I-gpe      1.000     0.364     0.533        22
       B-nat      0.750     0.115     0.200        26
       I-nat      0.000     0.000     0.000         4
       B-org      0.796     0.698     0.744      3039
       I-org      0.771     0.783     0.777      2425
       B-per      0.849     0.794     0.820      2565
       I-per      0.839     0.872     0.855      2609
       B-tim      0.905     0.877     0.891      3077
       I-tim      0.803     0.713     0.755      1024

    accuracy              

In [102]:
import nltk
from nltk.tokenize import word_tokenize
# Download the 'punkt' tokenizer data package.
# NOTE(review): presumably this is what word_tokenize loads at runtime; newer
# NLTK releases may require 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

def predict_ner(model, sentence, word2idx, idx2tag):
    """Tag the tokens of a raw sentence with the trained model.

    The sentence is tokenised with ``word_tokenize``, mapped to word indices
    (unknown words become ``UNK``), right-padded to the module-level
    ``max_len`` and passed through ``model``.

    Args:
        model: Object whose ``predict`` returns per-position class scores.
        sentence: Raw input text.
        word2idx: Word -> index mapping containing ``'UNK'`` and ``'PAD'``.
        idx2tag: Index -> tag-name mapping containing ``'PAD'``.

    Returns:
        A ``zip`` of ``(token, tag)`` pairs, aligned position by position.
        Tokens beyond ``max_len`` are not scored and are therefore absent.
    """
    sent_token = word_tokenize(sentence)

    # Turn the sentence into a vector of word indices
    unk_id = word2idx['UNK']
    x = [word2idx.get(w, unk_id) for w in sent_token]

    # Pad to max_len. truncating="post" keeps the *first* max_len tokens of
    # an over-long sentence; the default ("pre") would drop the beginning
    # and misalign tokens and tags.
    x = pad_sequences(maxlen=max_len, sequences=[x], padding="post",
                      truncating="post", value=word2idx["PAD"])

    # Predict
    y_pred = model.predict(x)

    # Most likely tag index at each position, limited to the real tokens so
    # positions line up one-to-one with sent_token.
    tag_ids = np.argmax(y_pred, axis=-1)[0][:len(sent_token)]
    # Map to label names; a PAD predicted on a real token counts as 'O'
    # instead of being dropped (dropping would shift the later labels).
    y_pred = ['O' if idx2tag[i] == 'PAD' else idx2tag[i] for i in tag_ids]

    return zip(sent_token, y_pred)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [103]:
#@title Test with Input string { vertical-output: true, form-width: "50%", display-mode: "both" }

input_text = "He says Andrew Jeffrey  cannot block lawsuit from accuser Virginia on grounds she no longer lives in US." #@param [] {allow-input: true}

#@markdown ---

# Materialise the (token, tag) pairs so they can be consumed twice below.
result = list(predict_ner(model, input_text, word2idx, idx2tag))
# Only the tokens that were tagged as part of an entity.
filter_result = [pair for pair in result if pair[1] != 'O']

print("=========== All NER detected ===========")
print(pd.DataFrame(filter_result, columns=['Word', 'Label']))

print("\n=========== Result ===========")
# Transposed so the sentence reads left to right with labels underneath.
pd.DataFrame(result, columns=['Word', 'Label']).transpose()

       Word  Label
0    Andrew  B-per
1   Jeffrey  I-per
2  Virginia  B-geo
3        US  B-geo



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Word,He,says,Andrew,Jeffrey,can,not,block,lawsuit,from,accuser,Virginia,on,grounds,she,no,longer,lives,in,US,.
Label,O,O,B-per,I-per,O,O,O,O,O,O,B-geo,O,O,O,O,O,O,O,B-geo,O
