<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/tf_ner_bi_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Download Kaggle Dataset
#@markdown Dataset: Annotated Corpus for Named Entity Recognition <br>
#@markdown https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
#@markdown ___

kaggle_dataset_id = "abhinavwalia95/entity-annotated-corpus" #@param {type:"string"}

!pip install -q kaggle
from google.colab import drive
drive.mount('/content/gdrive')

!mkdir -p ~/.kaggle
!cp /content/gdrive/My\ Drive/kaggle/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d {kaggle_dataset_id}
!ls -l /content
!unzip -o /content/entity-annotated-corpus

#@markdown ___
#@markdown Install dependencies<br>
#@markdown - seqeval: Sequence labeling evaluation (F1, precision, etc).
#@markdown - fastprogress: Progress bar for Jupyter notebooks.

!pip install -Uqq seqeval
!pip install -Uqq fastprogress

Mounted at /content/gdrive
Downloading entity-annotated-corpus.zip to /content
 64% 17.0M/26.4M [00:01<00:00, 13.2MB/s]
100% 26.4M/26.4M [00:01<00:00, 18.9MB/s]
total 27064
-rw-r--r-- 1 root root 27703149 Dec 28 17:08 entity-annotated-corpus.zip
drwx------ 5 root root     4096 Dec 28 17:08 gdrive
drwxr-xr-x 1 root root     4096 Dec 21 17:29 sample_data
Archive:  /content/entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         
[K     |████████████████████████████████| 51kB 1.7MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-f4ca656e-a449-1e4b-359b-42f4add93ee1)


In [4]:
import math
import pathlib
import shutil
import tempfile

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tf_ad
from fastprogress.fastprogress import progress_bar
from numpy.random import seed
from seqeval.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, Model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    SpatialDropout1D,
    TimeDistributed,
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.random import set_seed

set_seed(42)
seed(42)

logdir = pathlib.Path(tempfile.mkdtemp())/"tensorflow_logs"
shutil.rmtree(logdir, ignore_errors=True)

In [5]:
#@title Utils
#@markdown ```
#@markdown - build_vocab(): Extracts unique tokens and tags
#@markdown - build_indexes(): Builds the tokens and tags mapping indexes
#@markdown - decode_one_hot_tags_sequence():
#@markdown - decode_tags_batch()
#@markdown - test_inference()
#@markdown ```

AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

def build_vocab(data):
    tokens = {token for token in data["word"]}
    tokens = {"unk" if t is math.nan or isinstance(t, float) else t for t in tokens}
    tokens.add("PAD")

    tags = {tag for tag in data["tag"]}
    tags = {"O" if t is math.nan or isinstance(t, float) else t for t in tags}
    return tokens, tags

def build_tagged_senteces(data):
    agg_func = lambda s: [(w, t) for w, t in zip(s["word"], s["tag"])]
    grouped = data.groupby("sentence_idx").apply(agg_func)
    sentences = [s for s in grouped]
    return sentences

def build_indexes(tokens, tags):
    token2idx = {token: idx for idx, token in enumerate(tokens)}
    idx2token = {idx: token for idx, token in enumerate(tokens)}
    tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    idx2tag = {idx: tag for idx, tag in enumerate(tags)}
    return token2idx, idx2token, tag2idx, idx2tag

def tokenize(sentences, token2idx, tag2idx, one_hot_encode_tags=True):
    pad_token_idx, pad_tag_idx = token2idx["PAD"], tag2idx["O"]

    X = [[token2idx[t] for t, _ in s] for s in sentences]
    X = pad_sequences(X, maxlen=maxlen, padding="post", value=pad_token_idx)

    y = [[tag2idx[t] for _, t in s] for s in sentences]
    y = pad_sequences(y, maxlen=maxlen, padding="post", value=pad_tag_idx)
    if one_hot_encode_tags:
        y = [to_categorical(tag_idx, num_classes=num_tags) for tag_idx in y]
    return X, np.array(y)

def decode_one_hot_tags_sequence(tags_sequence):
    idx_tags = np.argmax(tags_sequence, axis=-1)
    return [idx2tag[idx] for idx in idx_tags]

def decode_tags_batch(encoded_tags_sequences):
    return [decode_one_hot_tags_sequence(seq) for seq in encoded_tags_sequences]

def test_inference(inference_model, test_dataset):
    true_labels = []
    pred_labels = []

    num_test_batches = test_dataset.cardinality().numpy()
    for X_batch, y_true in progress_bar(test_dataset, total=num_test_batches):
        y_pred = inference_model.predict(X_batch)
        pred_labels.append(decode_tags_batch(y_pred))
        true_labels.append(decode_tags_batch(y_true.numpy()))

    true_labels = np.array(true_labels)
    num_batches, num_samples, sentence_lenght = true_labels.shape
    true_labels = true_labels.reshape(num_batches * num_samples, sentence_lenght)
    true_labels = true_labels.tolist()

    pred_labels = np.array(pred_labels)
    pred_labels = pred_labels.reshape(num_batches * num_samples, sentence_lenght)
    pred_labels = pred_labels.tolist()

    return true_labels, pred_labels

## Load the dataset

In [6]:
df = pd.read_csv("ner.csv", encoding="ISO-8859-1", error_bad_lines=False)
df.head()

b'Skipping line 281837: expected 25 fields, saw 34\n'


Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,pos,prev-iob,prev-lemma,prev-pos,prev-prev-iob,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,NNS,__START1__,__start1__,__START1__,__START2__,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,IN,O,thousand,NNS,__START1__,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,NNS,O,of,IN,O,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,VBP,O,demonstr,NNS,O,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,VBN,O,have,VBP,O,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O


In [7]:
data = df[["sentence_idx", "word", "tag"]]
data.head(15)

Unnamed: 0,sentence_idx,word,tag
0,1.0,Thousands,O
1,1.0,of,O
2,1.0,demonstrators,O
3,1.0,have,O
4,1.0,marched,O
5,1.0,through,O
6,1.0,London,B-geo
7,1.0,to,O
8,1.0,protest,O
9,1.0,the,O


In [8]:
data["tag"].value_counts()

O        889973
B-geo     37525
B-tim     20193
B-org     20184
I-per     17382
B-per     17011
I-org     16537
B-gpe     16392
I-geo      7409
I-tim      6298
B-art       434
B-eve       348
I-eve       297
I-art       280
I-gpe       229
B-nat       226
I-nat        76
Name: tag, dtype: int64

### Build vocab

In [9]:
tagged_sentences = build_tagged_senteces(data)
print("Sample tagged sentence")
print(repr(tagged_sentences[0][:4]), "...")

tokens, tags = build_vocab(data)
num_tokens, num_tags = len(tokens), len(tags)
print("\nStats")
print(f"Num tokens: {num_tokens:,}")
print(f"Num tags: {num_tags}")

maxlen = max([len(t) for t in tokens])
print(f"maxlen: {maxlen}")


Sample tagged sentence
[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O')] ...

Stats
Num tokens: 30,173
Num tags: 17
maxlen: 64


### Tokenize sentence and label sequences

In [10]:
token2idx, idx2token, tag2idx, idx2tag = build_indexes(tokens, tags)
X, y = tokenize(tagged_sentences, token2idx, tag2idx)

print(f"Sentences dimension: {X.shape}")
print(f"Labels dimension: {y.shape}")

Sentences dimension: (35177, 64)
Labels dimension: (35177, 64, 17)


### Split the dataset into train and test

In [11]:
VALIDATION_SIZE = int(len(X) * 0.1)
BUFFER_SIZE = 50000

dataset = tf.data.Dataset.from_tensor_slices((X, y))
train_dataset = dataset.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE).batch(64, drop_remainder=True)
train_dataset = configure_dataset(train_dataset)

test_dataset = dataset.take(VALIDATION_SIZE)
test_dataset = configure_dataset(test_dataset).batch(64, drop_remainder=True)

train_dataset.cardinality(), test_dataset.cardinality()

(<tf.Tensor: shape=(), dtype=int64, numpy=494>,
 <tf.Tensor: shape=(), dtype=int64, numpy=54>)

In [12]:
model = Sequential([
    Embedding(input_dim=num_tokens, output_dim=64),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(num_tags, activation="softmax"))
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          1931072   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, None, 64)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 200)         132000    
_________________________________________________________________
time_distributed (TimeDistri (None, None, 17)          3417      
Total params: 2,066,489
Trainable params: 2,066,489
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [14]:
# history = model.fit(train_dataset, epochs=3, verbose=1)
history = model.fit(train_dataset, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
model.evaluate(test_dataset)



[0.11357695609331131, 0.9696316123008728]

In [34]:
X_test, y_test = next(test_dataset.take(1).as_numpy_iterator())
sample_idx = np.random.randint(0, len(X_test))

X_test = X_test[sample_idx]
y_pred = model.predict(X_test)
pred_tags = decode_tags_batch(y_pred)
pred_tags = np.squeeze(pred_tags)

y_test = y_test[sample_idx]
true_tags = decode_tags_batch([y_test])
true_tags = np.squeeze(true_tags)

print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
print("_"*30)
for token_idx, true_tag, pred_tag in zip(X_test, true_tags, pred_tags):
    print(f"{idx2token[token_idx]:15}{true_tag}\t{pred_tag}")

Word           True 	 Pred

______________________________
The            O	O
area           O	O
became         O	O
a              O	O
refuge         O	O
for            O	O
many           O	O
al-Qaida       B-org	B-org
and            O	O
Taleban        B-org	B-org
fighters       O	O
after          O	O
the            O	O
Taleban        B-org	B-org
government     O	O
was            O	O
ousted         O	O
in             O	O
Afghanistan    B-geo	B-gpe
in             O	O
2001           B-tim	I-tim
.              O	O
The            O	O
area           O	O
became         O	O
a              O	O
refuge         O	O
for            O	O
many           O	O
al-Qaida       B-org	B-org
and            O	O
Taleban        B-org	B-org
fighters       O	O
after          O	O
the            O	O
Taleban        B-org	B-org
government     O	O
was            O	O
ousted         O	O
in             O	O
Afghanistan    B-geo	B-gpe
in             O	O
2001           B-tim	I-tim
.              O	O
PAD            O	O
PAD   

## Evaluate Model

In [17]:
true_labels, pred_labels = test_inference(model, test_dataset)
print(classification_report(true_labels, pred_labels))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00       121
         eve       0.00      0.00      0.00        93
         geo       0.72      0.84      0.78      4770
         gpe       0.93      0.75      0.83      2716
         nat       0.00      0.00      0.00        46
         org       0.54      0.55      0.54      2704
         per       0.65      0.60      0.62      2403
         tim       0.81      0.82      0.82      2658

   micro avg       0.72      0.72      0.72     15511
   macro avg       0.46      0.44      0.45     15511
weighted avg       0.72      0.72      0.71     15511



# Bidirectional LSTM CRF

In [61]:
X, y = tokenize(tagged_sentences, token2idx, tag2idx, one_hot_encode_tags=False)

print(f"Sentences dimension: {X.shape}")
print(f"Labels dimension: {y.shape}")

dataset = tf.data.Dataset.from_tensor_slices((X, y))
train_dataset = (
    dataset.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE).batch(64, drop_remainder=True)
)
train_dataset = configure_dataset(train_dataset)

test_dataset = dataset.take(VALIDATION_SIZE)
test_dataset = configure_dataset(test_dataset).batch(64, drop_remainder=True)

train_dataset.cardinality(), test_dataset.cardinality()

Sentences dimension: (35177, 64)
Labels dimension: (35177, 64)


(<tf.Tensor: shape=(), dtype=int64, numpy=494>,
 <tf.Tensor: shape=(), dtype=int64, numpy=54>)

In [69]:
SPATIAL_DROPOUT_RATE = 0.5
RECURRENT_DROPOUT_RATE = 0.5


class NerBiLstmCRF(Model):
    def __init__(
        self,
        vocab_dim,
        tag_dim,
        max_seq_len,
        embedding_dim=128,
        lstm_men_dim=200,
        name="BiLstmCRF",
        **kwargs
    ):
        super(NerBiLstmCRF, self).__init__(name=name, **kwargs)
        self.embedding = Embedding(vocab_dim, embedding_dim)
        self.dropout = SpatialDropout1D(SPATIAL_DROPOUT_RATE)
        self.lstm = LSTM(
            lstm_men_dim,
            return_sequences=True,
            recurrent_dropout=RECURRENT_DROPOUT_RATE,
        )
        self.bi_lstm = Bidirectional(self.lstm)
        self.classifier = Dense(tag_dim, activation="softmax")
        self.time_distributed_classifier = TimeDistributed(self.classifier)
        self.sequence_lengths = tf.expand_dims(max_seq_len, axis=0)

        # self.transition_params = tf.Variable(
        #     tf.random.uniform(shape=(tag_dim, tag_dim))
        # )

    def call(self, inputs, labels=None, training=False):
        # text_lens = tf.math.reduce_sum(
        #     tf.cast(tf.math.not_equal(inputs, 0), dtype=tf.int32), axis=-1
        # )
        token_embeddings = self.embedding(inputs)
        token_embeddings = self.dropout(token_embeddings, training)
        logits = self.bi_lstm(token_embeddings)
        logits = self.time_distributed_classifier(logits)

        if labels is not None:
            # label_sequences = tf.convert_to_tensor(labels, dtype=tf.int32)
            label_sequences = labels
            log_likelihood, _ = tf_ad.text.crf_log_likelihood(
                logits, label_sequences, self.sequence_lengths
            )
            return logits, log_likelihood
        return logits

In [70]:
# @tf.function
def train_one_step(model, optimizer, tokens_batch, labels_batch):
    with tf.GradientTape() as tape:
        logits, log_likelihood = model(
            text_batch, labels_batch, training=True
        )
        loss = -tf.reduce_mean(log_likelihood)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss, logits

In [71]:
bi_lstm_crf_model = NerBiLstmCRF(num_tokens, num_tags, max_seq_len=maxlen)
optimizer = tf.keras.optimizers.Adam()

for text_batch, labels_batch in train_dataset.take(1):
    loss, logits = train_one_step(
        bi_lstm_crf_model, optimizer, text_batch, labels_batch
    )

Training!!!!
Training!!!!
Training!!!!
Training!!!!


In [None]:
m = NerBiLstmCRF(num_tokens, num_tags)
m.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
m.fit(train_dataset, epochs=3)
#m.summary()

In [None]:
m.evaluate(test_dataset)

In [None]:
true_labels, pred_labels = test_inference(m, test_dataset)
print(classification_report(true_labels, pred_labels))

In [None]:
true_labels = []
pred_labels = []

num_test_batches = test_dataset.cardinality().numpy()
for X_batch, y_true in progress_bar(test_dataset, total=num_test_batches):
    y_pred = model.predict(X_batch)
    pred_labels.append(decode_tags_batch(y_pred))
    true_labels.append(decode_tags_batch(y_true.numpy()))

true_labels = np.array(true_labels)
num_batches, num_samples, sentence_lenght = true_labels.shape
true_labels = true_labels.reshape(num_batches * num_samples, sentence_lenght)
true_labels = true_labels.tolist()

pred_labels = np.array(pred_labels)
pred_labels = pred_labels.reshape(num_batches * num_samples, sentence_lenght)
pred_labels = pred_labels.tolist()

In [None]:
model_222 = Sequential([
    Embedding(input_dim=num_tokens, output_dim=64),
    SpatialDropout1D(0.5),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.5)),
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.5),
    TimeDistributed(Dense(num_tags))
])

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model_222.compile(optimizer="adam", loss=loss, metrics=["accuracy"])
X = [[token2idx[t] for t, _ in s] for s in sentences]
X = pad_sequences(X, maxlen=maxlen, padding="post", value=unk_token_idx)

y = [[tag2idx[t] for _, t in s] for s in sentences]
y = pad_sequences(y, maxlen=maxlen, padding="post", value=unk_tag_idx)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
history = model_222.fit(X_train, np.array(y_train), validation_split=0.2, batch_size=32, epochs=3, verbose=1)

In [None]:
model_222.evaluate(X_test, np.array(y_test))

In [None]:
sample_idx = np.random.randint(0, len(X_test))
pred = model_222(tf.expand_dims(X_test[sample_idx], 0))
pred = tf.squeeze(pred, 0)
pred = tf.random.categorical(pred, num_samples=1)
pred_tags = pred.numpy().flatten()
ground_truth = y_test[sample_idx]

print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
print("_"*30)
for token, gt_tag, pred_tag in zip(X_test[sample_idx], ground_truth, pred_tags):
    print(f"{idx2token[token]:15}{idx2tag[gt_tag]}\t{idx2tag[pred_tag]}")