<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/tf_ner_bi_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Download Kaggle Dataset
#@markdown Dataset: Annotated Corpus for Named Entity Recognition <br>
#@markdown https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
#@markdown ___

kaggle_dataset_id = "abhinavwalia95/entity-annotated-corpus" #@param {type:"string"}

!pip install -q kaggle
from google.colab import drive
drive.mount('/content/gdrive')
!mkdir -p ~/.kaggle
!cp /content/gdrive/My\ Drive/kaggle/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d {kaggle_dataset_id}
!ls -l /content
!unzip -o /content/entity-annotated-corpus

#@markdown ___
#@markdown Install dependencies<br>
#@markdown - seqeval
!pip install -Uqq seqeval

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
entity-annotated-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)
total 195268
-rw-r--r-- 1 root root  27703149 Dec 26 05:40 entity-annotated-corpus.zip
drwx------ 5 root root      4096 Dec 26 05:40 gdrive
-rw-r--r-- 1 root root 157030359 Sep 20  2019 ner.csv
-rw-r--r-- 1 root root  15208151 Sep 20  2019 ner_dataset.csv
drwxr-xr-x 1 root root      4096 Dec 21 17:29 sample_data
Archive:  /content/entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [None]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-9cbd5fbf-ff72-0849-7da7-51e586f90cf7)


In [None]:
import math
import pathlib
import shutil
import tempfile

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tf_ad
from numpy.random import seed
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Dense,
    Embedding,
    TimeDistributed,
    Dropout,
    SpatialDropout1D
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.random import set_seed

set_seed(42)
seed(42)

logdir = pathlib.Path(tempfile.mkdtemp())/"tensorflow_logs"
shutil.rmtree(logdir, ignore_errors=True)

In [None]:
#@title Utils
#@markdown ```
#@markdown - build_vocab(): Extracts unique tokens and tags
#@markdown - build_indexes(): Builds the tokens and tags mapping indexes
#@markdown ```

AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

def build_vocab(data):
    tokens = {token for token in data["word"]}
    tokens = {"unk" if t is math.nan or isinstance(t, float) else t for t in tokens}
    
    tags = {tag for tag in data["tag"]}
    tags = {"unk" if t is math.nan or isinstance(t, float) else t for t in tags}
    return tokens, tags

def build_tagged_senteces(data):
    agg_func = lambda s: [(w, t) for w, t in zip(s["word"], s["tag"])]
    grouped = data.groupby("sentence_idx").apply(agg_func)
    sentences = [s for s in grouped]
    return sentences

def build_indexes(tokens, tags):
    token2idx = {token: idx for idx, token in enumerate(tokens)}
    idx2token = {idx: token for idx, token in enumerate(tokens)}
    tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    idx2tag = {idx: tag for idx, tag in enumerate(tags)}
    return token2idx, idx2token, tag2idx, idx2tag

def tokenize(sentences, token2idx, tag2idx, one_hot_encode_tags=True):
    unk_token_idx, unk_tag_idx = token2idx['unk'], tag2idx['unk']

    X = [[token2idx[t] for t, _ in s] for s in sentences]
    X = pad_sequences(X, maxlen=maxlen, padding="post", value=unk_token_idx)

    y = [[tag2idx[t] for _, t in s] for s in sentences]
    y = pad_sequences(y, maxlen=maxlen, padding="post", value=unk_tag_idx)
    if one_hot_encode_tags:
        y = [to_categorical(tag_idx, num_classes=num_tags) for tag_idx in y]
    return X, np.array(y)


## Load the dataset

In [None]:
df = pd.read_csv("ner.csv", encoding="ISO-8859-1", error_bad_lines=False)
df.head()

b'Skipping line 281837: expected 25 fields, saw 34\n'


Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,pos,prev-iob,prev-lemma,prev-pos,prev-prev-iob,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,NNS,__START1__,__start1__,__START1__,__START2__,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,IN,O,thousand,NNS,__START1__,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,NNS,O,of,IN,O,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,VBP,O,demonstr,NNS,O,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,VBN,O,have,VBP,O,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O


In [None]:
data = df[["sentence_idx", "word", "tag"]]
data.head(15)

Unnamed: 0,sentence_idx,word,tag
0,1.0,Thousands,O
1,1.0,of,O
2,1.0,demonstrators,O
3,1.0,have,O
4,1.0,marched,O
5,1.0,through,O
6,1.0,London,B-geo
7,1.0,to,O
8,1.0,protest,O
9,1.0,the,O


In [None]:
data["tag"].value_counts()

O        889973
B-geo     37525
B-tim     20193
B-org     20184
I-per     17382
B-per     17011
I-org     16537
B-gpe     16392
I-geo      7409
I-tim      6298
B-art       434
B-eve       348
I-eve       297
I-art       280
I-gpe       229
B-nat       226
I-nat        76
Name: tag, dtype: int64

### Build vocab

In [None]:
tagged_sentences = build_tagged_senteces(data)
print("Sample tagged sentence")
print(repr(tagged_sentences[0][:4]), "...")

tokens, tags = build_vocab(data)
num_tokens, num_tags = len(tokens), len(tags)
print("\nStats")
print(f"Num tokens: {num_tokens:,}")
print(f"Num tags: {num_tags}")

maxlen = max([len(t) for t in tokens])
print(f"maxlen: {maxlen}")


Sample tagged sentence
[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O')] ...

Stats
Num tokens: 30,173
Num tags: 18
maxlen: 64


### Tokenize sentence and label sequences

In [None]:
token2idx, idx2token, tag2idx, idx2tag = build_indexes(tokens, tags)
X, y = tokenize(tagged_sentences, token2idx, tag2idx)

print(f"Sentences dimension: {X.shape}")
print(f"Labels dimension: {y.shape}")

Sentences dimension: (35177, 64)
Labels dimension: (35177, 64, 18)


### Split the dataset into train and test

In [None]:
VALIDATION_SIZE = int(len(X) * 0.1)
AUTOTUNE = tf.data.experimental.AUTOTUNE
BUFFER_SIZE = 50000

dataset = tf.data.Dataset.from_tensor_slices((X, y))
train_dataset = dataset.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE).batch(64, drop_remainder=True)
train_dataset = configure_dataset(train_dataset)

test_dataset = dataset.take(VALIDATION_SIZE)
test_dataset = configure_dataset(test_dataset).batch(64, drop_remainder=True)

train_dataset.cardinality(), test_dataset.cardinality()

(<tf.Tensor: shape=(), dtype=int64, numpy=494>,
 <tf.Tensor: shape=(), dtype=int64, numpy=54>)

In [None]:
model = Sequential([
    Embedding(input_dim=num_tokens, output_dim=64),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(num_tags, activation="softmax"))
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          1931072   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, None, 64)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 200)         132000    
_________________________________________________________________
time_distributed (TimeDistri (None, None, 18)          3618      
Total params: 2,066,690
Trainable params: 2,066,690
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# history = model.fit(train_dataset, epochs=3, verbose=1)
history = model.fit(train_dataset, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
model.evaluate(test_dataset)



[0.11380528658628464, 0.9701877236366272]

In [None]:
X_test, y_test = next(test_dataset.take(1).as_numpy_iterator())
sample_idx = np.random.randint(0, len(X_test))
X_test = X_test[sample_idx]
y_test = y_test[sample_idx]
pred = model.predict(X_test)
pred_tags = np.argmax(pred, axis=-1).flatten()
ground_truth = np.argmax(y_test, axis=1)
X_test.shape
print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
print("_"*30)
for token, gt_tag, pred_tag in zip(X_test, ground_truth, pred_tags):
    print(f"{idx2token[token]:15}{idx2tag[gt_tag]}\t{idx2tag[pred_tag]}")

Word           True 	 Pred

______________________________
The            O	O
London         B-geo	B-geo
march          O	O
came           O	O
ahead          O	O
of             O	O
anti-war       O	O
protests       O	O
today          O	B-tim
in             O	O
other          O	O
cities         O	O
,              O	O
including      O	O
Rome           B-geo	unk
,              O	O
Paris          B-geo	B-geo
,              O	O
and            O	O
Madrid         B-geo	B-gpe
.              O	O
The            O	O
London         B-geo	B-geo
march          O	O
came           O	O
ahead          O	O
of             O	O
anti-war       O	O
protests       O	O
today          O	B-tim
in             O	O
other          O	O
cities         O	O
,              O	O
including      O	O
Rome           B-geo	unk
,              O	O
Paris          B-geo	B-geo
,              O	O
and            O	O
Madrid         B-geo	B-gpe
.              O	O
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk   

In [55]:
y_true = [[idx2tag[tag] for tag in ground_truth]]
y_pred = [[idx2tag[tag] for tag in pred_tags]]

In [56]:
# from seqeval.metrics import classification_report
# classification_report(ground_truth, pred_tags)

from seqeval.metrics import f1_score
f1_score(y_true, y_pred)



0.5

In [61]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         geo       1.00      0.50      0.67         8
         gpe       0.00      0.00      0.00         0
          nk       0.33      1.00      0.50         1
         tim       0.00      0.00      0.00         0

   micro avg       0.45      0.56      0.50         9
   macro avg       0.33      0.38      0.29         9
weighted avg       0.93      0.56      0.65         9



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
model_222 = Sequential([
    Embedding(input_dim=num_tokens, output_dim=64),
    SpatialDropout1D(0.5),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.5)),
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.5),
    TimeDistributed(Dense(num_tags))
])

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model_222.compile(optimizer="adam", loss=loss, metrics=["accuracy"])
X = [[token2idx[t] for t, _ in s] for s in sentences]
X = pad_sequences(X, maxlen=maxlen, padding="post", value=unk_token_idx)

y = [[tag2idx[t] for _, t in s] for s in sentences]
y = pad_sequences(y, maxlen=maxlen, padding="post", value=unk_tag_idx)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
history = model_222.fit(X_train, np.array(y_train), validation_split=0.2, batch_size=32, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
model_222.evaluate(X_test, np.array(y_test))



[0.06481315195560455, 0.9811416268348694]

In [None]:
sample_idx = np.random.randint(0, len(X_test))
pred = model_222(tf.expand_dims(X_test[sample_idx], 0))
pred = tf.squeeze(pred, 0)
pred = tf.random.categorical(pred, num_samples=1)
pred_tags = pred.numpy().flatten()
ground_truth = y_test[sample_idx]

print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
print("_"*30)
for token, gt_tag, pred_tag in zip(X_test[sample_idx], ground_truth, pred_tags):
    print(f"{idx2token[token]:15}{idx2tag[gt_tag]}\t{idx2tag[pred_tag]}")

Word           True 	 Pred

______________________________
In             O	O
Baghdad        B-geo	B-geo
,              O	O
militants      O	O
freed          O	O
the            O	O
brother        O	O
of             O	O
Interior       O	B-org
Minister       O	I-org
Bayan          B-per	I-per
Jabor          I-per	I-per
,              O	O
who            O	O
was            O	O
kidnapped      O	O
one            B-tim	O
day            I-tim	B-tim
earlier        O	O
near           O	O
the            O	O
Sadr           B-geo	B-geo
City           I-geo	I-geo
district       O	O
.              O	O
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            

# Bidirectional LSTM CRF