<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/tf_name_entity_recognition_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Download Kaggle Dataset
#@markdown Dataset: Annotated Corpus for Named Entity Recognition <br>
#@markdown https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
#@markdown ___

kaggle_dataset_id = "abhinavwalia95/entity-annotated-corpus" #@param {type:"string"}

!pip install -q kaggle
from google.colab import drive
drive.mount('/content/gdrive')
!mkdir -p ~/.kaggle
!cp /content/gdrive/My\ Drive/kaggle/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d {kaggle_dataset_id}
!ls -l /content
!unzip -o /content/entity-annotated-corpus

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
entity-annotated-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)
total 195268
-rw-r--r-- 1 root root  27703149 Dec 24 07:44 entity-annotated-corpus.zip
drwx------ 5 root root      4096 Dec 24 07:44 gdrive
-rw-r--r-- 1 root root 157030359 Sep 20  2019 ner.csv
-rw-r--r-- 1 root root  15208151 Sep 20  2019 ner_dataset.csv
drwxr-xr-x 1 root root      4096 Dec 21 17:29 sample_data
Archive:  /content/entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [2]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-d60ce961-179e-66ef-3c51-3b5e2c11c2e4)


In [17]:
import math
import pathlib
import shutil
import tempfile

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tf_ad
from numpy.random import seed
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Dense,
    Embedding,
    TimeDistributed,
    Dropout,
    SpatialDropout1D
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.random import set_seed

set_seed(42)
seed(42)

logdir = pathlib.Path(tempfile.mkdtemp())/"tensorflow_logs"
shutil.rmtree(logdir, ignore_errors=True)

## Load the dataset

In [4]:
df = pd.read_csv("ner.csv", encoding="ISO-8859-1", error_bad_lines=False)
df.head()

b'Skipping line 281837: expected 25 fields, saw 34\n'


Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,pos,prev-iob,prev-lemma,prev-pos,prev-prev-iob,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,NNS,__START1__,__start1__,__START1__,__START2__,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,IN,O,thousand,NNS,__START1__,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,NNS,O,of,IN,O,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,VBP,O,demonstr,NNS,O,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,VBN,O,have,VBP,O,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O


In [5]:
data = df[["sentence_idx", "word", "tag"]]
data.head(15)

Unnamed: 0,sentence_idx,word,tag
0,1.0,Thousands,O
1,1.0,of,O
2,1.0,demonstrators,O
3,1.0,have,O
4,1.0,marched,O
5,1.0,through,O
6,1.0,London,B-geo
7,1.0,to,O
8,1.0,protest,O
9,1.0,the,O


In [6]:
data["tag"].value_counts()

O        889973
B-geo     37525
B-tim     20193
B-org     20184
I-per     17382
B-per     17011
I-org     16537
B-gpe     16392
I-geo      7409
I-tim      6298
B-art       434
B-eve       348
I-eve       297
I-art       280
I-gpe       229
B-nat       226
I-nat        76
Name: tag, dtype: int64

In [7]:
agg_func = lambda s: [(w, t) for w, t in zip(s["word"], s["tag"])]
data.groupby("sentence_idx").apply(agg_func)

sentence_idx
1.0        [(Thousands, O), (of, O), (demonstrators, O), ...
2.0        [(Families, O), (of, O), (soldiers, O), (kille...
3.0        [(They, O), (marched, O), (from, O), (the, O),...
4.0        [(Police, O), (put, O), (the, O), (number, O),...
5.0        [(The, O), (protest, O), (comes, O), (on, O), ...
                                 ...                        
47955.0    [(Indian, B-gpe), (border, O), (security, O), ...
47956.0    [(Indian, B-gpe), (officials, O), (said, O), (...
47957.0    [(Two, O), (more, O), (landed, O), (in, O), (f...
47958.0    [(They, O), (say, O), (not, O), (all, O), (of,...
47959.0    [(Indian, B-gpe), (forces, O), (said, O), (the...
Length: 35177, dtype: object

In [8]:
class SentenceGetter():
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"], s["tag"])]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[f"Sentence: {self.n_sent}"]
            self.n_sent += 1
            return s
        except:
            return None

getter = SentenceGetter(data)
sentences = getter.sentences
print(repr(sentences[0][:4]), "...")

[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O')] ...


In [9]:
tokens = data["word"].unique()
tokens = {"unk" if t is math.nan or isinstance(t, float) else t for t in tokens}
num_tokens = len(tokens)

maxlen = max([len(t) for t in tokens])

tags = data["tag"].unique()
tags = {"unk" if t is math.nan or isinstance(t, float) else t for t in tags}
num_tags = len(tags)

print(f"Num tokenks: {num_tokens:,}  Num tags: {num_tags}  maxlen: {maxlen}")

Num tokenks: 30,173  Num tags: 18  maxlen: 64


In [10]:
token2idx = {token: idx for idx, token in enumerate(tokens)}
idx2token = {idx: token for idx, token in enumerate(tokens)}
tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for idx, tag in enumerate(tags)}

unk_token_idx = token2idx['unk']
unk_tag_idx = tag2idx['unk']

In [11]:
X = [[token2idx[t] for t, _ in s] for s in sentences]
X = pad_sequences(X, maxlen=maxlen, padding="post", value=unk_token_idx)

y = [[tag2idx[t] for _, t in s] for s in sentences]
y = pad_sequences(y, maxlen=maxlen, padding="post", value=unk_tag_idx)
y = [to_categorical(tag_idx, num_classes=num_tags) for tag_idx in y]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [19]:
model = Sequential([
    Embedding(input_dim=num_tokens, output_dim=64),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(num_tags, activation="softmax"))
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          1931072   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 64)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 200)         132000    
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 18)          3618      
Total params: 2,066,690
Trainable params: 2,066,690
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [28]:
history = model.fit(X_train, np.array(y_train), validation_split=0.2, batch_size=32, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [29]:
model.evaluate(X_test, np.array(y_test))



[0.056382834911346436, 0.9834822416305542]

In [55]:
sample_idx = np.random.randint(0, len(X_test))
pred = model.predict(X_test[sample_idx])
pred_tags = np.argmax(pred, axis=-1).flatten()
ground_truth = np.argmax(y_test[sample_idx], axis=1)

print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
print("_"*30)
for token, gt_tag, pred_tag in zip(X_test[sample_idx], ground_truth, pred_tags):
    print(f"{idx2token[token]:15}{idx2tag[gt_tag]}\t{idx2tag[pred_tag]}")

Word           True 	 Pred

______________________________
The            O	O
military       O	O
also           O	O
said           O	O
Iraqi          B-gpe	B-gpe
soldiers       O	O
recovered      O	O
Iranian-made   O	B-geo
rockets        O	O
and            O	O
other          O	O
weapons        O	O
in             O	O
Baghdad        B-geo	B-geo
's             O	O
Sadr           B-geo	B-geo
City           I-geo	I-geo
district       O	O
on             O	O
Monday         B-tim	B-tim
.              O	O
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            

In [57]:
model_222 = Sequential([
    Embedding(input_dim=num_tokens, output_dim=64),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(num_tags))
])
model_222.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
X = [[token2idx[t] for t, _ in s] for s in sentences]
X = pad_sequences(X, maxlen=maxlen, padding="post", value=unk_token_idx)

y = [[tag2idx[t] for _, t in s] for s in sentences]
y = pad_sequences(y, maxlen=maxlen, padding="post", value=unk_tag_idx)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
history = model_222.fit(X_train, np.array(y_train), validation_split=0.2, batch_size=32, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [58]:
model_222.evaluate(X_test, np.array(y_test))



[0.16509294509887695, 0.9749813675880432]

In [75]:
sample_idx = np.random.randint(0, len(X_test))
pred = model_222(tf.expand_dims(X_test[sample_idx], 0))
pred = tf.squeeze(pred, 0)
pred = tf.random.categorical(pred, num_samples=1)
pred_tags = pred.numpy().flatten()
ground_truth = y_test[sample_idx]

print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
print("_"*30)
for token, gt_tag, pred_tag in zip(X_test[sample_idx], ground_truth, pred_tags):
    print(f"{idx2token[token]:15}{idx2tag[gt_tag]}\t{idx2tag[pred_tag]}")

Word           True 	 Pred

______________________________
The            O	B-nat
United         B-geo	I-tim
States         I-geo	O
has            O	B-per
been           O	O
pressing       O	I-eve
for            O	B-org
the            O	B-per
expansion      O	I-art
to             O	O
relieve        O	B-gpe
pressure       O	B-geo
on             O	I-per
stretched      O	I-org
American       B-gpe	B-art
forces         O	unk
.              O	B-org
unk            unk	I-tim
unk            unk	B-per
unk            unk	B-per
unk            unk	I-nat
unk            unk	unk
unk            unk	O
unk            unk	I-tim
unk            unk	B-org
unk            unk	I-per
unk            unk	B-nat
unk            unk	I-art
unk            unk	I-gpe
unk            unk	I-tim
unk            unk	unk
unk            unk	B-per
unk            unk	I-nat
unk            unk	I-nat
unk            unk	I-org
unk            unk	unk
unk            unk	unk
unk            unk	unk
unk            unk	B-per
unk            u

In [70]:
predictions

<tf.Tensor: shape=(64, 1), dtype=int64, numpy=
array([[ 9],
       [14],
       [17],
       [ 9],
       [ 9],
       [ 6],
       [15],
       [16],
       [17],
       [11],
       [11],
       [ 9],
       [ 2],
       [ 8],
       [ 0],
       [ 2],
       [15],
       [ 1],
       [ 2],
       [ 4],
       [ 8],
       [12],
       [13],
       [15],
       [ 6],
       [ 5],
       [ 0],
       [ 6],
       [ 8],
       [13],
       [ 9],
       [ 6],
       [ 3],
       [11],
       [16],
       [15],
       [ 1],
       [15],
       [17],
       [ 5],
       [ 6],
       [ 6],
       [ 9],
       [ 8],
       [ 9],
       [ 5],
       [ 4],
       [11],
       [ 9],
       [13],
       [16],
       [ 0],
       [ 6],
       [11],
       [15],
       [ 4],
       [ 4],
       [14],
       [17],
       [12],
       [ 2],
       [ 7],
       [ 7],
       [ 7]])>

In [64]:
pred_tags

array([-0.03643083,  0.03061048,  0.02354746, ...,  0.0033778 ,
       -0.04422568,  0.18118353], dtype=float32)

InvalidArgumentError: ignored