<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/tf_ner_bi_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#@title Download Kaggle Dataset
#@markdown Dataset: Annotated Corpus for Named Entity Recognition <br>
#@markdown https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
#@markdown ___

kaggle_dataset_id = "abhinavwalia95/entity-annotated-corpus" #@param {type:"string"}

!pip install -q kaggle
from google.colab import drive
drive.mount('/content/gdrive')

!mkdir -p ~/.kaggle
!cp /content/gdrive/My\ Drive/kaggle/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d {kaggle_dataset_id}
!ls -l /content
!unzip -o /content/entity-annotated-corpus

#@markdown ___
#@markdown Install dependencies<br>
#@markdown - seqeval: Sequence labeling evaluation (F1, precision, etc).
#@markdown - fastprogress: Progress bar for Jupyter notebooks.

!pip install -Uqq seqeval
!pip install -Uqq fastprogress

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
entity-annotated-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)
total 195268
-rw-r--r-- 1 root root  27703149 Dec 29 19:01 entity-annotated-corpus.zip
drwx------ 5 root root      4096 Dec 29 19:01 gdrive
-rw-r--r-- 1 root root 157030359 Sep 20  2019 ner.csv
-rw-r--r-- 1 root root  15208151 Sep 20  2019 ner_dataset.csv
drwxr-xr-x 1 root root      4096 Dec 21 17:29 sample_data
Archive:  /content/entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [4]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-fdd6780d-80fe-214f-4e18-ed11e1b85118)


In [5]:
import math
import pathlib
import shutil
import tempfile

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from fastprogress.fastprogress import master_bar, progress_bar
from numpy.random import seed
from seqeval.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    InputSpec,
    Layer,
    SpatialDropout1D,
    TimeDistributed,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.random import set_seed
from tensorflow_addons.text import crf_decode, crf_log_likelihood

set_seed(42)
seed(42)

logdir = pathlib.Path(tempfile.mkdtemp())/"tensorflow_logs"
shutil.rmtree(logdir, ignore_errors=True)

In [6]:
PAD_TOKEN = "PAD"
PAD_INDEX = 0
PAD_TAG = "O"

In [155]:
#@title Utils
#@markdown ```
#@markdown - build_vocab(): Extracts unique tokens and tags
#@markdown - build_indexes(): Builds the tokens and tags mapping indexes
#@markdown - decode_one_hot_tags_sequence():
#@markdown - decode_tags_sequence()
#@markdown - decode_tags_batch()
#@markdown - test_inference()
#@markdown ```

AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

def build_vocab(data):
    tokens = {token for token in data["word"]}
    tokens = {"unk" if t is math.nan or isinstance(t, float) else t for t in tokens}

    tags = {tag for tag in data["tag"]}
    tags = {"O" if t is math.nan or isinstance(t, float) else t for t in tags}
    return tokens, tags

def build_tagged_senteces(data):
    agg_func = lambda s: [(w, t) for w, t in zip(s["word"], s["tag"])]
    grouped = data.groupby("sentence_idx").apply(agg_func)
    sentences = [s for s in grouped]
    return sentences

def build_indexes(tokens, tags):
    # token[0] is reserved for padding
    token2idx = {token: idx + 1 for idx, token in enumerate(tokens)}
    idx2token = {idx + 1: token for idx, token in enumerate(tokens)}
    idx2token[PAD_INDEX] = PAD_TOKEN
    token2idx[PAD_TOKEN] = PAD_INDEX 

    tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    idx2tag = {idx: tag for idx, tag in enumerate(tags)}
    return token2idx, idx2token, tag2idx, idx2tag

def tokenize(sentences, token2idx, tag2idx, one_hot_encode_tags=True):
    X = [[token2idx[t] for t, _ in s] for s in sentences]
    X = pad_sequences(X, maxlen=maxlen, padding="post", value=PAD_INDEX)

    y = [[tag2idx[t] for _, t in s] for s in sentences]
    y = pad_sequences(y, maxlen=maxlen, padding="post", value=tag2idx[PAD_TAG])
    if one_hot_encode_tags:
        y = [to_categorical(tag_idx, num_classes=num_tags) for tag_idx in y]
    return X, np.array(y)

def decode_tags_sequence(idx_tags):
    return [idx2tag[idx] for idx in idx_tags]

def decode_one_hot_tags_sequence(tags_sequence):
    idx_tags = np.argmax(tags_sequence, axis=-1)
    return decode_tags_sequence(idx_tags) 

def decode_tags_batch(encoded_tags_sequences):
    return [decode_one_hot_tags_sequence(seq) for seq in encoded_tags_sequences]

def test_inference(inference_model, test_dataset):
    true_labels = []
    pred_labels = []

    num_test_batches = test_dataset.cardinality().numpy()
    for X_batch, y_true in progress_bar(test_dataset, total=num_test_batches):
        y_pred = inference_model.predict(X_batch)
        pred_labels.append(decode_tags_batch(y_pred))
        true_labels.append(decode_tags_batch(y_true.numpy()))

    true_labels = np.array(true_labels)
    num_batches, num_samples, sentence_lenght = true_labels.shape
    true_labels = true_labels.reshape(num_batches * num_samples, sentence_lenght)
    true_labels = true_labels.tolist()

    pred_labels = np.array(pred_labels)
    pred_labels = pred_labels.reshape(num_batches * num_samples, sentence_lenght)
    pred_labels = pred_labels.tolist()

    return true_labels, pred_labels

## Load the dataset

In [8]:
df = pd.read_csv("ner.csv", encoding="ISO-8859-1", error_bad_lines=False)
df.head()

b'Skipping line 281837: expected 25 fields, saw 34\n'


Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,pos,prev-iob,prev-lemma,prev-pos,prev-prev-iob,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,NNS,__START1__,__start1__,__START1__,__START2__,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,IN,O,thousand,NNS,__START1__,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,NNS,O,of,IN,O,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,VBP,O,demonstr,NNS,O,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,VBN,O,have,VBP,O,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O


In [9]:
data = df[["sentence_idx", "word", "tag"]]
data.head(15)

Unnamed: 0,sentence_idx,word,tag
0,1.0,Thousands,O
1,1.0,of,O
2,1.0,demonstrators,O
3,1.0,have,O
4,1.0,marched,O
5,1.0,through,O
6,1.0,London,B-geo
7,1.0,to,O
8,1.0,protest,O
9,1.0,the,O


In [10]:
data["tag"].value_counts()

O        889973
B-geo     37525
B-tim     20193
B-org     20184
I-per     17382
B-per     17011
I-org     16537
B-gpe     16392
I-geo      7409
I-tim      6298
B-art       434
B-eve       348
I-eve       297
I-art       280
I-gpe       229
B-nat       226
I-nat        76
Name: tag, dtype: int64

### Build vocab

In [11]:
tagged_sentences = build_tagged_senteces(data)
print("Sample tagged sentence")
print(repr(tagged_sentences[0][:4]), "...")

tokens, tags = build_vocab(data)
num_tokens, num_tags = len(tokens), len(tags)
print("\nStats")
print(f"Num tokens: {num_tokens:,}")
print(f"Num tags: {num_tags}")

maxlen = max([len(t) for t in tokens])
print(f"maxlen: {maxlen}")


Sample tagged sentence
[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O')] ...

Stats
Num tokens: 30,173
Num tags: 17
maxlen: 64


### Tokenize sentence and label sequences

In [12]:
token2idx, idx2token, tag2idx, idx2tag = build_indexes(tokens, tags)
X, y = tokenize(tagged_sentences, token2idx, tag2idx)

print(f"Sentences dimension: {X.shape}")
print(f"Labels dimension: {y.shape}")

Sentences dimension: (35177, 64)
Labels dimension: (35177, 64, 17)


### Split the dataset into train and test

In [13]:
VALIDATION_SIZE = int(len(X) * 0.1)
BUFFER_SIZE = 50000

dataset = tf.data.Dataset.from_tensor_slices((X, y))
train_dataset = dataset.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE).batch(64, drop_remainder=True)
train_dataset = configure_dataset(train_dataset)

test_dataset = dataset.take(VALIDATION_SIZE)
test_dataset = configure_dataset(test_dataset).batch(64, drop_remainder=True)

train_dataset.cardinality(), test_dataset.cardinality()

(<tf.Tensor: shape=(), dtype=int64, numpy=494>,
 <tf.Tensor: shape=(), dtype=int64, numpy=54>)

In [14]:
# @title Baseline: Bilateral LSTM model
EPOCHS = 3  # @param {type:"number"}
LEARNING_RATE = 1e-3  # @param {type:"number"}
EMBEDDING_DROPOUT_RATE = 0.5  # @param {type:"number"}
RECURRENT_DROPOUT_RATE = 0.5  # @param {type:"number"}

# embeddings input_dim add one extra input accounting for the padding
baseline_model = Sequential([
    Embedding(input_dim=num_tokens + 1, output_dim=64, mask_zero=True),
    SpatialDropout1D(EMBEDDING_DROPOUT_RATE),
    Bidirectional(
        LSTM(units=100, return_sequences=True, dropout=RECURRENT_DROPOUT_RATE)
    ),
    TimeDistributed(Dense(num_tags, activation="softmax")),
])
# baseline_model.summary()
adam_optimizer = Adam(learning_rate=LEARNING_RATE)
baseline_model.compile(
    optimizer=adam_optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
)
history = baseline_model.fit(train_dataset, epochs=EPOCHS, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
baseline_model.evaluate(test_dataset)



[0.11363252997398376, 0.9551070928573608]

In [16]:
X_test, y_test = next(test_dataset.take(1).as_numpy_iterator())
sample_idx = np.random.randint(0, len(X_test))

X_test = X_test[sample_idx]
y_pred = baseline_model.predict(X_test)
pred_tags = decode_tags_batch(y_pred)
pred_tags = np.squeeze(pred_tags)

y_test = y_test[sample_idx]
true_tags = decode_tags_batch([y_test])
true_tags = np.squeeze(true_tags)

print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
print("_"*30)
for token_idx, true_tag, pred_tag in zip(X_test, true_tags, pred_tags):
    print(f"{idx2token[token_idx]:15}{true_tag}\t{pred_tag}")

Word           True 	 Pred

______________________________
Pacific        I-org	B-geo
Economic       I-org	I-org
Cooperation    I-org	I-org
Business       I-org	I-org
Advisory       I-org	I-org
Council        I-org	I-org
are            O	O
holding        O	O
meetings       O	O
this           O	O
week           O	O
to             O	O
finalize       O	O
their          O	O
annual         O	O
report         O	O
for            O	O
APEC           B-org	B-gpe
leaders        O	O
who            O	O
will           O	O
hold           O	O
a              O	O
summit         O	O
on             O	O
September      B-tim	B-tim
8              I-tim	I-tim
and            O	O
9              B-tim	I-tim
.              O	O
RepresentativesO	I-org
from           O	O
the            O	O
Asia           B-org	I-geo
Pacific        I-org	B-geo
Economic       I-org	I-org
Cooperation    I-org	I-org
Business       I-org	I-org
Advisory       I-org	I-org
Council        I-org	I-org
are            O	O
holding        O	O
mee

## Baseline Model Evaluation

Achieves f1-score=0.72

In [17]:
true_labels, pred_labels = test_inference(baseline_model, test_dataset)
print(classification_report(true_labels, pred_labels))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00       121
         eve       0.00      0.00      0.00        93
         geo       0.71      0.85      0.77      4770
         gpe       0.92      0.75      0.82      2716
         nat       0.00      0.00      0.00        46
         org       0.58      0.52      0.55      2704
         per       0.68      0.63      0.65      2403
         tim       0.80      0.82      0.81      2658

   micro avg       0.73      0.72      0.73     15511
   macro avg       0.46      0.45      0.45     15511
weighted avg       0.72      0.72      0.72     15511



# Bidirectional LSTM CRF

In [111]:
tf.keras.backend.clear_session()

In [112]:
class CRF(Layer):
    """
    Conditional Random Field loss layer
    """

    def __init__(self, output_dim, sparse_target=True, name="crf_loss", **kwargs):
        """
        Args:
            output_dim (int): number of labels to tag
            sparse_target (bool): is ground-truth label one-hot encoded
            Input shape:
                (batch_size, sentence_lenght, output_dim)
            Output shape:
                (batch_size, sentence_lenght, output_dim)
        """
        super(CRF, self).__init__(name=name, **kwargs)
        self.output_dim = output_dim
        self.sparse_target = sparse_target
        self.input_spec = InputSpec(min_ndim=3)
        self.supports_masking = False
        self.sequence_lenghts = None
        self.transitions = None

    def build(self, input_shape):
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError("CRF missing dimession mismatch.")
        if f_shape[-1] != self.output_dim:
            raise ValueError("Last dimession should be equal to output_dim.")
        self.input_spec = input_spec
        self.transitions = self.add_weight(
            name="transitions",
            shape=[self.output_dim, self.output_dim],
            initializer="glorot_uniform",
            trainable=True,
        )
        self.build = True

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, inputs, sequence_lengths=None, training=False, mask=None, **kwargs):
        # print("inputs!!!", tf.print(inputs))
        # print("sequence_lengths!!!", sequence_lengths)
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        # if sequence_lengths is not None:
        #     assert len(sequence_lengths.shape) == 2
        #     assert tf.convert_to_tensor(sequence_lengths).dtype == "int32"
        #     seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
        #     assert seq_len_shape[1] == 1
        #     self.sequence_lengths = K.flatten(sequence_lengths)
        # else:
        #     self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
        #         tf.shape(inputs)[1]
        #     )

        # print("sequence_lengths222", self.sequence_lengths)
        # w = tf.cast(self.sequence_lengths, dtype=tf.int32)
        # print("www", w)
        # # tf.print("w:", w, {2: w * 2})

        if mask is None:
            raw_input_shape = tf.slice(tf.shape(inputs), [0], [2])
            mask = tf.ones(raw_input_shape)
        self.sequence_lengths = K.sum(K.cast(mask, "int32"), axis=-1)

        # print("mask", mask)
        # print("K.sum", self.sequence_lengths)

        viterbi_sequence, _ = crf_decode(
            sequences, self.transitions, self.sequence_lengths
        )
        output = K.one_hot(viterbi_sequence, self.output_dim)
        return K.in_train_phase(sequences, output)

    @property
    def loss(self):
        def crf_loss(y_true, y_pred):
            # print("inside crf_loss", self.sequence_lengths)
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            log_likelihood, self.transitions = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32)
                if self.sparse_target
                else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)

        return crf_loss

    @property
    def accuracy(self):
        def viterbi_accuracy(y_true, y_pred):
            mask = K.cast(K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            sequence_lenghts = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lenghts)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, "int32")
            y_true = K.cast(y_true, "int32")
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)

        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).asser_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        config = {
            "output_dim": self.output_dim,
            "sparse_target": self.sparse_target,
            "supports_masking": self.supports_masking,
            "transitions": K.eval(self.transitions),
        }
        return config

In [98]:
# @title Challenger: BiLSTM model with CRF loss
EPOCHS =   10# @param {type:"number"}
LEARNING_RATE = 1e-3  # @param {type:"number"}
EMBEDDING_DROPOUT_RATE = 0.5  # @param {type:"number"}
RECURRENT_DROPOUT_RATE = 0.5  # @param {type:"number"}

# embeddings input_dim add one extra input accounting for the padding
bi_lstm_crf_model = Sequential([
    tf.keras.layers.Input(shape=(maxlen,)),
    Embedding(input_dim=num_tokens + 1, output_dim=64, input_length=maxlen, mask_zero=True),
    SpatialDropout1D(EMBEDDING_DROPOUT_RATE),
    Bidirectional(
        LSTM(units=100, return_sequences=True, dropout=RECURRENT_DROPOUT_RATE)
    ),
    TimeDistributed(Dense(num_tags, activation=None)),
    CRF(num_tags, name="crf"),
])
# model.summary()
adam_optimizer = Adam(learning_rate=LEARNING_RATE)
crf_loss = bi_lstm_crf_model.layers[-1].loss
crf_metrics = bi_lstm_crf_model.layers[-1].accuracy
bi_lstm_crf_model.compile(
    optimizer=adam_optimizer, loss=crf_loss, metrics=[crf_metrics]
)
bi_lstm_crf_model.summary()
history = bi_lstm_crf_model.fit(train_dataset, epochs=EPOCHS, verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 64, 64)            1931136   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 64, 64)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 64, 200)           132000    
_________________________________________________________________
time_distributed (TimeDistri (None, 64, 17)            3417      
_________________________________________________________________
crf (CRF)                    (None, 64, 17)            289       
Total params: 2,066,842
Trainable params: 2,066,842
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [118]:
bi_lstm_crf_model.evaluate(test_dataset)



[79.22120666503906, 0.014485676772892475]

In [20]:
# #@title Train/Test dataset split
# batch_size = 64 #@param {type:"number"}

# X, y = tokenize(tagged_sentences, token2idx, tag2idx, one_hot_encode_tags=False)

# print(f"Sentences dimension: {X.shape}")
# print(f"Labels dimension: {y.shape}")

# dataset = tf.data.Dataset.from_tensor_slices((X, y))
# train_dataset = (
#     dataset.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)
# )
# train_dataset = configure_dataset(train_dataset)

# test_dataset = dataset.take(VALIDATION_SIZE)
# test_dataset = configure_dataset(test_dataset).batch(batch_size, drop_remainder=True)

# print("-" * 10)
# train_batches_size = train_dataset.cardinality().numpy()
# print("Train batches size:", train_batches_size)

# test_batches_size = test_dataset.cardinality().numpy()
# print("Test batches size:", test_batches_size)

In [21]:
# SPATIAL_DROPOUT_RATE = 0.5
# RECURRENT_DROPOUT_RATE = 0.5


# class BiLstmCRF(Model):
#     def __init__(
#         self,
#         vocab_dim,
#         tag_dim,
#         max_seq_len,
#         embedding_dim=128,
#         lstm_men_dim=200,
#         name="BiLstmCRF",
#         **kwargs
#     ):
#         super(NerBiLstmCRF, self).__init__(name=name, **kwargs)
#         self.embedding = Embedding(vocab_dim, embedding_dim)
#         self.dropout = SpatialDropout1D(SPATIAL_DROPOUT_RATE)
#         self.lstm = LSTM(
#             lstm_men_dim,
#             return_sequences=True,
#             dropout=RECURRENT_DROPOUT_RATE
#         )
#         self.bi_lstm = Bidirectional(self.lstm)
#         self.classifier = Dense(tag_dim, activation="softmax")
#         self.time_distributed_classifier = TimeDistributed(self.classifier)
#         self.sequence_lengths = tf.expand_dims(max_seq_len, axis=0)
#         # let the crf layer to initialize the transition_params for us
#         self.transition_params = None 

#     def call(self, inputs, labels=None, training=False):
#         token_embeddings = self.embedding(inputs)
#         token_embeddings = self.dropout(token_embeddings, training)
#         logits = self.bi_lstm(token_embeddings)
#         logits = self.time_distributed_classifier(logits)

#         if labels is not None:
#             label_sequences = labels
#             log_likelihood, self.transition_params = tf_ad.text.crf_log_likelihood(
#                 logits, labels, self.sequence_lengths, transition_params=self.transition_params
#             )
#             return logits, log_likelihood
#         return logits

In [23]:
# #@title Train Bilateral LSTM model with CRF
# EPOCHS  =  60#@param {type:"number"}
# LEARNING_RATE = 1e-3 #@param {type:"number"}

# bi_lstm_crf_model = BiLstmCRF(num_tokens, num_tags, max_seq_len=maxlen)
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# train_loss_metric = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)

# best_acc = 0
# step = 0

# epoch_bar = master_bar(range(epochs))
# for epoch in epoch_bar:
#     for tokens_batch, labels_batch in progress_bar(train_dataset, total=train_batches_size, parent=epoch_bar):
#         loss, logits = train_step_fn(
#             bi_lstm_crf_model, optimizer, tokens_batch, labels_batch
#         )
#         train_loss_metric(loss)
#         epoch_bar.child.comment = f"training loss : {train_loss_metric.result():.3f}"
#         if step % 20 == 0:
#             accuracy = 0
#     tf.summary.scalar('training loss', train_loss_metric.result(), step=epoch)
#     epoch_bar.write(f"Epoch {epoch} - train loss: {train_loss_metric.result():.3f} valid loss 0 accuracy: 0%")
#     train_loss_metric.reset_states()


In [None]:
tf.summary.flush()

In [None]:
# model_222 = Sequential([
#     Embedding(input_dim=num_tokens, output_dim=64),
#     SpatialDropout1D(0.5),
#     Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.5)),
#     LSTM(units=100, return_sequences=True, recurrent_dropout=0.5),
#     TimeDistributed(Dense(num_tags))
# ])

# def loss(labels, logits):
#     return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# model_222.compile(optimizer="adam", loss=loss, metrics=["accuracy"])
# X = [[token2idx[t] for t, _ in s] for s in sentences]
# X = pad_sequences(X, maxlen=maxlen, padding="post", value=unk_token_idx)

# y = [[tag2idx[t] for _, t in s] for s in sentences]
# y = pad_sequences(y, maxlen=maxlen, padding="post", value=unk_tag_idx)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# history = model_222.fit(X_train, np.array(y_train), validation_split=0.2, batch_size=32, epochs=3, verbose=1)

In [None]:
# sample_idx = np.random.randint(0, len(X_test))
# pred = model_222(tf.expand_dims(X_test[sample_idx], 0))
# pred = tf.squeeze(pred, 0)
# pred = tf.random.categorical(pred, num_samples=1)
# pred_tags = pred.numpy().flatten()
# ground_truth = y_test[sample_idx]

# print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
# print("_"*30)
# for token, gt_tag, pred_tag in zip(X_test[sample_idx], ground_truth, pred_tags):
#     print(f"{idx2token[token]:15}{idx2tag[gt_tag]}\t{idx2tag[pred_tag]}")

In [100]:
!pip install tf2crf

Collecting tf2crf
  Downloading https://files.pythonhosted.org/packages/eb/f5/e9f972be845a2b0ea93c76d26ed6b7bde599a48f70554a30a528117731c8/tf2crf-0.1.29-py2.py3-none-any.whl
Installing collected packages: tf2crf
Successfully installed tf2crf-0.1.29


In [117]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense
from tensorflow.keras.models import Model
#from tf2crf import CRF, ModelWithCRFLoss

inputs = Input(shape=(None,), dtype='int32')
output = Embedding(100, 40, trainable=True, mask_zero=True)(inputs)
output = Bidirectional(GRU(64, return_sequences=True))(output)
output = Dense(9, activation=None)(output)
crf = CRF(output_dim=9, sparse_target=False)(output)
base_model = Model(inputs, crf)
#model = ModelWithCRFLoss(base_model)
#model.compile(optimizer='adam')

adam_optimizer = Adam(learning_rate=LEARNING_RATE)
crf_loss = base_model.layers[-1].loss
crf_metrics = base_model.layers[-1].accuracy
base_model.compile(
    optimizer=adam_optimizer, loss=crf_loss, metrics=[crf_metrics]
)

x = [[5, 2, 3] * 3] * 10
y = [[1, 2, 3] * 3] * 10

base_model.fit(x=x, y=y, epochs=20, batch_size=2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7faa6d6c8630>

In [139]:
tf.keras.backend.clear_session()

import tensorflow as tf
import tf2crf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense
from tensorflow.keras.models import Model
#from tf2crf import CRF, ModelWithCRFLoss

inputs = Input(shape=(None,), dtype='int32')
output = Embedding(input_dim=num_tokens + 1, output_dim=64, input_length=maxlen, mask_zero=True, trainable=True)(inputs)
output = Bidirectional(LSTM(units=100, return_sequences=True, dropout=RECURRENT_DROPOUT_RATE))(output)
output = Dense(num_tags, activation=None)(output)
crf = tf2crf.CRF(dtype='float32')
output = crf(output)
x_base_model = Model(inputs, output)
x_model = ModelWithCRFLoss(x_base_model)
x_model.compile(optimizer='adam')

token2idx, idx2token, tag2idx, idx2tag = build_indexes(tokens, tags)
X, y = tokenize(tagged_sentences, token2idx, tag2idx, one_hot_encode_tags=False)

print(f"Sentences dimension: {X.shape}")
print(f"Labels dimension: {y.shape}")

VALIDATION_SIZE = int(len(X) * 0.1)
BUFFER_SIZE = 50000

dataset = tf.data.Dataset.from_tensor_slices((X, y))
train_dataset = dataset.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE).batch(64, drop_remainder=True)
train_dataset = configure_dataset(train_dataset)

test_dataset = dataset.take(VALIDATION_SIZE)
test_dataset = configure_dataset(test_dataset).batch(64, drop_remainder=True)

train_dataset.cardinality(), test_dataset.cardinality()

x_model.fit(train_dataset, epochs=3, verbose=1)

Sentences dimension: (35177, 64)
Labels dimension: (35177, 64)
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7faa6bea8ac8>

In [140]:
x_model.evaluate(test_dataset)



[4.671042442321777, 0.9560539722442627]

In [148]:
X_test, y_test = next(test_dataset.take(1).as_numpy_iterator())
sample_idx = np.random.randint(0, len(X_test))

X_test = X_test[sample_idx]
y_pred = x_model.predict(X_test)

# pred_tags = decode_tags_batch(y_pred)
for t, p in zip(y_test[sample_idx], np.squeeze(y_pred[0])):
    print(t, p)
#pred_tags = np.squeeze(y_pred[0])

# y_test = y_test[sample_idx]
# true_tags = decode_tags_batch([y_test])
# true_tags = np.squeeze(true_tags)

# print(f"{'Word':15}{'True':5}\t {'Pred'}\n")
# print("_"*30)
# for token_idx, true_tag, pred_tag in zip(X_test, true_tags, pred_tags):
#     print(f"{idx2token[token_idx]:15}{true_tag}\t{pred_tag}")

16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16
16 16


In [156]:
true_tags = decode_tags_sequence(np.squeeze(y_pred[0]))