In [1]:
#http://alexminnaar.com/2019/08/22/ner-rnns-tensorflow.html
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import pandas as pd
import numpy as np
import os
import time

print(tf.__version__)

2.7.0-dev20210709


In [2]:
train_ds = pd.read_csv('../dataset/extraction/train_dataset.csv')
test_ds  = pd.read_csv('../dataset/extraction/test_dataset.csv')
valid_ds   = pd.read_csv('../dataset/extraction/dev_dataset.csv')


In [3]:
labels = set()


def ds2Examples(ds_name):
    examples = []
    example  = [[],[]]
    for _, row in ds_name.iterrows():
        word = row['Word']
        entity  = row['Entity']
        if pd.notna(word) and pd.notna(entity):
            example[0].append(word)
            example[1].append(entity)
            labels.add(entity)
        else:
            examples.append(example)
            example = [[],[]]
    return examples

train_examples = ds2Examples(train_ds)
test_examples  = ds2Examples(test_ds)
valid_examples = ds2Examples(valid_ds)

print(train_examples)

[[['Can', 'you', 'show', 'me', 'some', 'movie', 'listings', '?'], ['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o']], [['I', 'am', 'in', 'San', 'Jose', '.', 'Also', 'I', 'want', 'to', 'see', 'a', '3D', 'movie', '.'], ['o', 'o', 'o', 'B-location', 'I-location', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o']], [['Yes', ',', 'please', 'find', 'me', 'some', 'other', 'movies', '.', 'I', 'want', 'to', 'see', 'something', 'at', 'CineLux', 'Almaden', '.'], ['o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'B-theater_name', 'I-theater_name', 'o']], [['What', 'times', 'do', 'you', 'have', 'available', 'for', 'Curse', 'of', 'La', 'Llorona', '?', 'I', "'ve", 'been', 'wanting', 'to', 'see', 'that', '!'], ['o', 'o', 'o', 'o', 'o', 'o', 'o', 'B-movie_name', 'I-movie_name', 'I-movie_name', 'I-movie_name', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o']], [['March', '5th', '.'], ['B-show_date', 'I-show_date', 'o']], [['That', 'will', 'work', '!', 'That', 'is', 'all', 'I', 'need', '.'],

In [4]:
all_text = " ".join([" ".join(x[0]) for x in train_examples + test_examples + valid_examples])
vocab = sorted(set(all_text))

char2idx = {u:i+1 for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
label2idx = {u:i+1 for i, u in enumerate(labels)}
idx2label = np.array(labels)

print(idx2label)
print(char2idx)


{'I-theater_name', 'I-director', 'I-starring', 'I-genre', 'B-directed_by', 'I-show_date', 'B-movie_name', 'B-number_of_tickets', 'I-directed_by', 'B-show_time', 'B-genre', 'I-location', 'o', 'B-theater_name', 'B-show_type', 'B-starring', 'B-director', 'B-cast', 'B-show_date', 'I-movie_name', 'B-location', 'I-show_time', 'I-cast'}
{' ': 1, '!': 2, '&': 3, "'": 4, ',': 5, '-': 6, '.': 7, '/': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, ':': 19, ';': 20, '>': 21, '?': 22, 'A': 23, 'B': 24, 'C': 25, 'D': 26, 'E': 27, 'F': 28, 'G': 29, 'H': 30, 'I': 31, 'J': 32, 'K': 33, 'L': 34, 'M': 35, 'N': 36, 'O': 37, 'P': 38, 'Q': 39, 'R': 40, 'S': 41, 'T': 42, 'U': 43, 'V': 44, 'W': 45, 'X': 46, 'Y': 47, 'Z': 48, 'a': 49, 'b': 50, 'c': 51, 'd': 52, 'e': 53, 'f': 54, 'g': 55, 'h': 56, 'i': 57, 'j': 58, 'k': 59, 'l': 60, 'm': 61, 'n': 62, 'o': 63, 'p': 64, 'q': 65, 'r': 66, 's': 67, 't': 68, 'u': 69, 'v': 70, 'w': 71, 'x': 72, 'y': 73, 'z': 74}


In [5]:

def split_char_labels(eg):
    tokens = eg[0]
    labels = eg[1]

    input_chars = []
    output_char_labels = []

    for token,label in zip(tokens,labels):

        input_chars.extend([char for char in token])
        input_chars.extend(' ')
        output_char_labels.extend([label]*len(token))
        output_char_labels.extend('o')
        

    return [[char2idx[x] for x in input_chars[:-1]],np.array([label2idx[x] for x in output_char_labels[:-1]])]

train_formatted = [split_char_labels(eg) for eg in train_examples]
test_formatted  = [split_char_labels(eg) for eg in test_examples]
valid_formatted   = [split_char_labels(eg) for eg in valid_examples]

print(len(train_formatted))
print(len(test_formatted))
print(len(valid_formatted))


2065
781
176


In [6]:
def gen_train_series():
    for eg in train_formatted:
        yield eg[0],eg[1]

def gen_valid_series():
    for eg in valid_formatted:
        yield eg[0],eg[1]

def gen_test_series():
    for eg in test_formatted:
        yield eg[0],eg[1]
      
# create Dataset objects for train, test and validation sets  
series = tf.data.Dataset.from_generator(gen_train_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_valid = tf.data.Dataset.from_generator(gen_valid_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_test = tf.data.Dataset.from_generator(gen_test_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))

BATCH_SIZE = 128
BUFFER_SIZE=1000

# create padded batch series objects for train, test and validation sets
ds_series_batch = series.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_valid = series_valid.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_test = series_test.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)

# print example batches
for input_example_batch, target_example_batch in ds_series_batch_valid.take(1):
    print(input_example_batch)
    print(target_example_batch)


tf.Tensor(
[[31  1  4 ...  0  0  0]
 [29 66 53 ...  0  0  0]
 [36 63  1 ...  0  0  0]
 ...
 [36 63  1 ...  0  0  0]
 [31  1 49 ...  0  0  0]
 [45 56 63 ...  0  0  0]], shape=(128, 157), dtype=int32)
tf.Tensor(
[[13 13 13 ...  0  0  0]
 [13 13 13 ...  0  0  0]
 [13 13 13 ...  0  0  0]
 ...
 [13 13 13 ...  0  0  0]
 [13 13 13 ...  0  0  0]
 [13 13 13 ...  0  0  0]], shape=(128, 157), dtype=int32)


In [7]:
vocab_size = len(vocab)+1

  # The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

label_size = len(labels)  

# build LSTM model
def build_model(vocab_size,label_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                          batch_input_shape=[batch_size, None],mask_zero=True),
        tf.keras.layers.LSTM(rnn_units,
                    return_sequences=True,
                    stateful=True,
                    recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(label_size)
        ])
    return model

model = build_model(
    vocab_size = len(vocab)+1,
    label_size=len(labels)+1,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 256)          19200     
_________________________________________________________________
lstm (LSTM)                  (128, None, 1024)         5246976   
_________________________________________________________________
dense (Dense)                (128, None, 24)           24600     
Total params: 5,290,776
Trainable params: 5,290,776
Non-trainable params: 0
_________________________________________________________________


In [8]:
import os

# define loss function
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss,metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [10]:

EPOCHS=20
history = model.fit(ds_series_batch, epochs=EPOCHS, validation_data=ds_series_batch_valid,callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

preds = np.array([])
y_trues= np.array([])

# iterate through test set, make predictions based on trained model
for input_example_batch, target_example_batch in ds_series_batch_test:

    pred=model.predict_on_batch(input_example_batch)
    pred_max=tf.argmax(tf.nn.softmax(pred),2).numpy().flatten()
    y_true=target_example_batch.numpy().flatten()

    preds=np.concatenate([preds,pred_max])
    y_trues=np.concatenate([y_trues,y_true])

# remove padding from evaluation
remove_padding = [(p,y) for p,y in zip(preds,y_trues) if y!=0]

r_p = [x[0] for x in remove_padding]
r_t = [x[1] for x in remove_padding]

# print confusion matrix and classification report
print(confusion_matrix(r_p,r_t))
print(classification_report(r_p,r_t))

[[  147    10     0     1     5     0   111     0     1    23    59     6
      0     3     0   156    65     1    52]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    4     8     1   480    14     4    10   172     0     0   619     0
     17     4   108    57    15   315     5]
 [    5     0    56     0   109     0     0     0    27     0   216     0
      7    13     6    30    11     0     5]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     1     0     0     

  _warn_prf(average, modifier, msg_start, len(result))
