# Evaluate Model Performance on the Test Set 

In [1]:
import numpy as np
import tensorflow as tf

from data import load_file, process_data, create_data_loader, preProcessingIWSLT12

from transformers import BertTokenizer
from transformers import TFBertForMaskedLM

from model import create_model

from datetime import datetime
import os
import json

import sys

In [2]:
# Punctuation label -> integer class id, listed in class-id order.
# A reduced two-class alternative would be ('O': 0, 'PERIOD': 1); it is not
# used in this notebook.
punctuation_labels = ('O', 'COMMA', 'PERIOD', 'QUESTION')
punctuation_enc = {
    label: class_id for class_id, label in enumerate(punctuation_labels)
}

### Hyper-parameters

In [3]:
# Number of leading test segments kept for the quick evaluation below.
n = 10

# Model / data dimensions.
vocab_size = 30522   # per-token width used when flattening the BERT output
segment_size = 32    # token ids per input segment
batch_size = 5

# Training-related settings; not read by any cell visible in this notebook.
train_layer_ind = -2  # 0 for all model, -2 for only top layer
num_epochs = 2

# Bundle of the dimensions above, keyed by name.
hyperparameters = dict(
    vocab_size=vocab_size,
    segment_size=segment_size,
    batch_size=batch_size,
)

In [4]:
# Name of the corpus and location of its raw test extract.
data_name = "IWSLT12"
# NOTE(review): this evaluates to 'DataIWSLT12/extractTest_01.txt' (there is no
# separator between 'Data' and the corpus name) — confirm that this is the
# intended directory.
testSet_01 = 'Data' + data_name + '/extractTest_01.txt'

# From sentences to a list of words + punctuation.
# Presumably this step writes './Data/testSet_02.txt', which is read next —
# verify against data.py.
preProcessingIWSLT12(testSet_01)

# Load the pre-processed test set (a single read is enough).
data_test = load_file('./Data/testSet_02.txt')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Turn the loaded data into (segments, labels); labels become a NumPy array.
X_test, y_test = process_data(data_test, tokenizer, punctuation_enc, segment_size)
y_test = np.asarray(y_test)

### Build the dataset

In [5]:
# Keep only the first n test segments and their labels for a quick check.
extract_X, extract_y = X_test[:n], y_test[:n]

In [6]:
# Pair every segment with its label, then group the pairs into batches.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((extract_X, extract_y))
    .batch(batch_size)
)

### Build the model

In [7]:
# Network: pre-trained BERT masked-LM followed by one dense classification layer.
# `shape` is given as a 1-tuple (sequence length only; batch dimension is implicit).
bert_input = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bert_input')
# [0] keeps the first output of the masked-LM model; it is reshaped below as
# segment_size * vocab_size values per example.
x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
# One raw logit per punctuation class (no activation: the loss uses from_logits=True).
dense_out = tf.keras.layers.Dense(len(punctuation_enc))(x)

net = tf.keras.Model(bert_input, dense_out, name='network')

# Integer class-id labels, hence the sparse categorical loss.
net.compile(optimizer='adam',
            loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'])

### Evaluate the model

In [8]:
# Pull the first batch out of the dataset for a quick manual check.
batch_iterator = iter(dataset)
features, labels = next(batch_iterator)

In [9]:
# Forward pass on the sampled batch, before any checkpoint has been loaded.
predictions = net(features)

In [10]:
# Compare the arg-max class of each prediction row with the reference labels.
# str.format already formats its argument, so no inner format() call is needed.
predicted_classes = tf.argmax(predictions, axis=1)
print("Predictions: {}".format(predicted_classes))
print("     Labels: {}".format(labels))

Predictions: [0 0 0 0 0]
     Labels: [0 0 1 0 0]


In [11]:
# Baseline: score the model on the sampled batch before restoring a checkpoint.
untrained_loss_and_acc = net.evaluate(features, labels, verbose=2)
loss, acc = untrained_loss_and_acc
print("Untrained model, accuracy: {:5.2f}%".format(acc * 100))

5/5 - 18s - loss: 6.2531 - accuracy: 0.8000
Untrained model, accuracy: 80.00%


In [12]:
# !ls -l ModelsExp

In [13]:
# !ls ModelsExp/20200424_180927

In [14]:
# Checkpoint restored by net.load_weights() in the cells below.
# NOTE(review): hard-coded run directory; presumably checkpoint 3 of the
# 2020-04-24 run — confirm and consider moving it to the configuration cell.
checkpoint_path = "ModelsExp/20200424_180927/cp-003.ckpt"

In [15]:
# Rebuild the logits network from scratch so the checkpoint is loaded into a
# fresh model: pre-trained BERT masked-LM followed by one dense layer.
# `shape` is given as a 1-tuple (sequence length only; batch dimension is implicit).
bert_input = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bert_input')
# [0] keeps the first output of the masked-LM model; it is reshaped below as
# segment_size * vocab_size values per example.
x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
# One raw logit per punctuation class (no activation: the loss uses from_logits=True).
dense_out = tf.keras.layers.Dense(len(punctuation_enc))(x)

net = tf.keras.Model(bert_input, dense_out, name='network')

# Integer class-id labels, hence the sparse categorical loss.
net.compile(optimizer='adam',
            loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'])

In [16]:
# Restore the weights stored at `checkpoint_path` into the freshly built network.
# NOTE(review): the returned load status is only displayed, never checked;
# consider asserting on it to confirm the checkpoint variables were matched.
net.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f283bf5f6a0>

In [17]:
# Same batch as before, now scored by the checkpoint-restored network.
predictions = net(features)
# str.format already formats its argument, so no inner format() call is needed.
predicted_classes = tf.argmax(predictions, axis=1)
print("Predictions: {}".format(predicted_classes))
print("     Labels: {}".format(labels))

Predictions: [0 0 0 0 0]
     Labels: [0 0 1 0 0]


In [18]:
# Score the checkpoint-restored model on the same sampled batch.
trained_loss_and_acc = net.evaluate(features, labels, verbose=2)
loss, acc = trained_loss_and_acc
print("Trained model, accuracy: {:5.2f}%".format(acc * 100))

5/5 - 17s - loss: 373.7167 - accuracy: 0.8000
Trained model, accuracy: 80.00%


In [19]:
# Evaluate the restored model on every batch of `dataset` (the first n test
# segments), rather than on the single batch used above.
loss, acc = net.evaluate(dataset)



### Evaluate the model: per-class precision and recall

In [20]:
# Build and compile the same architecture, this time with a softmax output and
# one (recall, precision) metric pair per punctuation class.
# `shape` is given as a 1-tuple (sequence length only; batch dimension is implicit).
bert_input = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bert_input')
x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
num_classes = len(punctuation_enc)
dense_out = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

net = tf.keras.Model(bert_input, dense_out, name='network')

# Metrics in class-id order: recall_0, Precision_0, recall_1, Precision_1, ...
per_class_metrics = []
for class_id in range(num_classes):
    per_class_metrics.append(
        tf.keras.metrics.Recall(class_id=class_id,
                                name='recall_{}'.format(class_id)))
    per_class_metrics.append(
        tf.keras.metrics.Precision(class_id=class_id,
                                   name='Precision_{}'.format(class_id)))

# The output is already a probability distribution (from_logits=False); this
# loss and the class_id metrics expect one-hot encoded labels.
net.compile(optimizer='adam',
            loss=tf.losses.CategoricalCrossentropy(from_logits=False),
            metrics=per_class_metrics)

In [21]:
# Load the checkpoint weights into the softmax-output network built above.
net.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f2848e55ba8>

In [22]:
# Take the first batch of the dataset again for the per-class evaluation.
batch_iterator = iter(dataset)
features, labels = next(batch_iterator)

In [23]:
# Display the integer class ids of the sampled batch.
labels

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([0, 0, 1, 0, 0])>

In [24]:
# One-hot encode the integer labels, one column per punctuation class, as
# expected by CategoricalCrossentropy and the class_id-based metrics.
labels_hot = tf.one_hot(labels, len(punctuation_enc), dtype='int64')

In [25]:
# Display the one-hot encoded labels.
labels_hot

<tf.Tensor: shape=(5, 4), dtype=int64, numpy=
array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]])>

In [26]:
# Per-class outputs of the softmax layer for the sampled batch.
net.predict(features)

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [28]:
# Returns [loss, recall_0, Precision_0, ..., recall_3, Precision_3], i.e. the
# loss followed by the metrics in the order they were passed to compile().
net.evaluate(features, labels_hot)



[373.71673583984375, 1.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]