# Evaluate Model Performance on the Test Set 

In [1]:
import numpy as np
import tensorflow as tf

from dataProcessing import load_file, preProcessingIWSLT12, encode_data, insert_target

from transformers import BertTokenizer
from transformers import TFBertForMaskedLM

from model import create_model

from datetime import datetime
import os
import json

import sys

In [2]:
# Tokenizer built from the 'bert-base-uncased' pre-trained vocabulary,
# with lower-casing of the input text switched on.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [3]:
# Location of the saved weights that are restored into the network
# before evaluation.
checkpoint_path = 'Models/20200425_142515/cp-008.ckpt'

In [4]:
# Punctuation label -> integer class id. The position of each label in the
# tuple below is its class id ('O' -> 0, ..., 'QUESTION' -> 3).
punctuation_enc = {
    label: class_id
    for class_id, label in enumerate(('O', 'COMMA', 'PERIOD', 'QUESTION'))
}

### Hyper-parameters

In [5]:
# Number of examples kept for evaluation (the data is sliced to [0:n]).
n = 10000

# Used together with segment_size to size the flattened BERT output.
vocab_size = 30522
# Number of token positions per input segment.
segment_size = 32
# Number of examples per batch of the tf.data pipeline.
batch_size = 20

# 0 for all model, -2 for only top layer.
# Not referenced by the evaluation cells visible in this notebook.
train_layer_ind = -2
# Not referenced by the evaluation cells visible in this notebook.
num_epochs = 2

# Settings gathered in a single mapping.
hyperparameters = dict(
    vocab_size=vocab_size,
    segment_size=segment_size,
    batch_size=batch_size,
)

### Get the dataset

In [6]:
# Name of the dataset that holds the sentences.
data_name = "IWSLT12"
# NOTE(review): the notebook title says "Test Set", yet the file loaded here
# is 'extractTrain_01.txt' — confirm this is the split that should be scored.
testSet_01 = 'Data' + data_name + '/extractTrain_01.txt'

# Pre-process the raw file, then load the result.
data = load_file(preProcessingIWSLT12(testSet_01))

# Encode the data with the tokenizer and the punctuation label map
# (presumably token ids in X_ and integer class ids in y_ — TODO confirm).
X_, y_ = encode_data(data, tokenizer, punctuation_enc)

# Keep only the first n examples. Truncating *before* the one-hot step avoids
# encoding labels that would be discarded immediately afterwards.
X = insert_target(X_, segment_size)[:n]
y = np.asarray(y_)[:n]

# One-hot encode the labels. The depth is taken from the label map so it
# cannot drift from the number of punctuation classes.
y = tf.one_hot(y, len(punctuation_enc), dtype='int64').numpy()

# Batched (inputs, labels) pipeline consumed by the evaluation cell.
dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.batch(batch_size)

In [7]:
# Sanity check: show the dimensions of the input array after truncation.
np.shape(X)

(10000, 32)

### Build the model

In [8]:
# Build and compile the network: pre-trained BERT masked-LM, its first output
# flattened to one vector per example, followed by a softmax layer with one
# unit per punctuation class.

# Number of output classes, taken from the label map instead of a literal 4.
num_classes = len(punctuation_enc)

# `shape` must be a tuple: `(segment_size)` without the trailing comma is a
# plain int and only works where Keras happens to coerce it.
bert_input = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bert_input')
x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
x = tf.keras.layers.Reshape((segment_size * vocab_size,))(x)
dense_out = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

net = tf.keras.Model(bert_input, dense_out, name='network')

# Per-class recall and precision, in the same order and with the same names
# as before: Rec_0, Prec_0, Rec_1, Prec_1, ...
class_metrics = []
for class_id in range(num_classes):
    class_metrics.append(
        tf.keras.metrics.Recall(class_id=class_id, name='Rec_{}'.format(class_id)))
    class_metrics.append(
        tf.keras.metrics.Precision(class_id=class_id, name='Prec_{}'.format(class_id)))

# from_logits=False because the final layer already applies softmax.
net.compile(optimizer='adam',
            loss=tf.losses.CategoricalCrossentropy(from_logits=False),
            metrics=class_metrics)

In [9]:
# Restore the saved weights from checkpoint_path into the network. The status
# object returned by the call is shown as the cell output.
load_status = net.load_weights(checkpoint_path)
load_status

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe36c1e2130>

### Evaluate the model

In [10]:
# Score the network on the batched dataset and display the returned values
# (presumably the loss followed by the compiled metrics in their compile
# order — TODO confirm against net.metrics_names).
evaluation_scores = net.evaluate(dataset)
evaluation_scores



[0.06665027141571045,
 0.9940733909606934,
 0.9973699450492859,
 0.7927631735801697,
 0.9659318923950195,
 0.9865771532058716,
 0.8258426785469055,
 1.0,
 0.8148148059844971]