In [1]:
import os
import numpy as np
import tensorflow as tf
from data import load_file
from transformers import BertTokenizer
from transformers import TFBertForMaskedLM
from datetime import datetime
import json
import sys

In [2]:
# Sample size for quick experiments; in the visible cells it is only
# referenced by a commented-out slice of the dataset.
n = 20

# Size of the last axis of the BERT output that the model cell flattens
# (segment_size * vocab_size).
vocab_size = 30522
# Number of token ids per model input row.
segment_size = 32
batch_size = 2
train_layer_ind = 0  # 0 for all model, -2 for only top layer
learat = 1e-4  # learning rate
num_epochs = 10

# Subset of the settings above that is persisted next to the run artifacts.
hyperparameters = {
    'vocab_size': vocab_size,
    'segment_size': segment_size,
    'learning_rate': learat,
    'batch_size': batch_size
}

# One timestamped output directory per run.
save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))
# makedirs (unlike mkdir) also creates the parent 'ModelsExp' directory, so
# the cell works on a fresh checkout; exist_ok tolerates a re-run that lands
# on the same timestamp.
os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path, 'hyperparameters.json'), 'w') as f:
    json.dump(hyperparameters, f)

In [3]:
# Load the tokenizer that belongs to the pretrained 'bert-base-uncased'
# checkpoint (the same checkpoint name the model cell uses); lower-casing is
# requested explicitly via do_lower_case=True.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [4]:
def encodeDataInfer(data, tokenizer):
    """
    Convert lines of space-separated words into one flat list of token ids.

    Each line is split on single blanks, the trailing newline is stripped from
    its last word, and every word is run through ``tokenizer.tokenize`` and
    ``tokenizer.convert_tokens_to_ids``. A word may yield several ids; all ids
    are appended in reading order.

    Args:
        data: iterable of strings, one sentence per item (no punctuation).
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        list with the token ids of all words of all lines, concatenated.
    """
    token_ids = []
    for sentence in data:
        pieces = sentence.split(" ")
        # only the last piece can carry the end-of-line character
        pieces[-1] = pieces[-1].strip()
        for piece in pieces:
            sub_tokens = tokenizer.tokenize(piece)
            token_ids.extend(tokenizer.convert_tokens_to_ids(sub_tokens))
    return token_ids

In [5]:
def insert_target(x, segment_size):
    """
    Build one fixed-length segment per element of x.

    x is padded circularly: its last ``(segment_size-1)//2 - 1`` elements are
    put in front of it and its first ``segment_size//2`` elements behind it.
    A window of ``segment_size - 1`` consecutive ids is then slid over the
    padded list, and a placeholder 0 is inserted (nothing is overwritten) at
    index ``(segment_size-1)//2`` of every window, so each row ends up with
    ``segment_size`` entries. In row i the id right before the 0 is ``x[i]``.

    Args:
        x: list of token ids.
        segment_size: length of every output row.

    Returns:
        np.ndarray of the segments; its shape is (len(x), segment_size) when
        x is at least as long as the wrap-around padding taken from it.
    """
    target_pos = (segment_size - 1) // 2
    n_left = target_pos - 1
    # x[-0:] is the WHOLE list, not an empty one, so the "no left padding"
    # case (segment_size 3 or 4) needs an explicit empty slice.
    left_pad = x[-n_left:] if n_left else x[:0]
    x_pad = left_pad + x + x[:segment_size // 2]

    X = []
    for i in range(len(x_pad) - segment_size + 2):
        segment = x_pad[i:i + segment_size - 1]
        segment.insert(target_pos, 0)
        X.append(segment)

    return np.array(X)

In [6]:
# Pre-process the inference text and wrap it in a tf.data.Dataset.


# Name of the dataset with sentences; the path below evaluates to
# 'DataIWSLT12/toyInfer_01.txt'.
data_name = "IWSLT12"
infSet_01 = 'Data' + data_name + '/' + 'toyInfer_01.txt'
data = load_file(infSet_01)


# X_ is the flat list of token ids; X holds one segment_size-long row per
# entry produced by insert_target.
X_ = encodeDataInfer(data, tokenizer)
X = insert_target(X_, segment_size)


# Optional: keep only the first n rows for a quick run.
# X = X[0:n]


# Instantiate the tf.data.Dataset; each element is a 1-tuple holding a batch
# of up to batch_size rows of X.
dataset = tf.data.Dataset.from_tensor_slices((X,)).batch(batch_size)

In [7]:
# Sanity check of the segmented input: overall shape, row type, and the first row.
print(X.shape)
print(type(X[0]))
print(X[0])

(174, 32)
<class 'numpy.ndarray'>
[ 3023  2057 17835  8216  8310  1997 10796  2046  1996  2250  1998  2046
  1996  2300  2009     0  2064  2022  1037  2200  8552  2518  1996  4153
  1998  2009  2064  2022  1037  2200  8552  2518]


## Build the model

In [8]:
# Inference network: token ids -> pretrained BERT masked-LM -> flatten -> Dense(4).
# `shape` must be a tuple; `(segment_size)` without the trailing comma is just
# an int, which only some Keras versions accept.
bert_input = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bert_input')
# [0] keeps the first element of the BERT output; the Reshape below requires it
# to hold segment_size * vocab_size values per example.
x = TFBertForMaskedLM.from_pretrained('bert-base-uncased')(bert_input)[0]
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(x)
# Four output scores per example; the punDec mapping in this notebook names
# four classes (indices 0-3).
dense_out = tf.keras.layers.Dense(4)(x)


model = tf.keras.Model(bert_input, dense_out, name='BertModel')
# print(model.summary())

In [9]:
# Checkpoint whose weights are loaded into the model for inference.
# NOTE(review): this lives under 'Models/', not under the 'ModelsExp/' save_path
# created in the configuration cell. Confirm this is the intended run.
checkpoint_path = "Models/20200425_142515/cp-008.ckpt"

In [10]:
# Load the weights stored at checkpoint_path into the model built above.
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fdee062feb0>

## Calculate predictions

In [11]:
# Pull the first batch out of the dataset for inspection.
# NOTE(review): `feats` is not referenced again in the visible cells. Confirm
# before removing.
feats = next(iter(dataset))

In [12]:
# Run the model over the whole dataset and keep, for every row, the index of
# the largest output score (argmax over axis 1).
preds = np.argmax(model.predict(dataset), axis=1)

In [13]:
# print(preds.shape)
# print(preds)
# print(type(preds[0]))

In [14]:
# print(len(X_))
# print(X_)

## Return the text with restored (inferred) punctuation

In [15]:
# Decoder from predicted class index (as a string key) to punctuation label.
punDec = {
    str(class_idx): label
    for class_idx, label in enumerate(("SPACE", "COMMA", "PERIOD", "QUESTION"))
}

In [16]:
def restorePunctuation(X, preds, punDec, tokenizer, fileName):
    """
    Write one "<token> | <punctuation label> " line per prediction to fileName.

    Args:
        X: sequence of token ids, aligned index-by-index with preds.
        preds: sequence of predicted class indices.
        punDec: dict mapping ``str(class index)`` to the punctuation label.
        tokenizer: object exposing ``convert_ids_to_tokens`` (id -> token string).
        fileName: path of the text file to (over)write.

    Raises:
        IndexError: if X has fewer entries than preds.
        KeyError: if a predicted class is missing from punDec.
    """
    # `with` closes the file even if a lookup below raises.
    with open(fileName, 'w') as file:
        for i, pred in enumerate(preds):
            # read from the X argument, not from a notebook-level global
            word = tokenizer.convert_ids_to_tokens(X[i])
            pun = punDec[str(pred)]
            file.write(word + " | " + pun + " \n")

In [17]:
# Write every token of the inference text with its predicted punctuation label to 'textRestored.txt'.
restorePunctuation(X_, preds, punDec, tokenizer, 'textRestored.txt')