In [1]:
import os
import numpy as np
import tensorflow as tf
from data import load_file, preProcessingIWSLT12
from transformers import BertTokenizer
from transformers import TFBertForMaskedLM
from datetime import datetime
import json
import sys

In [2]:
# Experiment configuration. Everything a reader might tune lives in this cell.

# Sample count for quick experiments; only referenced by the (currently
# commented-out) "fraction of dataset" slicing further down in this notebook.
n = 20

vocab_size = 30522  # presumably the 'bert-base-uncased' vocabulary size -- confirm
segment_size = 32   # length of each fixed-size token window fed to the model
batch_size = 2
train_layer_ind = 0  # 0 for all model, -2 for only top layer
learat = 1e-4        # learning rate (stored as 'learning_rate' below)
num_epochs = 10

# Snapshot of the settings persisted next to the run's artifacts.
# NOTE(review): num_epochs and train_layer_ind are not recorded here -- confirm
# whether that is intentional.
hyperparameters = {
    'vocab_size': vocab_size,
    'segment_size': segment_size,
    'learning_rate': learat,
    'batch_size': batch_size
}

# One timestamped directory per run.
save_path = 'ModelsExp/{}/'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))
# makedirs (not mkdir): the parent 'ModelsExp' folder may not exist on a fresh
# checkout, and exist_ok makes re-running the cell within the same second safe.
os.makedirs(save_path, exist_ok=True)
with open(save_path + 'hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

In [3]:
# Load the pretrained tokenizer for the uncased BERT checkpoint;
# input text is lower-cased before tokenization.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [4]:
# Integer class id for the symbol that follows a word.
# 'O' presumably stands for "no punctuation" (it is the label given to
# non-final sub-tokens in encode_data).
punctuation_enc = dict(O=0, COMMA=1, PERIOD=2, QUESTION=3)

In [5]:
def encode_data(data, tokenizer, punctuation_enc):
    """
    Convert "word,punctuation" lines into parallel token-id and label lists.

    Each line of `data` holds a word and, after the LAST comma, the name of
    what comes after the word (a key of `punctuation_enc`, e.g. 'O', 'COMMA').
    A word may be split into several tokens by the tokenizer; in that case only
    the final token carries the word's punctuation label and the preceding
    tokens get the "no punctuation" label.

    Args:
        data: iterable of strings of the form "<word>,<punctuation name>".
        tokenizer: object providing `tokenize(str)` and
            `convert_tokens_to_ids(list)`.
        punctuation_enc: dict mapping punctuation names to integer labels.

    Returns:
        (X, Y): two flat lists of equal length. X contains the token ids of
        the words, Y the label of what follows each token.

    Raises:
        ValueError: if a line contains no comma.
        KeyError: if a punctuation name is not in `punctuation_enc`.
    """
    # Label for tokens that are not the last piece of their word. Falls back
    # to 0 (the historical hard-coded value) when there is no 'O' entry.
    no_punc_label = punctuation_enc.get('O', 0)

    X = []
    Y = []
    for line in data:
        # Split on the last comma only, so a word that itself contains commas
        # (e.g. "1,000") does not break the two-way unpacking.
        word, punc = line.rsplit(',', 1)
        punc = punc.strip()
        tokens = tokenizer.tokenize(word)
        x = tokenizer.convert_tokens_to_ids(tokens)
        label = punctuation_enc[punc]
        # Words that produce no tokens are dropped from both outputs.
        if len(x) > 0:
            # one word can be encoded in more than one token
            X += x
            Y += (len(x) - 1) * [no_punc_label] + [label]
    return X, Y

In [6]:
def insert_target(x, segment_size):
    """
    Build one fixed-length context window per token of `x`.

    For token i the window contains the tokens surrounding it (wrapping around
    the ends of `x` cyclically), with a 0 inserted immediately AFTER token i,
    at column (segment_size - 1) // 2. The 0 is the placeholder for the
    punctuation slot following the token.

    Args:
        x: sequence of token ids.
        segment_size: width of each output row, including the inserted 0.
            Must be at least 3 so that the target token and its placeholder
            both fit in the window.

    Returns:
        np.ndarray of shape (len(x), segment_size). An empty `x` yields an
        array of shape (0, segment_size).

    Raises:
        ValueError: if `segment_size` is smaller than 3.
    """
    if segment_size < 3:
        raise ValueError(
            'segment_size must be >= 3, got {}'.format(segment_size))

    x = list(x)
    n_tokens = len(x)
    if n_tokens == 0:
        return np.empty((0, segment_size), dtype=int)

    zero_pos = (segment_size - 1) // 2   # column that receives the placeholder
    left = zero_pos - 1                  # context tokens before the target
    right = segment_size // 2            # context tokens wrapped from the start
    window = segment_size - 1            # real tokens per row

    # Cyclic padding via modular indexing. Unlike the slices x[-left:] and
    # x[:right], this is correct when left == 0 (x[-0:] would be the whole
    # list) and when x is shorter than the padding width.
    head = [x[k % n_tokens] for k in range(-left, 0)]
    tail = [x[k % n_tokens] for k in range(right)]
    x_pad = head + x + tail

    X = []
    for i in range(n_tokens):
        segment = x_pad[i:i + window]
        segment.insert(zero_pos, 0)
        X.append(segment)

    return np.array(X)

In [7]:
# print('\nPRE-PROCESS AND PROCESS DATA')


# name of dataset with sentences
# Dataset identifier; it is appended to 'Data' to form the folder name.
data_name = "IWSLT12"
infSet_01 = '/'.join(('Data' + data_name, 'toyTrain_01.txt'))

# Sentences -> list of word + punctuation entries, then load that listing.
# (the return value of preProcessingIWSLT12 is presumably the path of the
# generated file -- confirm in data.py)
outInf = preProcessingIWSLT12(infSet_01)
processedData = load_file(outInf)

# Flat token ids / labels first, then one fixed-length window per token.
X_, y_ = encode_data(processedData, tokenizer, punctuation_enc)
X = insert_target(X_, segment_size)
y = np.array(y_)


# # get only a fraction of dataset
# X = X[0:n]
# y = y[0:n]


# # instantiate tf.data.Dataset
# dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(batch_size)
# # dataset = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(buffer_size=10000).batch(batch_size)

In [8]:
# Peek at the first dozen token ids of the flat encoding.
X_[:12]

[2009, 2064, 2022, 1037, 2200, 8552, 2518, 1996, 4153, 1998, 2009, 2064]

In [13]:
# Show the container type and the contents of the first windowed segment.
print(type(X[0]), X[0], sep='\n')

<class 'numpy.ndarray'>
[1997 2054 1005 1055 2183 2006 2021 2477 2008 2057 2035 3492 2092 2113
 2009    0 2064 2022 1037 2200 8552 2518 1996 4153 1998 2009 2064 2022
 1037 2200 8552 2518]
