# Experimenting with Time Stamps

Build a model whose first layer embeds the input ids.

In [1]:
import os
import numpy as np
import tensorflow as tf
from dataProcessing import load_file, encode_data, insert_target, processingScriber00
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from datetime import datetime
import sys

In [2]:
### instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

### punctuation encoder: punctuation label name -> integer id
punctuation_enc = {
    'O': 0,
    'PERIOD': 1,
}

### Set Vocabulary Size
vocabSize = 32005

### hyper-parameters
sequenceSize = 32
batchSize = 32
learningRate = 1e-5
trainLayerIndex = 0
numEpo = 1

# Names and stringified values of the hyper-parameters, kept as two parallel lists
listHyper0 = ['sequenceSize', 'batchSize', 'learningRate', 'trainLayerIndex', 'numEpo']
listHyper1 = [str(value) for value in (sequenceSize, batchSize, learningRate, trainLayerIndex, numEpo)]

# Timestamped output directory for this run
time = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = 'ModelsExpScriber/{}/'.format(time)
# makedirs (unlike mkdir) also creates the parent 'ModelsExpScriber' folder when it
# does not exist yet, so the cell works on a fresh checkout
os.makedirs(save_path, exist_ok=True)

In [3]:
### Get Training Dataset

print('\nProcessing Training Data ... ')

# dataset name and the path of its training file
data_name = "Scriber"
trainDataName = ''.join(['Data', data_name, '/', 'raw.processed.Train_01.txt'])

# from sentences to list of words+punctuation
scriberText = processingScriber00(trainDataName)
data_train = load_file(scriberText)

# number of leading examples kept (only a fraction of the data is used)
n = 96

# encode data and insert target, then keep the first n examples
X_train_, y_train_ = encode_data(data_train, tokenizer, punctuation_enc)
X_train = insert_target(X_train_, sequenceSize)[:n]
y_train = np.asarray(y_train_)[:n]

# build the dataset one stage at a time: slices -> shuffle -> batches
trainDataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
trainDataset = trainDataset.shuffle(buffer_size=500000)
trainDataset = trainDataset.batch(batchSize)

print("\nTraining Dataset Tensor Shape = ", X_train.shape)


Processing Training Data ... 

Training Dataset Tensor Shape =  (96, 32)


In [4]:
### Get an Experimental Input Tensor

sentence = "elle se situe au cœur d'un vaste bassin sédimentaire aux sols fertiles et au climat tempéré elle se situe au cœur d'un vaste bassin"

# sentence -> tokens -> vocabulary ids
tokens = tokenizer.tokenize(sentence)
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# wrap the id list as a tensor and prepend an axis of size 1
tensor = tf.expand_dims(tf.convert_to_tensor(tokens_ids), axis=0)
print(tensor)

tf.Tensor(
[[  109    48  3685    36   766    18    11    59  2615  3633    52 12279
  19464    68  7498 24377    10    14    36  3287 22208   141   109    48
   3685    36   766    18    11    59  2615  3633]], shape=(1, 32), dtype=int32)


In [5]:
### Get word_embeddings Weights From a Fine-Tuned Model

# Build The Model
# NOTE(review): presumably this must match the architecture stored in the checkpoint
# loaded below — confirm against the training script.
# `shape` has to be a tuple: (sequenceSize) is a plain int, (sequenceSize,) is a 1-tuple.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# flatten the (sequenceSize, vocabSize) output into one vector per example
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# one output unit per entry of punctuation_enc
dense_out = tf.keras.layers.Dense(len(punctuation_enc))(x)
model = tf.keras.Model(bert_input, dense_out, name='model')

# Load Weights
checkpointPath = "../project_PunctuatorBERTTensorFlow2/Models/20200530_161559/cp-001.ckpt"  # this is the baseline model
model.load_weights(checkpointPath)

# Extract word_embeddings Weights
# NOTE(review): 194 is a positional index into trainable_variables and is assumed to
# be the word_embeddings variable — verify whenever the model definition changes.
weightsTensor = model.trainable_variables[194][0:]

In [8]:
### print the first five values of row 0 of the fine-tuned model's word_embeddings
print(model.trainable_variables[194][0, 0:5])

tf.Tensor([-0.05737098  0.0300844   0.00174185 -0.10074838 -0.06946988], shape=(5,), dtype=float32)


In [11]:
### Build a Linear Layer to Embed the tokens_ids

# Layer sizes come from vocabSize and from the extracted weights instead of the
# repeated literals 32005 / 768, so they cannot drift from the configuration.
embeddingDim = int(weightsTensor.shape[1])
linearProj = tf.keras.layers.Dense(embeddingDim, input_shape=(vocabSize,), use_bias=False)

# A dummy forward pass makes Keras create the layer's kernel. It has its own name so
# that the experimental input stored in `tensor` is not overwritten by this cell.
dummyInput = tf.zeros(shape=[1, vocabSize])
linearProj(dummyInput)  # execute the layer
linearProj.trainable_variables[0].assign(weightsTensor)  # load the weights

# first five kernel values of row 0, for comparison with the fine-tuned embeddings
print(linearProj.trainable_variables[0][0][0:5])

tf.Tensor([-0.05737098  0.0300844   0.00174185 -0.10074838 -0.06946988], shape=(5,), dtype=float32)


In [17]:
### Test the Linear Layer

# One-hot vector for token id 0, sized with vocabSize rather than a repeated literal.
# The layer has no bias, so its output for this input is row 0 of the kernel.
tensor = tf.one_hot(0, vocabSize)
tensor = tf.expand_dims(tensor, 0)  # add a leading axis of size 1
print(linearProj(tensor)[0][0:5])

tf.Tensor([-0.05737098  0.0300844   0.00174185 -0.10074838 -0.06946988], shape=(5,), dtype=float32)


In [None]:
# ### Test How To Build a Model Incorporating the Linear Layer
# (disabled draft: every line of this cell is commented out, so nothing here runs)

# model_input = tf.keras.Input(shape=(vocabSize, sequenceSize), dtype='int32')
# x = linearProj(model_input)
# NOTE(review): the next line calls the pretrained model on bert_input, not on x, so the
# linearProj output assigned just above would be discarded — confirm the intended wiring.
# x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# dense_out = tf.keras.layers.Dense(len(punctuation_enc))(x)

In [None]:
### Build The Model

print('\nBulding the Model ... ')

# `shape` has to be a tuple: (sequenceSize) is a plain int, (sequenceSize,) is a 1-tuple.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# flatten the (sequenceSize, vocabSize) output into one vector per example
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# one output unit per entry of punctuation_enc
dense_out = tf.keras.layers.Dense(len(punctuation_enc))(x)

model = tf.keras.Model(bert_input, dense_out, name='model')

# loss: sparse categorical cross-entropy, configured with from_logits=True
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


def loss(model, x, y):
    """Run ``model`` on ``x`` and return the loss of its output against ``y``."""
    predictions = model(x)
    return loss_object(y, predictions)

# loss value + gradients for one batch
def grad(model, inputs, targets, trainLayerIndex):
    """Return ``(loss_value, gradients)`` for one batch.

    Gradients are taken only with respect to
    ``model.trainable_variables[trainLayerIndex:]``.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    # variables are looked up after the forward pass, as in the original expression
    selected_variables = model.trainable_variables[trainLayerIndex:]
    gradients = tape.gradient(loss_value, selected_variables)
    return loss_value, gradients

# Adam optimizer using the configured learning rate
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learningRate,
)

In [None]:
### generate a random tensor
# NOTE(review): 32 presumably mirrors sequenceSize and 768 the embedding width — confirm
tensor = tf.random.uniform((1, 32, 768))

In [None]:
### Build The Model
# NOTE(review): this cell repeats the earlier "Build The Model" cell; running it
# rebuilds `model` from the pretrained weights.

print('\nBulding the Model ... ')

# `shape` has to be a tuple: (sequenceSize) is a plain int, (sequenceSize,) is a 1-tuple.
bert_input = tf.keras.Input(shape=(sequenceSize,), dtype='int32', name='bert_input')
x = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base")(bert_input)[0]
# flatten the (sequenceSize, vocabSize) output into one vector per example
x = tf.keras.layers.Reshape((sequenceSize*vocabSize,))(x)
# one output unit per entry of punctuation_enc
dense_out = tf.keras.layers.Dense(len(punctuation_enc))(x)

model = tf.keras.Model(bert_input, dense_out, name='model')

# loss: sparse categorical cross-entropy, configured with from_logits=True
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


def loss(model, x, y):
    """Evaluate ``model`` on ``x`` and score the result against the targets ``y``."""
    model_output = model(x)
    return loss_object(y, model_output)

# loss value + gradients for one batch
def grad(model, inputs, targets, trainLayerIndex):
    """Compute the batch loss and its gradients.

    Returns a ``(loss_value, gradients)`` pair; the gradients cover
    ``model.trainable_variables[trainLayerIndex:]`` only.
    """
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    # variables are looked up after the forward pass, as in the original expression
    variables_to_train = model.trainable_variables[trainLayerIndex:]
    return loss_value, tape.gradient(loss_value, variables_to_train)

# Adam optimizer using the configured learning rate
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learningRate,
)