In [1]:
import numpy as np
from dataProcessing import load_file, encode_data, insert_target, preProcessingScriber
import tensorflow as tf
import torch
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from bertviz import head_view

In [2]:
### instantiate the tokenizer
# Loads the tokenizer published under "jplu/tf-camembert-base", the same name the
# model cell passes to TFCamembertForMaskedLM.from_pretrained.
# NOTE(review): do_lower_case is forwarded as a keyword; whether this tokenizer class
# honours it is not visible here — the example sentences below are lower-cased by hand.
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-camembert-base", do_lower_case=True)

In [3]:
### path to weights
# Checkpoint prefix handed to model.load_weights() further down. The path is relative,
# so it is resolved against the kernel's current working directory.
checkpointPath = "20200510_101117/cp-005.ckpt"

In [4]:
### punctuation encoder
# Maps each punctuation label to its integer class id; passed to encode_data()
# when the dataset is built.
punctuation_enc = dict(O=0, PERIOD=1)

In [5]:
%%javascript
// Tell the notebook's RequireJS loader where to fetch the d3 and jquery modules
// (pinned CDN builds). Presumably these are the modules the bertviz head_view
// widget requests when it renders — confirm against the installed bertviz version.
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
      jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
  }
});

<IPython.core.display.Javascript object>

In [6]:
### hyperparameters
n = 1                # sample count for the optional "fraction of data" slice in the dataset cell
vocab_size = 32_005  # multiplied with segment_size to size the Reshape layer in the model cell
segment_size = 32    # fixed number of token ids per model input
batch_size = 1       # batch size applied to the tf.data dataset

In [7]:
### get the dataset

# Corpus name; the files are read from the folder "Data<data_name>".
data_name = "Scriber"

trainSet_01 = f"Data{data_name}/extractTrain_01.txt"
# trainSet_01 = f"Data{data_name}/raw.processed.Train_01.txt"

# Pre-process the raw file, then load it as a list of words + punctuation.
preprocessed_train = preProcessingScriber(trainSet_01)
data_train = load_file(preprocessed_train)

# Encode the words / punctuation labels, then insert the target into each segment.
X_train_, y_train_ = encode_data(data_train, tokenizer, punctuation_enc)
X_train = insert_target(X_train_, segment_size)
y_train = np.asarray(y_train_)

# Optionally keep only the first n samples.
# X_train = X_train[0:n]
# y_train = y_train[0:n]

# Build the batched dataset of (inputs, targets) pairs, in file order.
train_pairs = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = train_pairs.batch(batch_size)
# dataset = train_pairs.shuffle(buffer_size=500000).batch(batch_size)

In [8]:
### get model
# Input: one row of `segment_size` int32 token ids per sample.
# `shape` must be a tuple; `(segment_size)` without the trailing comma is just an int,
# so the one-element tuple is spelled out explicitly here.
bertInp = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bertInp')

# output_attentions=True asks the backbone to return its attention weights as well;
# later cells read them from index 1 of this output.
bertOut = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base", output_attentions=True)(bertInp)

# Flatten the first backbone output into a single vector per sample.
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(bertOut[0])

# NOTE(review): 4 output units although punctuation_enc lists two classes. Left as-is
# because the layer shape has to match the checkpoint restored in the next cell.
denseOut = tf.keras.layers.Dense(4)(x)

# Expose both the backbone output (for the attention weights) and the dense head.
model = tf.keras.Model(
    inputs=[bertInp],
    outputs=[bertOut, denseOut],
)

In [9]:
# load the weights
# Restores the weights stored under checkpointPath into the model built above.
# The call is the cell's last expression, so the returned load-status object is
# echoed as the cell output.
model.load_weights(checkpointPath)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6016304f40>

In [10]:
### get prediction
# Each dataset element is an (inputs, targets) pair, because the dataset was built
# from the tuple (X_train, y_train). Take the first batch ...
features = next(iter(dataset))
# ... and feed only the inputs to the model. Passing the whole pair would hand the
# target array to predict() as if it were a second input of this single-input model.
modelOut = model.predict(features[0])

In [11]:
# The model was built with outputs=[bertOut, denseOut]; index 0 selects the
# CamemBERT part of the prediction.
bertOutput = modelOut[0]

In [12]:
### get the attention weights
# Index 1 of the backbone output (index 0 is what the Reshape layer consumes).
# Presumably one attention array per transformer layer — TODO confirm.
attLayers = bertOutput[1]

In [13]:
### get one sequence from the dataset
# features[0] is the batch of input ids; its first row is the sequence that was predicted on.
first_batch_ids = np.asarray(features[0])
test = list(first_batch_ids[0])

# Map the ids back to token strings so they can label the attention view.
testCon = tokenizer.convert_ids_to_tokens(test)
print(testCon)

['▁les', '▁bloquer', '▁en', '▁fait', '▁comme', '▁ca', '▁ca', '▁va', '▁plus', '▁se', '▁reproduire', '▁a', '▁vous', '▁de', '▁s', '<s>NOTUSED', 'fr', '▁bonjour', '▁bienvenue', '▁au', '▁service', '▁client', '▁oui', '▁bonjour', '▁monsieur', '▁je', '▁vous', '▁appelle', '▁parce', '▁que', '▁j', "'"]


In [14]:
# convert weights from np array to torch tensors
# A comprehension keeps the conversion free of scratch globals: the previous loop left
# `foo` and `i` behind, and `foo` is also the name a later cell uses for the id batch.
attention = [torch.from_numpy(layer_weights) for layer_weights in attLayers]

In [15]:
# head_view(attention, testCon)

### Use a new sentence

In [16]:
# Candidate sentences, lower-cased / stripped of punctuation by hand. The comment above
# each entry is the original text it was derived from.
example_sentences = (
    # Faute de promulgation, la loi d'état d'urgence sanitaire partiellement applicable
    "Faute de promulgation la loi d'état d'urgence sanitaire partiellement applicable",
    # Because of his literary power and historical importance, Western literary critics consider him one of the great writers of world literature and among the creators of modern European writing
    "because of his literary power and historical importance western literary critics consider him one of the great writers of world",
    # Ecclésiastique et anticlérical, chrétien et considéré par certains comme libre penseur, médecin et ayant l'image d'un bon vivant, les multiples facettes de sa personnalité semblent parfois contradictoires.
    "ecclésiastique et anticlérical chrétien et considéré par certains comme libre penseur médecin et ayant l'image d'un bon vivant les multiples facettes de sa personnalité",
)

# Pick the sentence to visualise explicitly, instead of relying on the last of
# several successive assignments to `string` winning.
string = example_sentences[-1]

# Split into sub-word tokens and look up their vocabulary ids.
tokens = tokenizer.tokenize(string)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)

['▁ecclésiastique', '▁et', '▁anti', 'clé', 'r', 'ical', '▁chrétien', '▁et', '▁considéré', '▁par', '▁certains', '▁comme', '▁libre', '▁pense', 'ur', '▁médecin', '▁et', '▁ayant', '▁l', "'", 'image', '▁d', "'", 'un', '▁bon', '▁vivant', '▁les', '▁multiples', '▁facettes', '▁de', '▁sa', '▁personnalité']


In [17]:
# Number of token ids in the sentence; the model input is declared with a fixed
# length of segment_size, so this is the value to compare against.
len(ids)

32

In [18]:
# Show the vocabulary ids produced by tokenizer.convert_tokens_to_ids for the sentence.
print(ids)

[24920, 14, 896, 9440, 81, 6533, 8970, 14, 3523, 37, 420, 79, 1038, 500, 297, 1974, 14, 634, 17, 11, 1106, 18, 11, 59, 212, 2340, 19, 2523, 12098, 8, 77, 2810]


In [19]:
# Wrap the single id sequence in a list: the padding cell expects a batch of sequences.
foo = [ids]

In [20]:
# pad the sequence
# Pad at the end up to the model's fixed input length. Using segment_size rather than
# a literal 32 keeps this in step with the Input layer if the hyperparameter changes.
# NOTE: per the Keras docs, sequences longer than maxlen are truncated from the front
# by default, so an over-long sentence would silently lose its first tokens.
test = tf.keras.preprocessing.sequence.pad_sequences(foo, maxlen=segment_size, padding='post')
print(test)

[[24920    14   896  9440    81  6533  8970    14  3523    37   420    79
   1038   500   297  1974    14   634    17    11  1106    18    11    59
    212  2340    19  2523 12098     8    77  2810]]


In [21]:
# Run the model on the padded id batch; index 0 of the result selects the CamemBERT
# output (the model was built with outputs=[bertOut, denseOut]).
modelOut = model.predict(test)
bertOutput = modelOut[0]

In [22]:
# get the attention weights
# Index 1 of the backbone output; presumably one array per transformer layer — TODO confirm.
attLayers = bertOutput[1]

In [23]:
# convert weights from np array to torch tensors
# A comprehension avoids rebinding `foo`: the previous loop overwrote the id batch that
# the padding cell reads with a tensor, so re-running that cell afterwards would break.
attention = [torch.from_numpy(layer_weights) for layer_weights in attLayers]

In [24]:
# Map the padded ids of the (only) row back to token strings; these label the
# attention view in the next cell.
sequence = tokenizer.convert_ids_to_tokens(test[0])
print(sequence)

['▁ecclésiastique', '▁et', '▁anti', 'clé', 'r', 'ical', '▁chrétien', '▁et', '▁considéré', '▁par', '▁certains', '▁comme', '▁libre', '▁pense', 'ur', '▁médecin', '▁et', '▁ayant', '▁l', "'", 'image', '▁d', "'", 'un', '▁bon', '▁vivant', '▁les', '▁multiples', '▁facettes', '▁de', '▁sa', '▁personnalité']


In [25]:
# Render the bertviz head view for the attention tensors, labelled with the tokens
# of the sentence.
head_view(attention, sequence)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>