In [11]:
import numpy as np
from dataProcessing import load_file, encode_data, insert_target, preProcessingScriber
import tensorflow as tf
import torch
from transformers import AutoTokenizer, TFCamembertForMaskedLM
from bertviz import head_view

In [12]:
### load the pretrained CamemBERT tokenizer (same checkpoint name as the model below)
tokenizer = AutoTokenizer.from_pretrained(
    "jplu/tf-camembert-base",
    do_lower_case=True,
)

In [13]:
### punctuation label -> integer class id
punctuation_enc = dict(O=0, PERIOD=1)

In [14]:
%%javascript
// Point RequireJS at CDN-hosted copies of d3 (3.4.8) and jQuery (2.0.0).
// NOTE(review): presumably these are the libraries the bertviz head_view widget needs — confirm.
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
      jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
  }
});

<IPython.core.display.Javascript object>

In [15]:
### hyperparameters
batch_size = 1       # examples per batch of the tf.data pipeline
segment_size = 16    # number of token ids in one model input
vocab_size = 32005   # last dimension of the masked-LM output that is flattened before the dense layer
n = 1                # sample count for the optional (currently commented-out) data subsetting

In [17]:
### get the dataset

# name of dataset with sentences; it is spliced into the data directory name below
data_name = "Scriber"

# resolves to 'DataScriber/extractTrain_01.txt' (relative to the working directory)
trainSet_01 = 'Data' + data_name + '/' + 'extractTrain_01.txt'
# trainSet_01 = 'Data' + data_name + '/' + 'raw.processed.Train_01.txt'

# from sentences to list of words+punctuation
# (preProcessingScriber and load_file come from the local dataProcessing module)
data_train = load_file(preProcessingScriber(trainSet_01))

# encode data and insert target
# NOTE(review): presumably encode_data maps words to token ids and punctuation to the
# punctuation_enc labels, and insert_target builds segment_size-long inputs — confirm in dataProcessing
X_train_, y_train_ = encode_data(data_train, tokenizer, punctuation_enc)
X_train = insert_target(X_train_, segment_size)
y_train = np.asarray(y_train_)

# # get only a fraction of data
# X_train = X_train[0:n]
# y_train = y_train[0:n]

# build the datasets: unshuffled (input, label) pairs, batch_size examples per batch
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
# dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(buffer_size=500000).batch(batch_size)

In [18]:
### get model
# Input: one segment of `segment_size` token ids per example.
# `shape` must be a tuple; `(segment_size)` without the trailing comma is just an int.
bertInp = tf.keras.Input(shape=(segment_size,), dtype='int32', name='bertInp')

# output_attentions=True makes the model return its attention maps alongside the
# masked-LM output, which is what gets visualised later in this notebook.
bertOut = TFCamembertForMaskedLM.from_pretrained("jplu/tf-camembert-base", output_attentions=True)(bertInp)

# Flatten the first BERT output to (segment_size * vocab_size) and put a dense head on top.
x = tf.keras.layers.Reshape((segment_size*vocab_size,))(bertOut[0])
# NOTE(review): 4 output units, while punctuation_enc defines only 2 classes — confirm the intended size.
denseOut = tf.keras.layers.Dense(4)(x)

# Expose both the raw BERT outputs (including attentions) and the dense head.
model = tf.keras.Model(
    inputs = [bertInp],
    outputs = [bertOut, denseOut],
)

In [19]:
### get prediction
# Each dataset element is an (input_ids, label) pair. The model has a single input,
# so only the input ids are passed to predict; the label must not be fed as a second input.
features = next(iter(dataset))
modelOut = model.predict(features[0])

In [20]:
# First model output = the BERT branch (the model was built with outputs=[bertOut, denseOut]).
bertOutput = modelOut[0]

In [21]:
# Element 1 of the BERT output: a tuple of per-layer arrays, available because
# the model was loaded with output_attentions=True.
attLayers = bertOutput[1]

In [22]:
# Sanity check: what container holds the attention layers?
type(attLayers)

tuple

In [23]:
# Type and shape of the first layer's attention array, one per line.
print(type(attLayers[0]), attLayers[0].shape, sep='\n')

<class 'numpy.ndarray'>
(1, 12, 16, 16)


In [24]:
# Display the batch that was fed to the model: an (input ids, label) pair of tensors.
features

(<tf.Tensor: shape=(1, 16), dtype=int64, numpy=
 array([[  40,   48, 9695,   33,   39,    8,   52,    0,  427, 5061, 5819,
           36,  366, 1065,  773, 5061]])>,
 <tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>)

In [25]:
# Tokenizer sanity check: split a sample word into tokens.
tokenizer.tokenize('merci')

['▁merci']

In [26]:
# Tokenizer sanity check: tokens -> vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('merci'))

[895]

In [27]:
# Round trip: the id obtained for 'merci' maps back to the same token.
tokenizer.convert_ids_to_tokens([895])

['▁merci']

In [28]:
# Look up id 0, which also occurs inside the input segment shown above.
tokenizer.convert_ids_to_tokens([0])

['<s>NOTUSED']

In [29]:
# Input-id tensor only (first element of the batch pair).
features[0]

<tf.Tensor: shape=(1, 16), dtype=int64, numpy=
array([[  40,   48, 9695,   33,   39,    8,   52,    0,  427, 5061, 5819,
          36,  366, 1065,  773, 5061]])>

In [30]:
# Token ids of the first example in the batch, collected into a Python list.
test = [token_id for token_id in np.asarray(features[0])[0]]

In [31]:
# Display the id list built in the previous cell.
test

[40, 48, 9695, 33, 39, 8, 52, 0, 427, 5061, 5819, 36, 366, 1065, 773, 5061]

In [32]:
# Token strings for the input segment; passed to head_view at the end of the notebook.
testCon = tokenizer.convert_ids_to_tokens(test)

In [33]:
# Show the decoded tokens of the segment.
print(testCon)

['▁plus', '▁se', '▁reproduire', '▁a', '▁vous', '▁de', '▁s', '<s>NOTUSED', 'fr', '▁bonjour', '▁bienvenue', '▁au', '▁service', '▁client', '▁oui', '▁bonjour']


In [34]:
# Experiment: tuple() over an array iterates its first axis, so each result
# is a tuple whose length equals that axis (the batch size).
a = tuple(attLayers[0])
b = tuple(attLayers[1])

In [35]:
# Type and length of the tuple built from the first attention layer.
print(type(a), len(a), sep='\n')

<class 'tuple'>
1


In [36]:
# Element type inside the tuple.
print(type(a[0]))

<class 'numpy.ndarray'>


In [37]:
# Concatenating the two per-layer tuples yields one tuple with both entries.
c = a + b
print(type(c), len(c), sep='\n')

<class 'tuple'>
2


In [38]:
# Scratch check of tuple concatenation on toy values.
# Note: this rebinds a, b and c from the cells above.
a = ('a',)
b = ('b',)
c = (*a, *b)
c

('a', 'b')

In [39]:
# Scratch check of += on a tuple. Not idempotent: re-running this cell doubles c again.
c += c
c

('a', 'b', 'a', 'b')

In [40]:
# Confirm the per-layer attention is a numpy array before converting it to torch.
type(attLayers[0])

numpy.ndarray

In [41]:
# Try the numpy -> torch conversion on a single layer.
# Note: this rebinds `test`, which previously held the token-id list.
test = torch.from_numpy(attLayers[0])

In [42]:
# Type and shape of the converted tensor, one per line.
print(type(test), test.shape, sep='\n')

<class 'torch.Tensor'>
torch.Size([1, 12, 16, 16])


In [43]:
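# Abandoned first attempt, kept for reference: tuple(foo) iterates over the leading
# (batch) axis, so the collected items lose that axis; the list-based cell below keeps it.
# attention = ()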
# for i in range(len(attLayers)):
#     foo = torch.from_numpy(attLayers[i])
#     print(foo.shape)
#     tmp = tuple(foo)
#     print(tmp[0].shape)
#     attention += tmp

In [44]:
# Convert every layer's attention array to a torch tensor,
# echoing the shape once after conversion and once after it is stored.
attention = []
for layer_att in attLayers:
    layer_tensor = torch.from_numpy(layer_att)
    print(layer_tensor.shape)
    attention.append(layer_tensor)
    print(attention[-1].shape)

torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])
torch.Size([1, 12, 16, 16])


In [45]:
# Summary of the converted attention: container type, number of layers,
# element type and per-layer shape, one per line.
print(type(attention), len(attention), type(attention[0]), attention[0].shape, sep='\n')

<class 'list'>
12
<class 'torch.Tensor'>
torch.Size([1, 12, 16, 16])


In [46]:
# Render the bertviz head view from the per-layer attention tensors and the
# token strings of the input segment.
head_view(attention, testCon)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>