In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import matplotlib.pyplot as plt

## Importar dataset

In [2]:
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text

In [3]:
dataset = tfds.load('imdb_reviews', as_supervised=True)

In [4]:
raw_train_ds, raw_test_ds = dataset['train'], dataset['test']

In [5]:
for text, label in raw_train_ds.take(1):
    print(text.numpy(), label.numpy())

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 0


## Pipeline

In [6]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
BUFFER_SIZE = tf.data.experimental.cardinality(raw_train_ds)
BUFFER_SIZE.numpy()

25000

In [7]:
import string

In [8]:
tf.strings.regex_replace('My favorite dog?', f"([{string.punctuation}])", r"")

<tf.Tensor: shape=(), dtype=string, numpy=b'My favorite dog'>

In [9]:
batch_size = 16
voc_size = 5000
max_length = 400

In [10]:
def clean_text(raw_text, label):
    lowercase = tf.strings.lower(raw_text)
    lowercase = tf.strings.substr(lowercase, 0, max_length)
    clean = tf.strings.regex_replace(lowercase, '<br />', ' ')
    clean = tf.strings.regex_replace(clean, 
                                     f"([{string.punctuation}])", r"")
    return clean, label

In [11]:
train_ds = raw_train_ds.map(clean_text)

for text, label in train_ds.take(1):
    print(text)
    print(tf.strings.substr(
    text, 0, max_length)
)

tf.Tensor(b'this was an absolutely terrible movie dont be lured in by christopher walken or michael ironside both are great actors but this must simply be their worst role in history even their great acting could not redeem this movies ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the columbian rebels were making their cases for revol', shape=(), dtype=string)
tf.Tensor(b'this was an absolutely terrible movie dont be lured in by christopher walken or michael ironside both are great actors but this must simply be their worst role in history even their great acting could not redeem this movies ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the columbian rebels were making their cases for revol', shape=(), dtype=string)


In [12]:
train_ds = raw_train_ds.map(clean_text).shuffle(BUFFER_SIZE).batch(
        batch_size, num_parallel_calls=AUTOTUNE).prefetch(
        AUTOTUNE)

test_ds = raw_test_ds.map(clean_text).batch(
        batch_size, num_parallel_calls=AUTOTUNE).prefetch(
        AUTOTUNE)

In [13]:
for text, label in train_ds.take(1):
    print(text)

tf.Tensor(
[b'for those who are too young to know this or for those who have forgotten the disney company went almost down the tubes by the end of the 1980s people were not seeing their movies anymore and the company was not producing the usual wholesome materialat least no what people expected a major problem profanity  yes the idiots running the disney movies during that decade would produ'
 b'the first time i saw this film i loved it it was different  i am a christian bible believing i dont go along with the crowd of right wing believers i dropped out of that atmosphere  to me in their attempts to take over our government they are doing what judas tried to do i call it the judas syndrome  judas didnt get it even though jesus said his kingdom w'
 b'takashi miikes incursion into kiddie territory won me over almost immediately because he demonstrates nerve and bravery in dealing with fantasy elements this is a fairy tale that dares to be dark even as a kid i thought that there was some

## Definir modelo

<img src="../img/bert.png" width="700"/>

__Imagen tomada de Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805.__

In [14]:
train_text, _ = next(iter(train_ds))

In [15]:
train_text

<tf.Tensor: shape=(16,), dtype=string, numpy=
array([b'one of my favorite twilight zone episodes and the next day we were in the supermarket at hollywood blvd and la brea my father and i and guess who was coming toward us in the aisle barney phillips but no hat on  at least i dont think he had a hat on  we asked him about his third eye and he said something like he left it at home and everybody he met that day had asked him abou',
       b'this is the sort of thing that only now thrills the film eggheads after all feiersteins flex crush will have you know that real men dont watch anything by truffaut   it might have been interesting if truffaut had anything to say here the cameraasvoyeur motif was nothing new have we all forgotten de sicas bicycle thief or anything by hitchcock  so all we get is t',
       b'probably the only thing that got the movie up to a four for me is the fact that i love peter falk one of the worlds great portrayers of bumbling incompetence    and yet he is one o

In [16]:
bert_model_path = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
bert_preprocess_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [17]:
bert_preprocess_model = hub.KerasLayer(bert_preprocess_path)

In [18]:
preprocess_output = bert_preprocess_model(train_text)
preprocess_output

{'input_mask': <tf.Tensor: shape=(16, 128), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_type_ids': <tf.Tensor: shape=(16, 128), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>,
 'input_word_ids': <tf.Tensor: shape=(16, 128), dtype=int32, numpy=
 array([[ 101, 2028, 1997, ...,    0,    0,    0],
        [ 101, 2023, 2003, ...,    0,    0,    0],
        [ 101, 2763, 1996, ...,    0,    0,    0],
        ...,
        [ 101, 2009, 2001, ...,    0,    0,    0],
        [ 101, 2077, 1045, ...,    0,    0,    0],
        [ 101, 1045, 2031, ...,    0,    0,    0]], dtype=int32)>}

In [19]:
bert_model = hub.KerasLayer(bert_model_path)

In [20]:
bert_model(preprocess_output)['pooled_output']

<tf.Tensor: shape=(16, 512), dtype=float32, numpy=
array([[ 0.00314083,  0.9803104 ,  0.0247332 , ..., -0.09464012,
        -0.27182636, -0.8748782 ],
       [-0.33249202,  0.97886175, -0.13606362, ...,  0.4085269 ,
        -0.04963234,  0.0699869 ],
       [ 0.53238064,  0.84037936, -0.03310442, ...,  0.23471183,
        -0.02497339, -0.14739855],
       ...,
       [ 0.03954168,  0.9674722 , -0.26850796, ...,  0.17178895,
        -0.25081205, -0.41408285],
       [-0.70664436,  0.9434369 , -0.3560397 , ...,  0.07197679,
        -0.23364528, -0.6748686 ],
       [ 0.37896913,  0.97541744,  0.10970975, ..., -0.17064568,
        -0.4797275 , -0.66744995]], dtype=float32)>

In [21]:
text_input = tf.keras.layers.Input(shape=(), 
                                   dtype=tf.string, name='text')
preprocess_text = bert_preprocess_model(text_input)
bert_output = bert_model(preprocess_text)['pooled_output']
x = tf.keras.layers.Dense(128, activation='relu')(bert_output)
output = tf.keras.layers.Dense(1)(x)
small_bert = tf.keras.Model(text_input, output)


- Probar bert con batch de prueba

In [22]:
small_bert(train_text)

<tf.Tensor: shape=(16, 1), dtype=float32, numpy=
array([[1.2523457],
       [1.8890367],
       [1.610847 ],
       [2.0197604],
       [1.5687854],
       [2.0456805],
       [1.8366705],
       [1.3767264],
       [1.7974275],
       [0.9964155],
       [1.5116007],
       [1.6900955],
       [1.9164311],
       [1.8237029],
       [1.9688467],
       [1.3759558]], dtype=float32)>

- Información del modelo

In [23]:
small_bert.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

## Entrenamiento 

In [24]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [25]:
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [26]:
train_loss_avg = tf.keras.metrics.Mean(name='train_loss')
val_loss_avg = tf.keras.metrics.Mean(name='val_loss')

In [27]:
len(test_ds), tf.data.experimental.cardinality(test_ds)

(1563, <tf.Tensor: shape=(), dtype=int64, numpy=1563>)

In [28]:
@tf.function
def train_step(text, target):
    with tf.GradientTape() as tape:
        logits = small_bert(text, training=True)
        loss_value = loss(tf.cast(target, tf.float32), logits)

    gradients = tape.gradient(loss_value, small_bert.trainable_weights)
    opt.apply_gradients(zip(gradients, small_bert.trainable_weights))
    train_loss_avg(loss_value)
    
@tf.function
def test_step(text, target):
    with tf.GradientTape() as tape:
        logits = small_bert(text, training=False)
        loss_value = loss(tf.cast(target, tf.float32), logits)

    val_loss_avg(loss_value)

In [29]:
epochs = 5

In [30]:
for epoch in range(epochs):
    for text, target in train_ds:
        train_step(text, target)
        
    print(f'Epoch: {epoch} Train loss: {train_loss_avg.result().numpy()}')
    train_loss_avg.reset_states()
    
    for text, target in test_ds:
        test_step(text, target)
        
    print(f'Val loss: {val_loss_avg.result().numpy()}')
    val_loss_avg.reset_states()

Epoch: 0 Train loss: 0.582028329372406
Val loss: 0.551526665687561
Epoch: 1 Train loss: 0.5433706641197205
Val loss: 0.5400358438491821
Epoch: 2 Train loss: 0.5347903370857239
Val loss: 0.5484887957572937
Epoch: 3 Train loss: 0.5298752188682556
Val loss: 0.5331859588623047
Epoch: 4 Train loss: 0.5242843627929688
Val loss: 0.5345173478126526


## Ejercicio
- Modificar la arquitectura para obtener mejores resultados.
- Probar diferentes versiones de BERT: https://tfhub.dev/google/collections/bert/1.