In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../../")

In [3]:
import tensorflow as tf

from src.encoders.context_encoder_bert import ContextEncoder

from tensorflow.python import keras
import toml
import json
import pandas as pd

from src.dataset.distilbert import DistilBERTDataset

In [4]:
# Load the experiment definitions from the local `experiments.toml` settings
# file (not `config.toml`) and print the parsed dictionary for inspection.
config = toml.load('../settings/experiments.toml')
print(config)

{'ALBERT_FINETUNE_SIMPLE': [{'bert_type': 'albert', 'dataset_type': 'clinical', 'finetune_bert': True, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 200}, {'bert_type': 'albert', 'finetune_bert': True, 'dataset_type': 'clinical', 'pct_data': 1, 'augment_pct': 0.5, 'epochs': 200}, {'bert_type': 'albert', 'finetune_bert': True, 'dataset_type': 'clinical', 'pct_data': 1, 'augment_pct': 1, 'epochs': 200}, {'bert_type': 'albert', 'dataset_type': 'fiction', 'finetune_bert': True, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 200}, {'bert_type': 'albert', 'finetune_bert': True, 'dataset_type': 'fiction', 'pct_data': 1, 'augment_pct': 0.5, 'epochs': 200}, {'bert_type': 'albert', 'finetune_bert': True, 'dataset_type': 'fiction', 'pct_data': 1, 'augment_pct': 1, 'epochs': 200}, {'bert_type': 'albert', 'dataset_type': 'wiki', 'finetune_bert': True, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 200}, {'bert_type': 'albert', 'finetune_bert': True, 'dataset_type': 'wiki', 'pct_data': 1, 'augment_pct': 

In [5]:
# from transformers import DistilBertTokenizer, TFDistilBertModel

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
# model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
# input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
# outputs = model(input_ids)
# last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [6]:
# outputs[0]

In [7]:
# def pool_output(input_tensor):
#     bert_full_output = tf.transpose(input_tensor, [0, 2, 1])
#     bert_pooled_output = tf.reduce_mean(bert_full_output, 2)
#     return bert_pooled_output

In [8]:
# pool_output(outputs[0])

In [9]:
# Build the context encoder used for the DistilBERT fine-tuning run.
# bert_trainable=True is consistent with the summary output, which reports
# zero non-trainable parameters.
# NOTE(review): bert_type presumably names the pretrained checkpoint to load —
# confirm against ContextEncoder.
model = ContextEncoder(final_dropout=0.5,
                       dense_neurons=64,
                       bert_trainable=True,
                       bert_type="distilbert-base-cased")

In [10]:
# Smoke test: call the model once on a dummy integer tensor of shape (3, 1, 2).
# NOTE(review): the leading axis of size 3 presumably carries the
# left / mid / right inputs — confirm against ContextEncoder.call.
fake_output = model(tf.constant([[[1,2]],[[3,4]],[[5,6]]])) 

In [11]:
fake_output

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.3498583]], dtype=float32)>

In [12]:
model.summary()

Model: "context_encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  65190912  
_________________________________________________________________
dense_input_left (Dense)     multiple                  49216     
_________________________________________________________________
dense_input_mid (Dense)      multiple                  49216     
_________________________________________________________________
dense_input_right (Dense)    multiple                  49216     
_________________________________________________________________
dense_output (Dense)         multiple                  193       
_________________________________________________________________
final_dropout (Dropout)      multiple                  0         
Total params: 65,338,753
Trainable params: 65,338,753
Non-trainable params: 0
_______________________________________

In [13]:
# Build the clinical dataset on 10% of the data with 10% augmentation.
# Fix: the original was missing the comma after `max_seq_length=128`, which
# made this cell a SyntaxError; arguments are also aligned consistently now.
# NOTE(review): the exact meaning of max_segment_length / max_seq_length is
# defined by DistilBERTDataset; the (N, 128) token tensors shown further down
# suggest max_seq_length is the padded token length — confirm.
dataset = DistilBERTDataset(dataset_type="clinical",
                            pct_data=0.1,
                            max_segment_length=5,
                            max_seq_length=128,
                            augment_pct=0.1)

In [14]:
# Run the dataset's processing step and unpack its three results.
# NOTE(review): going by the names, these are the raw sentences, their
# tokenized form, and the training labels — confirm against dataset.process().
sentences, tokenized_sentences, labels = dataset.process()



In [38]:
# Inspect the model input built from the tokenized sentences. The captured
# output shows a tuple of int32 tensors of shape (1619, 128).
dataset.format_sentences_tri_input(tokenized_sentences)

(<tf.Tensor: shape=(1619, 128), dtype=int32, numpy=
 array([[  101,  1996,  2087, ..., 24759,  3022,   102],
        [  101,  4998,  2013, ...,     0,     0,     0],
        [  101,  1996,  2190, ...,     0,     0,     0],
        ...,
        [  101,  6064,  7192, ...,     0,     0,     0],
        [  101,  1999, 25714, ...,     0,     0,     0],
        [  101,  1059, 21030, ...,     0,     0,     0]])>,
 <tf.Tensor: shape=(1619, 128), dtype=int32, numpy=
 array([[  101,  4998,  2013, ...,     0,     0,     0],
        [  101,  1996,  2190, ...,     0,     0,     0],
        [  101,  1996,  5776, ...,     0,     0,     0],
        ...,
        [  101,  1999, 25714, ...,     0,     0,     0],
        [  101,  1059, 21030, ...,     0,     0,     0],
        [  101,  1996,  2087, ..., 24759,  3022,   102]])>,
 <tf.Tensor: shape=(1619, 128), dtype=int32, numpy=
 array([[  101,  1996,  2190, ...,     0,     0,     0],
        [  101,  1996,  5776, ...,     0,     0,     0],
        [  101

## Training

In [15]:
# Metrics passed to model.compile(). The metric is named 'accuracy', matching
# the 'val_accuracy' quantity monitored by the callbacks in this notebook.
# Built from the public `tf.keras` namespace — the same one used for the
# optimizer, loss and callbacks here — instead of the private
# `tensorflow.python.keras` module.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
]

In [16]:
# Training hyper-parameters shared by the cells below.
EPOCHS, BATCH_SIZE = 100, 4
dense_output = 128

# Checkpoint location: one directory per (dataset type, number of processed
# sentences, data percentage, augmentation percentage) combination.
checkpoint_filepath = (
    '../models/DistilBERT/finetune/simple/{}-{}-{}-pct-{}-aug/checkpoint'
).format(dataset.dataset_type, len(sentences), dataset.pct_data, dataset.augment_pct)

# Early stopping on validation accuracy (patience of 10 epochs, best weights
# restored). It is constructed here but left out of `callbacks` below.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Writes the weights (not the full model) at the end of every epoch;
# save_best_only is off, so each epoch's weights are written to the same path.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode="auto",
    save_weights_only=True,
    save_best_only=False,
    save_freq="epoch",
)

# Callbacks handed to model.fit().
callbacks = [
    # early_stopping,
    model_checkpoint_callback,
]

In [17]:
checkpoint_filepath

'../models/DistilBERT/finetune/simple/clinical-1605-0.1-pct-0.1-aug/checkpoint'

In [18]:
# Compile for binary classification: Adam at a 1e-3 learning rate, binary
# cross-entropy loss, and the metrics listed in METRICS.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=METRICS)

In [19]:
# Resume from previously saved weights when they exist; otherwise keep the
# freshly initialised model (best-effort, never fatal).
# `except Exception` replaces the bare `except:` so KeyboardInterrupt and
# SystemExit still propagate, and the failure reason is reported instead of
# being silently discarded.
try:
    model.load_weights(checkpoint_filepath)
except Exception as err:
    print("No checkpoint available.")
    print("  reason: {}: {}".format(type(err).__name__, err))

No checkpoint available.


In [20]:
# Train the model on the formatted sentence inputs and their labels.
# validation_split=0.1 holds out 10% of the samples for validation, which is
# what produces the `val_accuracy` value the callbacks monitor.
# The returned History object is kept in `history`.
history = model.fit(dataset.format_sentences_tri_input(tokenized_sentences), 
                    tf.convert_to_tensor(labels), 
                    epochs=EPOCHS,
                    validation_split=0.1,
                    batch_size=BATCH_SIZE,
                    verbose=1, 
#                     class_weight=class_weight,
                    callbacks=callbacks)

Epoch 1/100
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to locate the source code of <bound method ContextEncoder.call of <src.encoders.context_encoder_bert.ContextEncoder object at 0x000001AD95F73208>>. Note that functions defined in certain environments, like the interactive Python shell do not expose their source code. If that is the case, you should to define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.do_not_convert. Original error: could not get source code
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to locate the source code of <bound method ContextEncoder.call of <src.encoders.context_encoder_bert.ContextEncoder object at 0x000001AD95F73208>>. Note that func

KeyboardInterrupt: 

## Experiment

In [75]:
sys.path.append("../")
from utils.experiments import get_experiments, save_results

In [57]:
# Fetch the 'ALBERT_FINETUNE_SIMPLE' experiments through the project helper
# and show them as a DataFrame (one row per experiment in the output).
# Note: this rebinds `config`, which earlier in the notebook held the raw
# dictionary parsed from the TOML settings file.
config = get_experiments('ALBERT_FINETUNE_SIMPLE')
config_df = pd.DataFrame.from_dict(config)
config_df

Unnamed: 0,bert_type,dataset_type,finetune_bert,pct_data,augment_pct,epochs
0,albert,clinical,True,1,0.1,200
1,albert,clinical,True,1,0.5,200
2,albert,clinical,True,1,1.0,200
3,albert,fiction,True,1,0.1,200
4,albert,fiction,True,1,0.5,200
5,albert,fiction,True,1,1.0,200
6,albert,wiki,True,1,0.1,200
7,albert,wiki,True,1,0.5,200
8,albert,wiki,True,1,1.0,200


In [69]:
# Append the experiment table to ../models/experiment.csv as space-separated
# rows without a header line or index column. mode='a' appends rather than
# overwrites, so re-running this cell writes the same rows again.
config_df.to_csv(r'../models/experiment.csv', header=None, index=None, sep=' ', mode='a')

In [59]:
# Run every experiment in `config`: build a fresh model and dataset, train,
# attach the training history to the experiment record, and persist it.
for experiment in config:
    # Fix: read the parameters from the current experiment. The original
    # indexed `config` (the whole collection) with string keys.
    bert_type = experiment['bert_type']
    dataset_type = experiment['dataset_type']
    finetune_bert = experiment['finetune_bert']
    pct_data = experiment['pct_data']
    augment_pct = experiment['augment_pct']
    epochs = experiment['epochs']
    print("params:", bert_type, dataset_type, finetune_bert, pct_data, augment_pct, epochs)

    # init model
    print("initializing model...")
    # NOTE(review): the pretrained model name is hard-coded here; `bert_type`
    # from the experiment is only printed above.
    model = ContextEncoder(final_dropout=0.5,
                           dense_neurons=64,
                           bert_trainable=finetune_bert,
                           bert_type="albert-base-v2")

    # init dataset
    print("initializing dataset...")
    # NOTE(review): `AlbertDataset` is not imported in the visible cells of
    # this notebook — make sure it is imported before running this loop.
    dataset = AlbertDataset(dataset_type=dataset_type,
                            pct_data=pct_data,
                            max_segment_length=5,
                            augment_pct=augment_pct)

    # process dataset
    print("processing dataset...")
    sentences, tokenized_sentences, labels = dataset.process()

    # create checkpoint path
    checkpoint_filepath = '../models/ALBERT/finetune/simple/{}-{}-{}-pct-{}-aug/checkpoint'.format(
                            dataset_type,
                            len(sentences),
                            pct_data,
                            augment_pct)
    print(checkpoint_filepath)

    # Fix: build the checkpoint callback per experiment so weights are written
    # to this experiment's path (the one load_weights reads below). The
    # notebook-level `callbacks` list points at a different, fixed path.
    experiment_callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=True,
            monitor='val_accuracy',
            save_best_only=False,
            mode="auto",
            save_freq="epoch")
    ]

    # compiling model
    print("compiling the model...")
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=METRICS)

    # Resume from saved weights when available (best-effort). `except
    # Exception` keeps KeyboardInterrupt / SystemExit propagating, unlike the
    # original bare `except:`, and the failure reason is reported.
    try:
        model.load_weights(checkpoint_filepath)
        print("model loaded.")
    except Exception as err:
        print("No checkpoint available.")
        print("  reason: {}: {}".format(type(err).__name__, err))

    # train
    print("starting the training process...")
    # Fix: honour the experiment's own `epochs` value instead of the
    # notebook-level EPOCHS constant.
    history = model.fit(dataset.format_sentences_tri_input(tokenized_sentences),
                        tf.convert_to_tensor(labels),
                        epochs=epochs,
                        validation_split=0.1,
                        batch_size=BATCH_SIZE,
                        verbose=1,
                        # class_weight=class_weight,
                        callbacks=experiment_callbacks)

    # assigning history to experiment object for saving.
    # NOTE(review): this stores the Keras History object itself; if
    # save_results needs plain data, `history.history` may be what it wants —
    # confirm against utils.experiments.save_results.
    experiment["history"] = history

    print("saving results...")
    save_results(experiment)

9