In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up before each cell executes, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Attach Google Drive to the Colab VM; the project checkout and all model
# artefacts used below live on the mounted drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [3]:
# Project root on the mounted Drive. Later cells add it to sys.path so the
# project's `src` and `utils` packages can be imported, and build the
# experiment-log and checkpoint paths underneath it.
root_path = "/content/drive/MyDrive/SCHOOL/PhD/Code/context-encoder-v2"

In [4]:
!pip install transformers
!pip install stop_words
!pip install symspellpy
!pip install language_detector 
!pip install cached_property
!pip install sentencepiece

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 8.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 58.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 63.7MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbe

In [7]:
import sys

# Make the project root importable. The membership check keeps this cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

In [8]:
import tensorflow as tf

from src.encoders.context_encoder_bert import ContextEncoder

from tensorflow.python import keras
import toml
import json
import pandas as pd

from src.dataset.distilbert import DistilBERTDataset

## Experiment

In [9]:
# Ensure the project root is importable before pulling in the experiment
# helpers, without adding a duplicate sys.path entry when the cell is re-run.
if root_path not in sys.path:
    sys.path.append(root_path)
from utils.experiments import get_experiments, save_results

In [10]:
# Fetch the experiment definitions registered under EXPERIMENT_NAME (per the
# original note, these come from the local `config.toml`) and tabulate them
# so the sweep can be inspected before training starts.
EXPERIMENT_NAME = 'DISTILBERT_FINETUNE_TEST'
config = get_experiments(EXPERIMENT_NAME)
config_df = pd.DataFrame(config)
config_df

Unnamed: 0,bert_type,dataset_type,finetune_bert,pct_data,augment_pct,epochs
0,distilbert,clinical,True,1,0.1,100
1,distilbert,clinical,True,1,0.5,100
2,distilbert,clinical,True,1,1.0,100
3,distilbert,fiction,True,1,0.1,100
4,distilbert,fiction,True,1,0.5,100
5,distilbert,fiction,True,1,1.0,100
6,distilbert,wiki,True,1,0.1,100
7,distilbert,wiki,True,1,0.5,100
8,distilbert,wiki,True,1,1.0,100


In [11]:
# Append this sweep's configuration to the running experiment log:
# space-separated, no header row, no index column, opened in append mode so
# earlier entries are kept. (The sys.path manipulation that used to sit here
# was dropped: the path is already set up by earlier cells and nothing in this
# cell imports from the project.)
config_df.to_csv(f'{root_path}/models/experiment.csv', header=False, index=False, sep=' ', mode='a')

In [14]:
# Settings shared by every experiment in the sweep.
BATCH_SIZE = 32
USE_EARLY_STOPPING = False  # set to True to also pass an EarlyStopping callback to fit()

# Train one model per experiment definition. For each entry: build the model
# and dataset, resume from an existing checkpoint when one is on disk, train,
# then attach the training history to the experiment and persist it.
for experiment in config:
    bert_type = experiment['bert_type']
    dataset_type = experiment['dataset_type']
    finetune_bert = experiment['finetune_bert']
    pct_data = experiment['pct_data']
    augment_pct = experiment['augment_pct']
    epochs = experiment['epochs']
    print("params:", bert_type, dataset_type, finetune_bert, pct_data, augment_pct, epochs)

    # init model
    # NOTE(review): `bert_type` from the experiment is only printed; the
    # encoder is always built with the fixed "distilbert-base-cased" name.
    print("initializing model...")
    model = ContextEncoder(
        final_dropout=0.5,
        dense_neurons=64,
        bert_trainable=finetune_bert,
        bert_type="distilbert-base-cased",
    )

    # init dataset
    print("initializing dataset...")
    dataset = DistilBERTDataset(
        dataset_type=dataset_type,
        pct_data=pct_data,
        max_seq_length=128,
        max_segment_length=5,
        augment_pct=augment_pct,
    )

    # process dataset
    print("processing dataset...")
    sentences, tokenized_sentences, labels = dataset.process()

    # create checkpoint path (one directory per dataset / size / augmentation setting)
    checkpoint_filepath = (
        f'{root_path}/models/DistilBERT/finetune/simple/'
        f'{dataset_type}-{len(sentences)}-{pct_data}-pct-{augment_pct}-aug/checkpoint'
    )
    print(checkpoint_filepath)

    # get callbacks ready.
    # With save_best_only=False the weights are rewritten after every epoch, so
    # a resumed run continues from the most recent epoch, not the best one.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        save_best_only=False,
        mode="auto",
        save_freq="epoch")

    callbacks = [model_checkpoint_callback]
    if USE_EARLY_STOPPING:
        callbacks.append(tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            verbose=1,
            patience=10,
            mode='max',
            restore_best_weights=True))

    # compiling model
    # Metrics come from the public `tf.keras` namespace, consistent with the
    # optimizer, loss and callbacks (not the private `tensorflow.python.keras`).
    print("compiling the model...")
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[
                      tf.keras.metrics.BinaryAccuracy(name='accuracy')
                  ])

    # Resume from an earlier run when its checkpoint is on disk. Weights saved
    # in TensorFlow format are written as `<prefix>.index` plus data shards, so
    # the index file marks their presence. Load errors are deliberately not
    # swallowed: a corrupt or incompatible checkpoint now fails loudly instead
    # of silently training from scratch and overwriting it.
    if tf.io.gfile.exists(checkpoint_filepath + '.index'):
        model.load_weights(checkpoint_filepath)
        print("model loaded.")
    else:
        print("No checkpoint available.")

    # train; the last 10% of the samples are held out for validation
    print("starting the training process...")
    history = model.fit(dataset.format_sentences_tri_input(tokenized_sentences),
                        tf.convert_to_tensor(labels),
                        epochs=epochs,
                        validation_split=0.1,
                        batch_size=BATCH_SIZE,
                        verbose=1,
                        callbacks=callbacks)

    # assigning history to experiment object for saving.
    experiment["history"] = history.history

    # NOTE(review): the recorded run of this cell ends in a TypeError whose
    # traceback is not preserved in the notebook output - confirm whether it
    # originates in save_results before relying on the saved results.
    print("saving results...")
    save_results(experiment)

params: distilbert clinical True 1 0.1 100
initializing model...
initializing dataset...
processing dataset...




/content/drive/MyDrive/SCHOOL/PhD/Code/context-encoder-v2/models/DistilBERT/finetune/simple/clinical-5484-1-pct-0.1-aug/checkpoint
compiling the model...
No checkpoint available.
starting the training process...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoc

TypeError: ignored