In [1]:
# Load IPython's autoreload extension and set it to mode 2, which re-imports
# all modules before each cell runs, so edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project root configured further
# down in this notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers
!pip install stop_words
!pip install symspellpy
!pip install language_detector 
!pip install cached_property
!pip install sentencepiece
!pip install config
!pip install umap
!pip install sentence-transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 15.7MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 57.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████

In [4]:
# Location of the project checkout on the mounted Google Drive.
root_path = "/content/drive/MyDrive/SCHOOL/PhD/Code/context-encoder-v2"
import sys, os
# Imported before sys.path is modified below, so `config` is resolved from the
# search path as it exists at this point.
import config

# Publish the absolute project root as an attribute of the config module.
# NOTE(review): presumably project modules read config.root_path — confirm.
config.root_path = os.path.abspath(root_path)
# Put the project root first on sys.path; the `src.*` and `utils.*` imports in
# the next cell presumably rely on this.
sys.path.insert(0, config.root_path)

In [5]:
import tensorflow as tf

from src.encoders.context_encoder_ldabert import ContextEncoder, ContextEncoderComplex
from src.dataset.ldabert import LDABERTDataset

from tensorflow.python import keras
import toml
import json
import pandas as pd
import numpy as np
from utils.experiments import get_experiments_json, get_experiments, save_results

## Experiment

In [6]:
# Build the dataset wrapper for the "clinical" corpus with pct_data=1,
# augment_pct=1 and segments capped at 5 (max_segment_length).
dataset = LDABERTDataset(
    dataset_type="clinical",
    pct_data=1,
    max_segment_length=5,
    augment_pct=1,
)

In [7]:
# Run the dataset's processing step with preprocessing disabled; it returns the
# sentences, their tokenized form and the labels.
sentences, tokenized_sentences, labels = dataset.process(preprocess=False)

In [8]:
# Cache file for the LDA-BERT vectors of this dataset configuration.
# The directory name is derived from dataset.dataset_type instead of being
# hard-coded to "clinical", matching the path template used by the experiment
# loop; for the clinical dataset the resulting path is unchanged.
vectors_path = '../../data/{}_vectors/lda_bert_{}_{}_{}.pkl'.format(dataset.dataset_type,
                                                                    dataset.dataset_type,
                                                                    dataset.pct_data,
                                                                    dataset.augment_pct)

# Try the cache first.
saved_vectors, saved_labels, saved_sentences = dataset.get_vectors(vectors_path)

# An empty result is treated as "no cache available": build the vectors,
# passing the same path to create_vectors.
if len(saved_vectors) == 0:
    saved_vectors, saved_labels, saved_sentences = dataset.create_vectors(vectors_path)

In [9]:
# Split the saved vectors into the three inputs (left / middle / right) that
# are later passed to model.fit as a list.
left_input, mid_input, right_input = dataset.format_sentences_tri_input(saved_vectors)

In [10]:
# Load the experiment definitions registered under 'ldabert_complex_test' and
# show them as a table (one row per experiment, one column per hyper-parameter).
experiments_config = get_experiments_json('ldabert_complex_test')
experiments_config_df = pd.DataFrame.from_dict(experiments_config)
experiments_config_df

Unnamed: 0,bert_type,dataset_type,final_dropout,dense_neurons,lstm_size,lstm_dropout_percentage,cnn_filters,cnn_kernel_size,pool_size,pct_data,augment_pct,epochs
0,ldabert,clinical,0.2,64,256,0.2,8,3,2,1,1,1000
1,ldabert,clinical,0.2,64,256,0.5,8,3,2,1,1,1000
2,ldabert,clinical,0.2,64,256,0.8,8,3,2,1,1,1000
3,ldabert,clinical,0.2,256,256,0.2,8,3,2,1,1,1000
4,ldabert,clinical,0.2,256,256,0.5,8,3,2,1,1,1000
5,ldabert,clinical,0.2,256,256,0.8,8,3,2,1,1,1000
6,ldabert,clinical,0.5,64,256,0.2,8,3,2,1,1,1000
7,ldabert,clinical,0.5,64,256,0.5,8,3,2,1,1,1000
8,ldabert,clinical,0.5,64,256,0.8,8,3,2,1,1,1000
9,ldabert,clinical,0.5,256,256,0.2,8,3,2,1,1,1000


In [12]:
# Append the configuration table to the shared experiment log as
# space-separated rows, with neither a header row nor the index column.
# Because mode='a' appends, re-running this cell writes the same rows again.
experiments_config_df.to_csv(
    r'../models/experiment.csv',
    mode='a',
    sep=' ',
    header=False,
    index=False,
)

In [15]:
def get_random_hash(k):
    """Return a random string of ``k`` ASCII letters and digits.

    Characters are drawn with replacement from the ``random`` module's global
    generator, so the result is a run tag, not a security token.
    """
    import random
    import string

    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=k)
    return ''.join(picked)

In [19]:
# Train one ContextEncoder per entry of `experiments_config`.
#
# For every experiment the loop builds the model from the hyper-parameters,
# loads (or creates) the cached dataset vectors, trains with a 25% validation
# split, then stores the Keras training history and a random run hash on the
# experiment dict and hands it to save_results().

# Identical for every experiment, so defined once outside the loop.
BATCH_SIZE = 512

for experiment in experiments_config:
    # Unpack the hyper-parameters of this run.
    bert_type = experiment['bert_type']
    dataset_type = experiment['dataset_type']
    final_dropout = experiment['final_dropout']
    dense_neurons = experiment['dense_neurons']
    lstm_size = experiment['lstm_size']
    lstm_dropout_percentage = experiment['lstm_dropout_percentage']
    cnn_filters = experiment['cnn_filters']
    cnn_kernel_size = experiment['cnn_kernel_size']
    pool_size = experiment['pool_size']
    pct_data = experiment['pct_data']
    augment_pct = experiment['augment_pct']
    epochs = experiment['epochs']
    # Short random tag that keeps the checkpoint directories of different runs apart.
    random_hash = get_random_hash(5)
    print("params:", experiment)

    # init model
    print("initializing model...")
    model = ContextEncoder(final_dropout=final_dropout,
                           dense_neurons=dense_neurons,
                           lstm_size=lstm_size,
                           lstm_dropout_percentage=lstm_dropout_percentage,
                           cnn_filters=cnn_filters,
                           cnn_kernel_size=cnn_kernel_size,
                           pool_size=pool_size)

    # print("number of params: ", sum([np.prod(keras.get_value(w).shape) for w in model.trainable_weights]))

    # init dataset
    print("initializing dataset...")
    dataset = LDABERTDataset(dataset_type=dataset_type,
                             pct_data=pct_data,
                             max_segment_length=5,
                             augment_pct=augment_pct)

    # process dataset
    print("processing dataset...")
    sentences, tokenized_sentences, labels = dataset.process(preprocess=False)

    # Cache file for the vectors of this dataset configuration.
    vectors_path = '../../data/{}_vectors/lda_bert_{}_{}_{}.pkl'.format(dataset.dataset_type,
                                                                        dataset.dataset_type,
                                                                        dataset.pct_data,
                                                                        dataset.augment_pct)

    saved_vectors, saved_labels, saved_sentences = dataset.get_vectors(vectors_path)

    # An empty result is treated as "no cache available": build the vectors.
    if len(saved_vectors) == 0:
        saved_vectors, saved_labels, saved_sentences = dataset.create_vectors(vectors_path)

    left_input, mid_input, right_input = dataset.format_sentences_tri_input(saved_vectors)

    # get class weight
    # Balanced weights: each class contributes total/2 to the weighted sample count.
    # NOTE(review): the counts come from `labels` (returned by process()), while
    # training uses `saved_labels` — confirm both have the same class balance
    # before re-enabling class_weight in model.fit below.
    neg, pos = np.bincount(labels.flatten())
    initial_bias = np.log([pos/neg])

    total = len(labels)
    weight_for_0 = (1 / neg)*(total)/2.0
    weight_for_1 = (1 / pos)*(total)/2.0

    class_weight = {0: weight_for_0, 1: weight_for_1}
    print("class weight", class_weight)

    # create checkpoint path
    # Layout: <root>/models/LDABERT/complex/<dataset>/<n_sentences>-<pct>-pct-<aug>-aug_<hash>/checkpoint.ckpt
    # The project root must be the first argument: passing dataset_type first
    # produced a relative path ("clinical/models/.../<root>/...") instead of a
    # location under the project directory.
    checkpoint_filepath = '{}/models/LDABERT/complex/{}/{}-{}-pct-{}-aug_{}/checkpoint.ckpt'.format(
                            config.root_path,
                            dataset.dataset_type,
                            len(sentences),
                            dataset.pct_data,
                            dataset.augment_pct,
                            random_hash)
    print(checkpoint_filepath)

    # get callbacks ready.
    # With save_best_only=False the weights are written after every epoch to the
    # same file, so the checkpoint always holds the most recent epoch.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        save_best_only=False,
        mode="auto",
        save_freq="epoch")

    # Built but currently not passed to fit() (see the commented entry in
    # `callbacks`), so every experiment runs for the full number of epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        verbose=1,
        patience=10,
        mode='max',
        restore_best_weights=True)

    callbacks = [
    #     early_stopping,
        model_checkpoint_callback
    ]

    # compiling model
    print("compiling the model...")
    # The metric is taken from tf.keras like the optimizer, loss and callbacks,
    # rather than from the private tensorflow.python.keras namespace.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[
                      tf.keras.metrics.BinaryAccuracy(name='accuracy')
                  ])

    # try:
    #     model.load_weights(checkpoint_filepath)
    #     print("model loaded.")
    # except:
    #     print("No checkpoint available.")

    print("starting the training process...")
    history = model.fit([left_input, mid_input, right_input],
                        tf.convert_to_tensor(saved_labels),
                        epochs=epochs,
                        validation_split=0.25,
                        batch_size=BATCH_SIZE,
                        verbose=1,
                        # class_weight=class_weight,
                        callbacks=callbacks)

    # assigning history to experiment object for saving.
    experiment["history"] = history.history
    experiment["hash"] = random_hash

    print("saving results...")
    save_results(experiment)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 686/1000
Epoch 687/1000
Epoch 688/1000
Epoch 689/1000
Epoch 690/1000
Epoch 691/1000
Epoch 692/1000
Epoch 693/1000
Epoch 694/1000
Epoch 695/1000
Epoch 696/1000
Epoch 697/1000
Epoch 698/1000
Epoch 699/1000
Epoch 700/1000
Epoch 701/1000
Epoch 702/1000
Epoch 703/1000
Epoch 704/1000
Epoch 705/1000
Epoch 706/1000
Epoch 707/1000
Epoch 708/1000
Epoch 709/1000
Epoch 710/1000
Epoch 711/1000
Epoch 712/1000
Epoch 713/1000
Epoch 714/1000
Epoch 715/1000
Epoch 716/1000
Epoch 717/1000
Epoch 718/1000
Epoch 719/1000
Epoch 720/1000
Epoch 721/1000
Epoch 722/1000
Epoch 723/1000
Epoch 724/1000
Epoch 725/1000
Epoch 726/1000
Epoch 727/1000
Epoch 728/1000
Epoch 729/1000
Epoch 730/1000
Epoch 731/1000
Epoch 732/1000
Epoch 733/1000
Epoch 734/1000
Epoch 735/1000
Epoch 736/1000
Epoch 737/1000
Epoch 738/1000
Epoch 739/1000
Epoch 740/1000
Epoch 741/1000
Epoch 742/1000
Epoch 743/1000
Epoch 744/1000
Epoch 745/1000
Epoch 746/1000
Epoch 747/1000
Epoch

KeyboardInterrupt: ignored