In [1]:
# General
import os
import shutil
from collections import Counter
from tempfile import mkdtemp
from shutil import rmtree
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (
    SelectKBest, VarianceThreshold, f_classif)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, balanced_accuracy_score, make_scorer
from sklearn.preprocessing import LabelBinarizer

# Custom
from data_io import read_data
from utils import label_map, normalize

# BERT
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

### Load Data

In [4]:
# Training split: read the texts and labels, then translate every label
# through `label_map` into a NumPy array.
texts_train, labels_train = read_data(mode='train')
y_train_full = np.asarray(
    [label_map[raw_label] for raw_label in labels_train]
)

In [5]:
# Validation split: read the texts and labels, then translate every label
# through `label_map` into a NumPy array.
texts_val, labels_val = read_data(mode='val')
y_val_full = np.asarray(
    [label_map[raw_label] for raw_label in labels_val]
)

In [6]:
# Test split: read the texts and labels, then translate every label
# through `label_map` into a NumPy array.
texts_test, labels_test = read_data(mode='test')
y_test_full = np.asarray(
    [label_map[raw_label] for raw_label in labels_test]
)

### BERT
Load the pretrained model from TensorFlow Hub.

In [1]:
# Key selecting which TF Hub BERT variant the rest of the notebook uses.
bert_model_name = 'experts_pubmed'

# TF Hub encoder handles, keyed by model name.
map_name_to_handle = dict(
    experts_pubmed='https://tfhub.dev/google/experts/bert/pubmed/2',
)

# TF Hub preprocessing handles, keyed by the same model names.
map_model_to_preprocess = dict(
    experts_pubmed='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
)

# Resolve both handles for the selected model in one step.
tfhub_handle_encoder, tfhub_handle_preprocess = (
    map_name_to_handle[bert_model_name],
    map_model_to_preprocess[bert_model_name],
)

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/google/experts/bert/pubmed/2
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [9]:
# Standalone instance of the TF Hub preprocessing layer for the selected model.
bert_preprocess_model = hub.KerasLayer(handle=tfhub_handle_preprocess)

### Preprocessing

In [10]:
# One-hot encoder for the label ids; the arguments are LabelBinarizer's
# defaults, written out explicitly.
labelencoder = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [11]:
# Fit the binarizer on the training labels only, then reuse that fitted
# class-to-column mapping for validation and test. Re-fitting on each split
# would derive the one-hot layout from whichever classes occur in that split,
# so the columns could differ from the ones the model is evaluated against.
y_train_oh = labelencoder.fit_transform(y_train_full)
y_val_oh = labelencoder.transform(y_val_full)
y_test_oh = labelencoder.transform(y_test_full)

In [13]:
# Wrap each split as a tf.data pipeline of (raw text, one-hot label) pairs.
# The texts stay as plain strings here (we use TF Hub BERT preprocessor).
train_ds = tf.data.Dataset.from_tensor_slices((
    tf.convert_to_tensor(texts_train, dtype=tf.string),
    tf.convert_to_tensor(y_train_oh, dtype=tf.int32),
))
val_ds = tf.data.Dataset.from_tensor_slices((
    tf.convert_to_tensor(texts_val, dtype=tf.string),
    tf.convert_to_tensor(y_val_oh, dtype=tf.int32),
))
test_ds = tf.data.Dataset.from_tensor_slices((
    tf.convert_to_tensor(texts_test, dtype=tf.string),
    tf.convert_to_tensor(y_test_oh, dtype=tf.int32),
))

### Load and build model

In [3]:
# Standalone instance of the pretrained BERT encoder from TF Hub.
bert_model = hub.KerasLayer(handle=tfhub_handle_encoder)

In [16]:
def build_classifier_model(train_transformer=False, num_classes=5, dropout_rate=0.2):
    """Build the BERT text classifier: raw strings in, class probabilities out.

    Graph: string input -> TF Hub preprocessing layer -> TF Hub BERT encoder
    -> dropout on the encoder's ``pooled_output`` -> Dense(128) -> Dense(128)
    -> softmax layer named ``classifier``.

    Args:
        train_transformer: Passed as ``trainable`` to the BERT encoder layer;
            ``False`` keeps the encoder frozen.
        num_classes: Number of units in the softmax output layer. The default
            of 5 matches the previously hard-coded value.
        dropout_rate: Dropout rate applied to the pooled encoder output. The
            default of 0.2 matches the previously hard-coded value.

    Returns:
        An uncompiled ``tf.keras.Model`` whose input is the ``text`` string
        tensor and whose output is the softmax layer.
    """
    # Input Layer: one scalar string per example
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Preprocessing with preprocessor (runs inside the model, so callers feed raw text)
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)

    # Pass through pretrained BERT model; the layer name is relied on by
    # code that later looks the encoder up via get_layer("BERT_encoder").
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=train_transformer, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    # NOTE(review): both hidden Dense layers have no activation, so together
    # they form a single linear map. Left unchanged on purpose: the saved
    # fine-tuned weights were produced with exactly this architecture.
    net = tf.keras.layers.Dense(128)(net)
    net = tf.keras.layers.Dense(128)(net)
    net = tf.keras.layers.Dense(num_classes, activation="softmax", name='classifier')(net)

    return tf.keras.Model(text_input, net)

#### Load fine-tuned weights
Please reach out to us if you would like us to send you the fine-tuned weights of the transformer.

In [None]:
# Load pretrained model from TF Hub, then load the weights from the fine-tuning notebook.
# Weights have been stored in the Weights & Biases Cloud (https://wandb.ai)

# Mirror across at most the first two visible GPUs (the original setup used
# GPU:0 and GPU:1). If no GPU is visible, pass None so MirroredStrategy picks
# its default devices instead of failing on hard-coded device names.
gpu_devices = [gpu.name for gpu in tf.config.list_logical_devices("GPU")[:2]]
strategy = tf.distribute.MirroredStrategy(gpu_devices or None)
with strategy.scope():

    # Build and compile full model (encoder created as trainable, as in fine-tuning)
    embedding_model = build_classifier_model(True)
    embedding_model.compile(
        optimizer="Adam",
        loss="CategoricalCrossentropy",
        metrics=['accuracy']
    )

    # Load fine-tuned weights
    embedding_model.load_weights("./weights/bert_finetuned_1epoch.h5")

In [None]:
# Sanity check of the restored weights: score the model on the validation
# split, fed in batches of 32.
val_batches = val_ds.batch(32)
embedding_model.evaluate(val_batches)

#### Build new model to extract embeddings

In [48]:
# Builds a new model by detaching the classification head and only predicting
# the pooled output of the language model. It shares its layers with
# `embedding_model`, so it uses the weights loaded into that model.
#
# Reuse the strategy `embedding_model` was created under instead of
# instantiating a second MirroredStrategy: the shared variables belong to that
# strategy, and wrapping them in a different one is redundant and fragile.
with strategy.scope():
    encoder_outputs = embedding_model.get_layer("BERT_encoder").output
    model_embed = tf.keras.Model(
        inputs=embedding_model.input,
        outputs=encoder_outputs["pooled_output"],
    )

model_embed.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_type_ids': ( 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'pooled_output': (N 109482241   preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
Total params: 109,482,241
Trainable params: 109,482,240
Non-trainable params: 1
____________

### Predict Embeddings

In [49]:
# Embed the training split, feeding the encoder-only model batches of 128.
train_batches = train_ds.batch(128)
embed_train = model_embed.predict(train_batches)

In [50]:
# Embed the validation split, feeding the encoder-only model batches of 128.
val_embed_batches = val_ds.batch(128)
embed_val = model_embed.predict(val_embed_batches)

In [51]:
# Embed the test split, feeding the encoder-only model batches of 128.
test_batches = test_ds.batch(128)
embed_test = model_embed.predict(test_batches)

### Store embeddings
Please reach out to us if you would like the predicted embeddings to accelerate the pipeline.

In [52]:
# Persist the training embeddings (np.save appends the ".npy" suffix itself).
np.save(
    file="./data/train_bert_finetuned_1epoch_embed_unnormalized_preprocessed",
    arr=embed_train,
    allow_pickle=True,
)

In [53]:
# Persist the validation embeddings (np.save appends the ".npy" suffix itself).
np.save(
    file="./data/val_bert_finetuned_1epoch_embed_unnormalized_preprocessed",
    arr=embed_val,
    allow_pickle=True,
)

In [54]:
# Persist the test embeddings (np.save appends the ".npy" suffix itself).
np.save(
    file="./data/test_bert_finetuned_1epoch_embed_unnormalized_preprocessed",
    arr=embed_test,
    allow_pickle=True,
)