In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
#import matplotlib.pyplot as plt
#from transformers import DistilBertTokenizerFast
#from sklearn.model_selection import train_test_split

In [2]:
papers = pd.read_csv('pubmed_causal_language_use.csv', encoding='utf-8')

num_labels = len(papers.label.unique())
print(num_labels)
papers

4


Unnamed: 0,sentence,label
0,Levels of cholesterol fractions in patients wi...,0
1,Faster aspart and IAsp were confirmed noninfer...,0
2,Major operative morbidity after minimally inva...,0
3,"The promise of combining risk assessment, comm...",0
4,PPARÎ´ peroxisome proliferator-activated recep...,0
...,...,...
3056,The etiology of anemia appears to be iron-rela...,3
3057,DM is associated with poor outcomes in patient...,3
3058,The BDI is a significant predictor of long-ter...,3
3059,Poor glycemic control among diabetics is a ris...,3


In [3]:

seq_len = 512
num_samples = len(papers)

# add empty arrays for token ids and attention mask
Xids = np.zeros((num_samples, seq_len))
Xmask = np.zeros((num_samples, seq_len))

Xids.shape


(3061, 512)

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize each sentence and insert their token ids and attention mask to Xids and Xmask
for i, phrase in enumerate(papers['sentence']):
    tokens = tokenizer.encode_plus(
        phrase,
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf')
    Xids[i, : ] = tokens['input_ids']
    Xmask[i, :] = tokens['attention_mask']

In [5]:
# onehot encode label data

arr = papers.label.values

labels = np.zeros((num_samples, arr.max()+1))

labels[np.arange(num_samples), arr] = 1
labels

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [7]:
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

dataset.take(1)

<TakeDataset shapes: ((512,), (512,), (4,)), types: (tf.float64, tf.float64, tf.float64)>

In [8]:

def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask':masks}, labels

dataset = dataset.map(map_func)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [9]:
dataset.take(1)

<TakeDataset shapes: ({input_ids: (512,), attention_mask: (512,)}, (4,)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [10]:
batch_size = 16

dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

dataset.take(1)

<TakeDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 4)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [11]:
split = .7

size_ = int((num_samples / batch_size) * split)

train_ds = dataset.take(size_)
val_ds  =  dataset.skip(size_)



In [12]:
from transformers import TFAutoModel

bert_ = TFAutoModel.from_pretrained('bert-base-cased')

bert_.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Input layers
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name = 'attention_mask', dtype='int32')

# create embeddings from bert using the pooled layer [1]
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

x = tf.keras.layers.Dense(1024, activation = 'relu')(embeddings)
y = tf.keras.layers.Dense(arr.max()+1, activation='softmax', name='outputs')(x)

In [None]:
# Put all the created layers into a model object

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)

loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [None]:
history = model.fit(
    train_ds,
    validation_data = val_ds,
    epochs = 3)

In [14]:
import tensorflow as tf

print(tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))
print(tf.test.is_built_with_cuda())
tf.config.list_physical_devices('GPU')


False
True


[]