In [1]:
import tensorflow as tf

# Ask TensorFlow which GPU device (if any) is visible to this runtime.
device_name = tf.test.gpu_device_name()

# Anything other than the first-GPU name is treated as "no GPU": abort early
# instead of continuing on a runtime without the expected accelerator.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
# Show the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Tue Jan  5 20:56:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    29W /  70W |    227MiB / 15079MiB |      6%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Install Hugging Face Transformers quietly (-q).
# NOTE(review): the version is not pinned, so a fresh run may install a different
# release than the one that produced the outputs below — consider pinning.
!pip install -q transformers

In [5]:
# Install bert-for-tf2 (presumably the source of the `bert` package imported later);
# pip's standard output is discarded.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install bert-for-tf2 >> /dev/null

In [20]:
import numpy as np
import os
import random

import bert
from bert.tokenization.bert_tokenization import FullTokenizer

In [7]:
# Switch the working directory to the project folder so that the relative paths
# used in later cells resolve.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — the
# mount step is not part of this notebook.
cd /content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [8]:
import bert_thesis_experiments.utils as utils

In [9]:
# Processed sequence data (output from dataprocess.pl).
seqfile = 'DeepDom_Code/DeepDom-master/processed_seq.txt'
# Processed label data (output from dataprocess.pl).
labelfile = 'DeepDom_Code/DeepDom-master/processed_label.txt'

In [10]:
# Parse the processed sequence file; the helper's result unpacks as (ids, sequences).
ids, seqs = utils.process_inputseqs(seqfile)
# Parse the processed label file. NOTE: `ids` is rebound here, so after this cell
# it holds the ids returned for the label file, not the sequence file.
ids, labels = utils.process_inputlabels(labelfile)

In [11]:
# Pair every sequence with the label at the same position.
# zip() stops at the shorter input, so any surplus entries are silently dropped.
rawdata = [pair for pair in zip(seqs, labels)]

In [13]:
from transformers import BertTokenizer
import os

# Build the tokenizer used throughout the notebook from the project's vocabulary file.
tokenizer = FullTokenizer(
    vocab_file=os.path.join(
        "bert_thesis_experiments/working/",
        "protein_seq_words_uniref0.5_vsz30k_mfq2-vocab.txt",
    )
)
# Disabled alternative that loads the same vocabulary file through Hugging Face:
# tokenizer2 = BertTokenizer.from_pretrained(os.path.join("bert_thesis_experiments/working/", "protein_seq_words_uniref0.5_vsz30k_mfq2-vocab.txt"))

In [21]:
# Inspect a slice of the vocabulary to sanity-check what the word pieces look like.
# `tokenizer2` only exists in a commented-out line, so referencing it fails on a
# fresh kernel; read the token -> id mapping from the active FullTokenizer instead
# (bert-for-tf2 stores it on the `vocab` attribute).
vocabulary = tokenizer.vocab

print(list(vocabulary.keys())[5000:5020])

['##qral', '##hhy', '##kskk', '##wgf', '##vlkl', '##dlii', '##ekss', '##flee', '##stgv', '##eiar', '##dvdv', '##adtv', '##tiee', '##ahw', '##qill', '##new', '##ilee', '##ilag', '##aegg', '##adsv']


In [22]:
# Walk through the encoding of a single sequence by hand: tokenize, add the
# special tokens, pad to a fixed length and build the matching masks.
max_length_test = 200
test_sentence = 'LAHVPNASLINFTDVGTSVSKLLQDYSEIVLMSDEIQQTTDKDDPFLDIVPKFMGTILLILKNLQTKFLETEKYLFETIDYFNPTNQTLQQYQQQQYQQYQQQQFQQNIINNNNNNNNNNSNNNNNNISGNTTTTTTTTTTTTTGSIINNNNNNNNNNNNSNNNIINNNNSQSNLQSLLHPQYYLSNSSSSSSSSYKITP'
print("Sequence:", test_sentence)

# Illustration only: this string is NOT what gets tokenized. The special tokens
# are added at the token level below, so the tokenizer never has to parse them.
test_sentence_with_special_tokens = '[CLS]' + test_sentence + '[SEP]'

tokenized = tokenizer.tokenize(test_sentence)

print('tokenized', tokenized)
print("No. of Tokens:", len(tokenized))

# Convert the tokens (wrapped in [CLS] ... [SEP]) to their vocabulary ids.
input_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokenized + ["[SEP]"])

# Number of pad positions needed to reach the fixed length.
padding_length = max_length_test - len(input_ids)

# Attend to the real tokens (1) and ignore the padding (0). The mask has to be
# built from the UNPADDED length: building it after padding would mark every
# position as real and make the mask longer than the other inputs.
attention_mask = [1] * len(input_ids) + [0] * padding_length

# Pad the ids with 0 up to the fixed length.
# NOTE(review): assumes id 0 is the [PAD] entry of this vocabulary — confirm.
input_ids = input_ids + ([0] * padding_length)

# Token types only matter for sentence-pair tasks (e.g. question answering);
# with a single sequence every position is segment 0.
token_type_ids = [0] * max_length_test

# Same key names as convert_sequence_to_features and the model's input names.
bert_input = {
    "input_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": attention_mask
}
print(bert_input)


Sequence: LAHVPNASLINFTDVGTSVSKLLQDYSEIVLMSDEIQQTTDKDDPFLDIVPKFMGTILLILKNLQTKFLETEKYLFETIDYFNPTNQTLQQYQQQQYQQYQQQQFQQNIINNNNNNNNNNSNNNNNNISGNTTTTTTTTTTTTTGSIINNNNNNNNNNNNSNNNIINNNNSQSNLQSLLHPQYYLSNSSSSSSSSYKITP
tokenized ['l', '##ahvp', '##n', '##asli', '##nf', '##td', '##vgt', '##svsk', '##llq', '##dy', '##seiv', '##l', '##m', '##sdei', '##qqtt', '##dkdd', '##pf', '##l', '##divp', '##kfm', '##gt', '##il', '##l', '##ilkn', '##l', '##qtk', '##flet', '##ekyl', '##feti', '##dy', '##fn', '##ptn', '##qtlq', '##qy', '##qqqq', '##yq', '##qy', '##qqqq', '##fq', '##qn', '##iinn', '##nnnnnnnn', '##snnnnnn', '##is', '##gn', '##tttttttt', '##ttttt', '##g', '##siin', '##nnnnnnnnnn', '##n', '##snnn', '##iinn', '##nnsq', '##snlq', '##sllh', '##pqy', '##ylsn', '##ssssssss', '##yk', '##it', '##p']
No. of Tokens: 62
{'token_ids': [2, 15, 18215, 41, 3886, 288, 162, 1757, 8558, 473, 315, 6572, 42, 48, 5779, 25621, 26938, 335, 42, 10769, 3952, 252, 66, 42, 8015, 42, 10207, 22795, 5601, 10424, 315, 208, 272

In [14]:
# The batch sizes usually recommended for BERT are 16 or 32; however, this dataset
# overfits quite fast and smaller batches act like a regularizer.
# Adding another dropout layer is an alternative worth trying.

batch_size = 6

In [27]:
def convert_sequence_to_features(sequence, max_length_seq=200):
    """Encode one sequence as fixed-length BERT inputs.

    The sequence is tokenized with the module-level ``tokenizer``, wrapped in
    ``[CLS]`` / ``[SEP]`` and then truncated or zero-padded so that all three
    returned lists have exactly ``max_length_seq`` entries.

    Args:
        sequence: Raw sequence string to tokenize.
        max_length_seq: Length of every returned list, including the two
            special tokens. Must be at least 2.

    Returns:
        dict with three equally long lists:
        ``"input_ids"`` (token ids, 0 in padded positions),
        ``"token_type_ids"`` (all 0, single-segment input) and
        ``"attention_mask"`` (1 for real tokens, 0 for padding).

    Raises:
        ValueError: If ``max_length_seq`` leaves no room for ``[CLS]``/``[SEP]``.
    """
    if max_length_seq < 2:
        raise ValueError("max_length_seq must be at least 2 to hold [CLS] and [SEP]")

    # Keep at most max_length_seq - 2 word pieces so the special tokens still fit.
    # Without this, a long sequence would produce input_ids/attention_mask longer
    # than token_type_ids and the examples could no longer be stacked.
    tokenized = tokenizer.tokenize(sequence)[:max_length_seq - 2]

    # Convert the tokens (wrapped in the special tokens) to vocabulary ids.
    input_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokenized + ["[SEP]"])

    # Real-token count and number of pad positions; the latter is reused below.
    tokens_length = len(input_ids)
    padding_length = max_length_seq - tokens_length

    # Pad with 0 up to the fixed length.
    # NOTE(review): assumes id 0 is the [PAD] entry of the vocabulary — confirm.
    input_ids = input_ids + ([0] * padding_length)

    # Attend to real tokens only, never to padding.
    attention_mask = ([1] * tokens_length) + ([0] * padding_length)

    # Token types matter for sentence-pair tasks only; single sequence => all 0.
    token_type_ids = [0] * max_length_seq

    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask
    }


In [35]:
# Reshape one dataset element into the (features, labels) pair that is fed to
# TFBertForSequenceClassification during training.
def map_example_to_dict(input_ids, attention_masks, token_type_ids, labels):
    """Bundle the three model inputs into a dict and pair it with the labels.

    Args:
        input_ids: Token ids of one example (or batch).
        attention_masks: Mask stored under the "attention_mask" key.
        token_type_ids: Segment ids of the example.
        labels: Training target, returned unchanged.

    Returns:
        A ``(features, labels)`` tuple where ``features`` has the keys
        "input_ids", "token_type_ids" and "attention_mask".
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, labels

def encode_examples(seqs, labels, max_seq_length=200, limit=-1):
    """Turn parallel sequences and labels into a TensorFlow dataset of BERT inputs.

    Each sequence is encoded with ``convert_sequence_to_features`` and the
    resulting columns are sliced into a ``tf.data.Dataset`` whose elements are
    reshaped by ``map_example_to_dict`` into ``(features_dict, label)`` pairs.

    Args:
        seqs: Iterable of raw sequence strings.
        labels: Iterable of labels, aligned position-by-position with ``seqs``.
        max_seq_length: Fixed length passed on to ``convert_sequence_to_features``.
        limit: Maximum number of examples to encode. Any value <= 0 (the
            default is -1) means "encode everything". Previously this argument
            was accepted but ignored.

    Returns:
        The mapped ``tf.data.Dataset`` (not shuffled, not batched).
    """
    # One column per model input; the dataset is built from these slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for index, (seq, label) in enumerate(zip(seqs, labels)):
        # Honor the example cap; a non-positive limit disables it.
        if limit > 0 and index >= limit:
            break

        bert_input = convert_sequence_to_features(seq, max_seq_length)

        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append(label)

    # Column order must match map_example_to_dict's positional parameters:
    # (input_ids, attention_masks, token_type_ids, labels).
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    )
    return dataset.map(map_example_to_dict)


In [17]:
# Positional 90/10 split: the first 90% of `rawdata` is used for training and the
# remainder for validation. The data is not shuffled before splitting.
train_num = int(0.9 * len(rawdata))
train_data, validation_data = rawdata[:train_num], rawdata[train_num:]

In [18]:
# Sanity check: number of training examples, then number of validation examples.
print(len(train_data), len(validation_data))

28260 3140


In [37]:
# Training set: sequences as inputs, labels converted by utils.convertlabels_to_binary.
train_inputX = [seq for seq, _ in train_data]
train_inputY = [utils.convertlabels_to_binary(label) for _, label in train_data]
# NOTE(review): the shuffle buffer (10000) is smaller than the training set size
# printed earlier, so the shuffle is only approximate — confirm this is intended.
train_encoded = (
    encode_examples(train_inputX, train_inputY)
    .shuffle(10000)
    .batch(batch_size)
)

# Validation set: same encoding, batched but deliberately left unshuffled.
val_inputX = [seq for seq, _ in validation_data]
val_inputY = [utils.convertlabels_to_binary(label) for _, label in validation_data]
val_encoded = encode_examples(val_inputX, val_inputY).batch(batch_size)

In [38]:
# Show the element spec (shapes and dtypes) of the batched training dataset.
print(train_encoded)

<BatchDataset shapes: ({input_ids: (None, 200), token_type_ids: (None, 200), attention_mask: (None, 200)}, (None, 200)), types: ({input_ids: tf.int32, token_type_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>


In [47]:
from transformers import TFBertForSequenceClassification, TFPreTrainedModel
import tensorflow as tf

In [64]:

# Learning rates commonly recommended for fine-tuning BERT with Adam: 5e-5, 3e-5, 2e-5.
learning_rate = 2e-5

# A single epoch is used for illustration; more epochs may help as long as the
# model does not start to overfit.
number_of_epochs = 1

# Model initialization: load the pretrained PyTorch checkpoint into the TF model
# (from_pt=True). The classification head is newly initialized with 200 outputs.
# NOTE(review): 200 presumably mirrors the label width of the encoded datasets — confirm.
model = TFBertForSequenceClassification.from_pretrained(
    'bert_thesis_experiments/pytorch_bert_pretrained_models/pytorch_model.bin',
    config='bert_thesis_experiments/pytorch_bert_pretrained_models/config.json',
    from_pt=True,
    output_attentions=False,
    output_hidden_states=False,
    num_labels=200)

# Adam is the optimizer recommended for fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# Each example carries a vector of independent 0/1 targets, so this is a
# multi-label problem scored with binary cross-entropy per output unit.
# TFBertForSequenceClassification returns raw logits (no sigmoid is applied), so
# the loss must be told so; with from_logits=False the logits would be
# misinterpreted as probabilities.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# For the same reason the accuracy threshold is applied to logits:
# logit > 0 is equivalent to sigmoid(logit) > 0.5.
metric = tf.keras.metrics.BinaryAccuracy('accuracy', threshold=0.0)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
# Print the layer list and parameter counts of the compiled model.
model.summary()

Model: "tf_bert_for_sequence_classification_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_265 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  153800    
Total params: 109,636,040
Trainable params: 109,636,040
Non-trainable params: 0
_________________________________________________________________


In [66]:
# Fine-tune for `number_of_epochs` epochs on the training batches, using
# `val_encoded` as validation data; the value returned by fit() is kept in
# `bert_history`.
bert_history = model.fit(train_encoded, epochs=number_of_epochs, validation_data=val_encoded)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




## References
1. https://medium.com/atheros/text-classification-with-transformers-in-tensorflow-2-bert-2f4f16eff5ad
2. https://github.com/atherosai/python-graphql-nlp-transformers/tree/master/notebooks/BERT%20fine-tunning%20in%20Tensorflow%202%20with%20Keras%20API