In [None]:
# Mount Google Drive into the Colab runtime; the dataset is later read from
# a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 50.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.7 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 49.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [None]:
import pandas as pd
import re

def clean(data):
  """Normalize one message: strip ASCII punctuation and lowercase it.

  Args:
    data: Raw message text (str).

  Returns:
    The whitespace-separated tokens of ``data`` with every ASCII punctuation
    character removed and all letters lowercased, joined by single spaces.
    Tokens that consist only of punctuation are dropped, so the result never
    contains doubled spaces.
  """
  import string  # local import: the notebook's import lines do not provide `string`

  # string.punctuation holds the 32 ASCII punctuation characters (backslash
  # included), which avoids spelling them in a literal with escape sequences.
  translation_table = str.maketrans('', '', string.punctuation)
  stripped = (w.translate(translation_table).lower() for w in data.split())

  # A token such as "-" or "!!!" becomes empty after stripping; skip it.
  return ' '.join(w for w in stripped if w)

data = []
labels = []

# Integer class id for each label string that appears in the file.
label_ids = {'ham': 0, 'spam': 1}

# Read the SMS spam collection: one example per line, "<label>\t<message>".
# The encoding is given explicitly so the read does not depend on the
# platform's default.
with open('/content/drive/MyDrive/dataset/SMSSpamCollection.txt', encoding='utf-8') as f:
    lines = [line.rstrip() for line in f]

for line in lines:
  # Split on the first tab only, so a tab inside the message is kept as text.
  tmp = line.split('\t', 1)

  # Skip blank/malformed lines and unknown labels. Appending the text without
  # a label would leave `data` and `labels` misaligned.
  if len(tmp) != 2 or tmp[0] not in label_ids:
    continue

  data.append(clean(tmp[1]))
  labels.append(label_ids[tmp[0]])
  

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the 0/1 labels. num_classes is fixed at 2 so the encoded
# width always matches the 2-unit softmax head, even if only one class
# happens to be present in `labels`.
# NOTE: this rebinds `labels`; run this cell only once per data load.
labels = to_categorical(labels, num_classes=2)

In [None]:
# Show the first encoded label; the recorded output [1., 0.] is class 0 ("ham") one-hot encoded.
labels[0]

array([1., 0.], dtype=float32)

In [None]:
from transformers import BertTokenizer
import numpy as np

# Fixed token length of every encoded message; must equal the SEQ_LEN used
# for the model's Input layers.
MAX_SEQ_LEN = 64

input_ids=[]
attention_masks=[]

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for sent in data:
    # padding='max_length' replaces the deprecated pad_to_max_length=True, and
    # truncation=True makes the cut to MAX_SEQ_LEN explicit (the tokenizer
    # otherwise warns and falls back to 'longest_first').
    bert_inp = bert_tokenizer.encode_plus(sent,
                                          add_special_tokens=True,
                                          max_length=MAX_SEQ_LEN,
                                          padding='max_length',
                                          truncation=True,
                                          return_attention_mask=True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

# Convert to arrays: every row has MAX_SEQ_LEN entries, so these are rectangular.
input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Sanity check: token ids, attention masks and labels must have one row per message.
tuple(len(arr) for arr in (input_ids, attention_masks, labels))

(5574, 5574, 5574)

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: keep 80% for training and set 20% aside as a holdout.
# Stratifying on the labels keeps the spam/ham ratio the same in every subset.
train_inp, holdout_inp, train_label, holdout_label, train_mask, holdout_mask = train_test_split(
   input_ids, labels, attention_masks, test_size=0.20, random_state=1000, stratify=labels)

# Step 2: cut the holdout in half to get the test and validation sets.
test_inp, val_inp, test_label, val_label, test_mask, val_mask = train_test_split(
   holdout_inp, holdout_label, holdout_mask, test_size=0.5, random_state=1000,
   stratify=holdout_label)

In [None]:
import tensorflow as tf
from transformers import TFBertModel, TFBertForPreTraining

SEQ_LEN = 64  # must equal the max_length the tokenizer padded/truncated to

bert = TFBertModel.from_pretrained('bert-base-uncased')

# The Keras input gets its own Python name so the NumPy `input_ids` array
# built by the tokenization cell is not overwritten. Re-running the split
# cell would otherwise pick up a Keras tensor instead of the data. The layer
# names ('input_ids', 'attention_mask') are unchanged.
input_ids_layer = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

# Element [0] of the BERT output is the per-token hidden-state sequence.
embeddings = bert(input_ids_layer, attention_mask=mask)[0]
# Average over the token axis to get one vector per message.
# NOTE(review): this mean runs over all SEQ_LEN positions, padding included;
# consider weighting by `mask` if padded positions should not contribute.
X = tf.keras.layers.Lambda(lambda x: tf.keras.backend.mean(x, axis=1))(embeddings)
y = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(X)

bert_model = tf.keras.Model(inputs=[input_ids_layer, mask], outputs=y)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08)

# The targets are one-hot vectors and the head is a 2-way softmax, so
# categorical cross-entropy is the matching loss.
bert_model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

bert_model.summary()

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 64)]         0                                            
__________________________________________

In [None]:
# Fine-tune the classifier on the training split and report validation
# metrics after each epoch.
history = bert_model.fit(
    x=[train_inp, train_mask],
    y=train_label,
    validation_data=([val_inp, val_mask], val_label),
    epochs=4,
    batch_size=32,
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
import tensorflow as tf
from transformers import TFBertModel, TFBertForPreTraining

SEQ_LEN = 64  # must equal the max_length the tokenizer padded/truncated to

bert = TFBertModel.from_pretrained('bert-base-uncased')

# The Keras input gets its own Python name so the NumPy `input_ids` array
# built by the tokenization cell is not overwritten. The layer names
# ('input_ids', 'attention_mask') are unchanged.
input_ids_layer = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

# Element [1] of the BERT output is the pooled sentence-level vector (the
# recorded summary shows the Dense layer wired to output index 1), so no
# extra pooling layer is needed in this variant.
embeddings = bert(input_ids_layer, attention_mask=mask)[1]
y = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(embeddings)

bert_model = tf.keras.Model(inputs=[input_ids_layer, mask], outputs=y)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

# The targets are one-hot vectors and the head is a 2-way softmax, so
# categorical cross-entropy is the matching loss.
bert_model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

bert_model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 64)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 64)]         0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
outputs (Dense)                 (None, 2)            1538        tf_bert_model_1[0][1]      

In [None]:
# Fine-tune the pooled-output classifier on the training split and report
# validation metrics after each epoch.
history = bert_model.fit(
    x=[train_inp, train_mask],
    y=train_label,
    validation_data=([val_inp, val_mask], val_label),
    epochs=5,
    batch_size=32,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
