In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import nltk
import numpy as np
import re
import string

In [3]:
# remove stop words
# remove links
# remove punctuation
# remove hashtags

def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

def strip_all_entities(text):
    entity_prefixes = ['@','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

def preprocess(text):
    text = text.lower()
    text = strip_all_entities(strip_links(text))
    text = nltk.word_tokenize(text)
    text = " ".join([word for word in text if word not in nltk.corpus.stopwords.words('english')])

    return text

In [4]:
train_df = pd.read_csv("./datasets/tweets/train.csv")

In [5]:
train_texts = train_df['text'].apply(preprocess)
train_labels = train_df["target"]

del [train_df]

In [6]:
seq_len = 512
num_samples = len(train_texts)

Xids = np.zeros((num_samples, seq_len))
Xmask = np.zeros((num_samples, seq_len))

labels = train_labels.to_numpy()
labels = np.expand_dims(labels, axis=0).T

Xids.shape

(7613, 512)

In [7]:
from transformers import BertTokenizer
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, phrase in tqdm(enumerate(train_texts)):
    token = tokenizer.encode_plus(
        phrase, max_length=seq_len, add_special_tokens=True, 
        padding="max_length", truncation=True, return_tensors='tf')

    Xids[i, :] = token['input_ids']
    Xmask[i, :] = token['attention_mask']

7613it [00:03, 2057.57it/s]


<hr>

In [8]:
def map_func(inputs_ids, masks, labels):
    return {
        'input_ids': inputs_ids,
        'attention_mask': masks
    }, labels

In [9]:
batch_size = 8
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))
dataset = dataset.map(map_func)
dataset = dataset.shuffle(buffer_size=1000).batch(batch_size, drop_remainder=True)
dataset.take(1)

<TakeDataset shapes: ({input_ids: (8, 512), attention_mask: (8, 512)}, (8, 1)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.int64)>

In [10]:
split = 0.9
size = int((num_samples/batch_size) * split)

In [11]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

# del [dataset, Xids, Xmask, labels]

In [12]:
from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained('bert-base-cased')

# make untrainable
bert.trainable = False

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
input_ids = keras.layers.Input(shape=(seq_len,), name="input_ids", dtype="int32")
attention_mask = keras.layers.Input(shape=(seq_len,), name="attention_mask", dtype="int32")

embeddings = bert.bert(input_ids, attention_mask=attention_mask)[1]

x = layers.Dense(1024, activation="relu")(embeddings)
x = layers.Dropout(0.5)(x)
x = layers.Dense(1, activation="sigmoid")(x)

In [14]:
model = keras.Model(inputs=[input_ids, attention_mask], outputs=x)

In [15]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPooling(last_hidd               'attention_mask[0][0]']         
                                en_state=(None, 512                                               
                                , 768),                                                       

In [16]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [17]:
class SaveModelCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        self.model.save_weights(f"./models/tf/model_weights_{epoch}.h5")

In [19]:
history = model.fit(
    train_ds,
    epochs=10,
    validation_data=val_ds,
    verbose=1,
    callbacks=[SaveModelCallback()]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
from tensorflow import keras
from tensorflow.keras import layers
from transformers import TFAutoModel
from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained('bert-base-cased')

# make untrainable
seq_len = 512
bert.trainable = False
input_ids = keras.layers.Input(shape=(seq_len,), name="input_ids", dtype="int32")
attention_mask = keras.layers.Input(shape=(seq_len,), name="attention_mask", dtype="int32")

embeddings = bert.bert(input_ids, attention_mask=attention_mask)[1]

x = layers.Dense(1024, activation="relu")(embeddings)
x = layers.Dropout(0.5)(x)
x = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs=[input_ids, attention_mask], outputs=x)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [11]:
# load the best model
model.load_weights(f"./models/tf_bert/model_weights_9.h5")

In [12]:
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [15]:
import numpy as np
def get_bert_embedding(texts):

    if type(texts) == str:
        texts = [texts]

    num_samples = len(texts)

    Xids = np.zeros((num_samples, seq_len))
    Xmask = np.zeros((num_samples, seq_len))

    for i, phrase in enumerate(texts):
        token = bert_tokenizer.encode_plus(
        phrase, max_length=seq_len, add_special_tokens=True, 
        padding="max_length", truncation=True, return_tensors='tf')

    Xids[i, :] = token['input_ids']
    Xmask[i, :] = token['attention_mask']

    return Xids, Xmask

In [20]:
ids, mask = get_bert_embedding("There is such a violent earthquake here in my city of Berlin")

In [21]:
model([ids, mask])

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.4508171]], dtype=float32)>

C:\Users\vedan\AppData\Local\Programs\Python\Python38\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\vedan\AppData\Local\Programs\Python\Python38\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


C:\Users\vedan\AppData\Local\Programs\Python\Python38\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\vedan\AppData\Local\Programs\Python\Python38\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is s

AttributeError: 'NLP' object has no attribute 'seq_len'