In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from sklearn import model_selection
from sklearn import metrics
import tokenization

### BERT encode function
Splits each text into tokens, converts the tokens into IDs and pads every sequence to a fixed length. The function returns three arrays: token IDs, padding masks and segment IDs.

In [2]:
def bert_encode(texts, tokenizer, max_len=160):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated so that two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]``, mapped to vocabulary ids and right-padded
    with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, including the ``[CLS]`` and
            ``[SEP]`` tokens. Must be at least 2.

    Returns:
        Tuple ``(tokens, masks, segments)`` of NumPy arrays with one row of
        length ``max_len`` per text: the token ids, a mask holding 1 on real
        tokens and 0 on padding, and all-zero segment ids.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    # With max_len < 2 the truncation slice below would use a negative bound
    # and pad_len would go negative, yielding rows of inconsistent length.
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to fit [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        # Single-sentence input: every position belongs to segment 0.
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

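## Build function
This function creates the three input layers for the BERT model (word IDs, mask and segment IDs), adds a single sigmoid output unit on top of the first-token output, and compiles the model with the Adam optimizer and binary cross-entropy as the loss function.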

In [3]:
def build_model(bert_layer, max_len=160):
    """Build and compile a binary classifier on top of a BERT layer.

    Three integer inputs of length ``max_len`` (word ids, mask, segment ids)
    are fed to ``bert_layer``. The vector at position 0 of its second output
    is passed to a single sigmoid unit.

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs; only the second one is used here.
        max_len: Sequence length of each input; it must match the length the
            texts were encoded with.

    Returns:
        A compiled Keras ``Model`` using Adam (learning rate 2e-6), binary
        cross-entropy loss and the accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 is where bert_encode places the [CLS] token.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    model.compile(Adam(learning_rate=2e-6), loss='binary_crossentropy', metrics=['accuracy'])

    return model

### Load BERT layer

In [None]:
%%time
# Alternative checkpoint (English uncased, L-24/H-1024/A-16), kept for reference:
#module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# Active checkpoint: the multilingual cased BERT (L-12/H-768/A-12) module.
module_url ="https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1"
# trainable=True is requested so the BERT weights can be updated while fitting.
bert_layer = hub.KerasLayer(module_url, trainable=True)

### Load dataset and split the dataset into training and test sets

In [72]:
# Read the CSV with latin1 decoding; later cells use its 'content' and 'label' columns.
dataset=pd.read_csv("gossipcop_content_no_ignore.csv",encoding='latin1')

In [73]:
# Hold out 20% of the rows for testing. The seed is fixed so the partition is
# identical after every kernel restart and results stay comparable between runs.
train, test = model_selection.train_test_split(dataset, test_size=0.20, random_state=42)

### Encode the data using the helper function `bert_encode()`

In [11]:
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
# The vocabulary file and the lower-casing flag are taken from the loaded hub
# layer itself, so the tokenizer is configured from the same checkpoint.
# .numpy() is called on each attribute to obtain a plain value.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

To reduce the training size, only the first 500 training samples are used; the size of the test set remains the same.

In [64]:
# Take the first 500 rows of the training split once, then pick columns from it;
# the test split is encoded in full.
train_subset = train.iloc[0:500]

train_input = bert_encode(train_subset['content'], tokenizer, max_len=160)
train_labels = train_subset['label']

test_input = bert_encode(test['content'], tokenizer, max_len=160)
test_labels = test['label']

### Building the model and training the model

In [65]:
# max_len fixes the Input shapes, so it has to equal the max_len passed to
# bert_encode when the inputs were encoded (160).
model = build_model(bert_layer, max_len = 160)
model.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 177853441   input_word_ids[0][0]             
                                                                 input_mask[0][0]     

In [66]:
# Fine-tune for 2 epochs with batches of 20 samples; validation_split=0.1 asks
# Keras to set aside 10% of the training data for validation.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.1,
    epochs=2,
    batch_size=20
)

Epoch 1/2
Epoch 2/2


In [57]:
# Persist the model to 'bert_model.h5' in the current working directory.
# NOTE(review): reloading presumably needs custom_objects={'KerasLayer': hub.KerasLayer} — confirm.
model.save('bert_model.h5')

In [67]:
# Round the sigmoid outputs to the nearest integer to obtain 0/1 class labels.
test_pred = model.predict(test_input).round().astype(int)