In [1]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/tools/tokenization.py

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import tensorflow as tf
import tensorflow_hub as hub
import torch
import tokenization
from keras.utils import to_categorical

import matplotlib.pyplot as plt
import re
import seaborn as sns

In [None]:
# Read WELFake_Cleaned.csv into a DataFrame and cast the 'text' column to str
# so that every row handed to the tokenizer is a Python string.
df = pd.read_csv('/WELFake_Cleaned.csv')
df['text'] = df['text'].astype(str)

In [3]:
# Features are the article texts; targets are the class labels.
X = df['text']
# Encode string labels as integers *before* splitting (Fake -> 1, Real -> 0) so
# that y_train / y_test are numeric. Re-encoding df['label'] after the split
# would not reach the Series already returned by train_test_split.
# Values other than 'Fake' / 'Real' (e.g. labels that are already 0/1) are
# left unchanged by replace().
y = df['label'].replace({'Fake': 1, 'Real': 0})

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

In [4]:
# Map the class names stored in df['label'] to integers (Fake -> 1, Real -> 0).
# This rewrites the DataFrame column only; Series that were taken from
# df['label'] before this statement runs keep their previous values.
label_to_id = {'Fake' : 1, 'Real' : 0}
df['label'] = df['label'].replace(label_to_id)

In [5]:
# TF-Hub handle of the uncased English BERT encoder (L-12, H-768, A-12 per the handle).
bert_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# trainable=True: the encoder weights are updated together with the classifier head.
bert = hub.KerasLayer(handle=bert_url, trainable=True)



In [6]:
# Build the tokenizer from the vocabulary file and casing flag that ship with
# the loaded hub module, so tokenization matches the encoder.
_bert_assets = bert.resolved_object
vocab = _bert_assets.vocab_file.asset_path.numpy()
lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into the three fixed-length input arrays BERT expects.

    Each text is tokenized, truncated so that it fits together with the
    "[CLS]" and "[SEP]" markers inside ``max_len`` positions, converted to
    ids, and right-padded with zeros.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` (returns a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returns a list of ids).
        max_len: Length of every encoded row, special tokens included.

    Returns:
        Tuple ``(token_ids, attention_masks, segment_ids)`` of NumPy arrays,
        one row per input text. Masks are 1 on real tokens and 0 on padding;
        segment ids are all 0.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # positions left once [CLS] and [SEP] are reserved

    for raw in texts:
        pieces = ["[CLS]", *tokenizer.tokenize(raw)[:body_budget], "[SEP]"]
        n_pad = max_len - len(pieces)

        token_rows.append(tokenizer.convert_tokens_to_ids(pieces) + [0] * n_pad)
        mask_rows.append([1] * len(pieces) + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [7]:
def build_model(bert_layer, max_len=512):
    """Build and compile a binary classifier on top of a BERT encoder layer.

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Fixed sequence length of each of the three int32 inputs.

    Returns:
        A compiled ``tf.keras`` model with a single sigmoid output unit
        (probability of label 1), trained with binary cross-entropy and
        Adam at a learning rate of 2e-5.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used below; the pooled output is discarded.
    _pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Take the encoder output at sequence position 0 as the summary vector.
    clf_output = sequence_output[:, 0, :]

    lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation='relu')(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    # A softmax over a single unit is always exactly 1.0, so the model could
    # never learn with binary cross-entropy; sigmoid yields a real probability.
    out = tf.keras.layers.Dense(1, activation='sigmoid')(lay)

    bert_model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    bert_model.compile(
        # `learning_rate` replaces the deprecated `lr` argument.
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return bert_model

In [8]:
# Every article is truncated/padded to this many token positions
# (the [CLS] and [SEP] markers count towards the limit).
max_len = 250
# Encode both splits with the same tokenizer and length; each result is the
# (token ids, masks, segment ids) tuple returned by bert_encode.
train_text, test_text = (
    bert_encode(split.values, tokenizer, max_len=max_len)
    for split in (X_train, X_test)
)

In [9]:
# Assemble the classifier around the hub encoder, using the same sequence
# length as the encoded inputs, then print the layer-by-layer summary.
bert_model = build_model(bert_layer=bert, max_len=max_len)
bert_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 250)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 250)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 250)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 250, 768)]                'input_mask[0][0]',         

  super().__init__(name, **kwargs)


In [None]:
# Write the model to model.h5 only when validation accuracy improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
# NOTE(review): patience=5 is larger than epochs=3 below, so this callback
# cannot stop training early in the current configuration.
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=1,
)

# Fine-tune for 3 epochs with 20% of the training data held out for validation.
train_sh = bert_model.fit(
    x=train_text,
    y=y_train,
    batch_size=32,
    epochs=3,
    verbose=1,
    callbacks=[checkpoint, earlystopping],
    validation_split=0.2,
)

Epoch 1/3
   5/1427 [..............................] - ETA: 28:21:45 - loss: 0.6881 - accuracy: 0.5312