In [20]:
# following this tutorial: https://medium.com/@skillcate/detecting-fake-news-with-a-bert-model-9c666e3cdd9b
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa
import tqdm
from keras.utils.np_utils import to_categorical
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
# Load the training set. Column names are supplied explicitly, so pandas
# treats every row of the file (including the first) as data.
df = pd.read_csv(
    "raw_data/fulltrain.csv",
    header=None,
    names=["labels", "text"],
)
df.head()

Unnamed: 0,labels,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [22]:
from sklearn.model_selection import train_test_split
# One-hot encode the labels: one column per distinct label value, in sorted
# order. The dtype is pinned to uint8 (what this notebook's output shows)
# because newer pandas releases default to bool columns instead.
y = pd.get_dummies(df["labels"], dtype=np.uint8).to_numpy()

# Fixed seed so the train/test split -- and therefore every metric reported
# further down -- is the same on each Restart & Run All.
SPLIT_SEED = 42

# Stratify on the raw labels so both splits keep the class proportions of the
# full data set.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    y,
    stratify=df["labels"],
    random_state=SPLIT_SEED,
)
X_train.head(4)

24820                How to Be a World Class CrookYoutube 
857      In a span of two minutes Monday, 33-year-old S...
46956    Best Sweet, a maker of candy and chewy over-th...
41836    The protesters were arrested at 4:40 a.m. PST ...
Name: text, dtype: object

In [23]:
# Display the one-hot training labels produced by the split above
# (one row per sample, one column per label value).
y_train

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       ...,
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0]], dtype=uint8)

In [24]:
import tensorflow_hub as hub
import tensorflow_text as text

In [25]:
# Pretrained BERT from TF Hub: the text preprocessing layer and the uncased
# L-12 / H-768 / A-12 encoder it feeds.
bert_preprocess = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
bert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

# Raw strings enter the graph here and are preprocessed before being encoded.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on the encoder's 'pooled_output': dropout followed by a
# 4-unit softmax. `l` is the tensor the model-construction cell consumes.
pooled_embedding = outputs['pooled_output']
head_dropout = tf.keras.layers.Dropout(rate=0.1, name="dropout")
head_softmax = tf.keras.layers.Dense(units=4, activation='softmax', name="output")
l = head_softmax(head_dropout(pooled_embedding))


2023-03-30 13:44:11.030522: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [26]:
# Assemble the functional model from the string input and the softmax head.
model = tf.keras.Model(inputs=[text_input], outputs=[l])

# Metric order matters: the evaluation cells index the list returned by
# model.evaluate() positionally (loss first, then these six in order).
tracked_metrics = [
    "categorical_accuracy",
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
    tf.keras.metrics.AUC(),
    tfa.metrics.F1Score(num_classes=4, average="micro", name="f1_score_micro"),
    tfa.metrics.F1Score(num_classes=4, average="macro", name="f1_score_macro"),
]

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=tracked_metrics,
)


In [29]:
# Train for two epochs in batches of 32, with the device pinned to the CPU.
# NOTE(review): the reason for forcing '/cpu:0' is not recorded in this
# notebook -- confirm before removing the device scope.
with tf.device('/cpu:0'):
    model.fit(x=X_train, y=y_train, batch_size=32, epochs=2)

Epoch 1/2
Epoch 2/2


In [30]:
# Persist the trained model under the 'bert_model' directory so it can be
# reused without repeating the training run.
model.save('bert_model')

2023-03-30 20:09:09.273221: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-03-30 20:09:21.377468: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: bert_model/assets


INFO:tensorflow:Assets written to: bert_model/assets


In [33]:
# Score the model on the held-out split produced by train_test_split.
with tf.device('/cpu:0'):
    results = model.evaluate(x=X_test, y=y_test, batch_size=32)

    # results[0] is the loss; the remaining entries follow the order of the
    # metrics list passed to compile().
    captions = (
        'Test loss:',
        'Test categorical_accuracy:',
        'Test precision:',
        'Test recall:',
        'Test auc:',
        'Test F1 (micro):',
        'Test F1 (macro):',
    )
    for position, caption in enumerate(captions):
        print(caption, results[position])

2023-03-30 23:48:00.616345: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Test loss: 0.5037228465080261
Test categorical_accuracy: 0.8215162754058838
Test precision: 0.8732697367668152
Test recall: 0.7644506096839905
Test auc: 0.9613867998123169
Test F1 (micro): 0.8215162754058838
Test F1 (macro): 0.8032023906707764


In [34]:
def draw_confusion_matrix(true, preds):
    """Render a confusion-matrix heatmap for a set of predictions.

    Args:
        true: Ground-truth class ids, one per sample.
        preds: Predicted class ids, aligned element-wise with ``true``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    conf_matx = confusion_matrix(true, preds)

    # Draw on an explicit Axes rather than whatever figure pyplot currently
    # considers active.
    _, ax = plt.subplots()
    sns.heatmap(
        conf_matx,
        annot=True,
        annot_kws={"size": 12},
        fmt='g',
        cbar=False,
        cmap="viridis",
        ax=ax,
    )
    # confusion_matrix puts true classes on rows and predicted classes on
    # columns; label the axes so the plot can be read without the code.
    ax.set(
        xlabel="Predicted label",
        ylabel="True label",
        title="Confusion matrix",
    )
    plt.show()

In [37]:
# test on balanced test.csv
test_df = pd.read_csv("raw_data/balancedtest.csv", names=["labels", "text"])

# split into x_test and y_test
# NOTE: `y_test` is rebound here and no longer refers to the held-out labels
# from train_test_split; re-running the earlier evaluation cell after this
# one would pair X_test with the wrong labels.
x_test = test_df['text']

# One-hot encode against the label set seen in training, not against whatever
# labels happen to occur in this file. That keeps the number and order of
# columns identical to the training targets even if a class is absent here.
train_classes = np.sort(df["labels"].unique())
unseen_labels = set(test_df["labels"]) - set(train_classes)
assert not unseen_labels, (
    f"balanced test set has labels never seen in training: {sorted(unseen_labels)}"
)

# dtype is pinned because the get_dummies default differs across pandas versions.
y_test = (
    pd.get_dummies(test_df["labels"], dtype=np.uint8)
    .reindex(columns=train_classes, fill_value=0)
    .to_numpy()
)



In [None]:
with tf.device('/cpu:0'):
    # Class probabilities for the balanced test set, reduced to the index of
    # the most likely class per sample.
    print("START predict")
    test_predictions = model.predict(x_test)
    test_result = test_predictions.argmax(axis=1)
    print("DONE predict")

    # Loss and compiled metrics on the same data; each caption is paired with
    # its position in the list returned by evaluate().
    print("START evaluate")
    test_results = model.evaluate(x=x_test, y=y_test)
    report_rows = [
        ('Test loss:', 0),
        ('Test categorical_accuracy:', 1),
        ('Test precision:', 2),
        ('Test recall:', 3),
        ('Test auc:', 4),
        ('Test F1 (micro):', 5),
        ('Test F1 (macro):', 6),
    ]
    for caption, slot in report_rows:
        print(caption, test_results[slot])

# Convert the one-hot targets back to class indices so they are comparable
# with the argmax predictions.
true_classes = y_test.argmax(axis=1)
draw_confusion_matrix(true_classes, test_result)