# Imports

In [1]:
import preprocessor as p
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization 

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
import re

import pandas as pd
import io
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load Data

In [22]:
train_df = pd.read_csv("data/Constraint_Train.csv")
train_df = train_df.drop('id', 1)
val_df = pd.read_csv("data/Constraint_Val.csv")
val_df = val_df.drop('id', 1)

  train_df = train_df.drop('id', 1)
  val_df = val_df.drop('id', 1)
  test_df = test_df.drop('id', 1)


# Preprocess Data (Raw Text)

In [11]:
stop_words = set(stopwords.words('english'))

wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer  = PorterStemmer()

In [12]:
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess(row, lemmatizer, stemmer):
    text = row['tweet']
    text = text.strip('\xa0')
    text = p.clean(text)
    tokenization = nltk.word_tokenize(text)     
    tokenization = [w for w in tokenization if not w in stop_words]
    text = ' '.join([porter_stemmer.stem(w) for w in tokenization])
    text = ' '.join([lemmatizer.lemmatize(w) for w in tokenization])
    text = re.sub(r'\([0-9]+\)', '', text).strip()
    return text

In [23]:
train_df['tweet'] = train_df.apply(lambda x: preprocess(x, wordnet_lemmatizer, porter_stemmer), 1)
val_df['tweet'] = val_df.apply(lambda x: preprocess(x, wordnet_lemmatizer, porter_stemmer), 1)
test_df['tweet'] = test_df.apply(lambda x: preprocess(x, wordnet_lemmatizer, porter_stemmer), 1)

In [24]:
def map_label(row):
    return 1 if row['label'] == 'real' else 0

train_df['label_encoded'] = train_df.apply(lambda x: map_label(x), 1)
val_df['label_encoded'] = val_df.apply(lambda x: map_label(x), 1)

In [25]:
train_df.pop('label')
val_df.pop('label')

0       fake
1       fake
2       fake
3       fake
4       real
        ... 
2135    fake
2136    real
2137    fake
2138    fake
2139    real
Name: label, Length: 2140, dtype: object

In [27]:
def df_to_dataset(dataframe, batch_size=10):
    dataframe = dataframe.copy()
    tweets = dataframe.pop('tweet')
    labels = dataframe.pop('label_encoded')
    ds = tf.data.Dataset.from_tensor_slices((tweets, labels))
    ds = ds.batch(batch_size)
    return ds

In [28]:
train_ds = df_to_dataset(train_df)
val_ds = df_to_dataset(val_df)

In [29]:
for x,y in train_ds:
    print(x,y)

tf.Tensor(
[b'The CDC currently report 99031 death . In general discrepancy death count different source small explicable . The death toll stand roughly 100000 people today .'
 b'States reported 1121 death small rise last Tuesday . Southern state reported 640 death .'
 b'Politically Correct Woman ( Almost ) Uses Pandemic Excuse Not Reuse Plastic Bag # coronavirus # nashville'
 b'# IndiaFightsCorona : We 1524 # COVID testing laboratory India 25th August 2020 36827520 test done : @ ProfBhargava DG @ ICMRDELHI # StaySafe # IndiaWillWin'
 b'Populous state generate large case count look new case per million today 9 smaller state showing case per million California Texas : AL AR ID KS KY LA MS NV SC .'
 b"Covid Act Now found `` average person Illinois COVID-19 infecting 1.11 people . Data show infection growth rate declined time factor stay-at-home order restriction put place . ''"
 b'If tested positive # COVID19 symptom stay home away people . Learn CDC \xe2\x80\x99 recommendation around ot

tf.Tensor(
[b'Steam Bar In India Claims To Prevent COVID-19 , IRS Officer Shares Video # COVID19 # viralvideo'
 b'Our current effective capacity 6730 . This give u excess capacity 4261 . Over next week projecting 3319 arrival 1198 departure facility .'
 b'A look 1000+ drug trial treat prevent COVID-19 .'
 b'If use mask long risk developing tumor .'
 b"Our 4 pm update published . The US completed test least 179220 people 44035 yesterday 's total . Note track test state report ( ) . And state report negative test . For detail see :"
 b'Old.note view .. Now Ramzaan time .. Generally fruit like Papaya , Mosumbi , grape , Sugandhi banana transport `` Corona Green/safe zone like Udupi , Mangalore , Shimoga , chikmagalur thru KSRTC/Truck local MP/MLAs motivate .. Selling prices/kg .. Papaya 15-20 , Mosumbi .. 2'
 b'People Are Already Lining Up Outside The White House To Get Their $ 1,000 Coronavirus Checks # donaldtrump'
 b'cure .. Review @ ICMRDELHI Lab Surveillance Data indicated initial # 

# Build Classifier Model

In [34]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [36]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [38]:
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [39]:
classifier_model = build_classifier_model()
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

# Train Model

In [40]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Save Model

In [41]:
dataset_name = 'fake_news_detection'
saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))

classifier_model.save(saved_model_path, include_optimizer=False)



INFO:tensorflow:Assets written to: ./fake_news_detection_bert\assets


INFO:tensorflow:Assets written to: ./fake_news_detection_bert\assets


# Load Model

In [31]:
dataset_name = 'fake_news_detection'
saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))

reloaded_model = tf.saved_model.load(saved_model_path)

In [32]:
def print_my_examples(inputs, results):
    result_for_printing = \
        [f'input: {inputs[i]:<30} : score: {results[i][0]:.6f}'
                         for i in range(len(inputs))]
    print(*result_for_printing, sep='\n')
    print()


examples = [
    'The CDC currently report 99031 death . In general discrepancy death count different source small explicable . The death toll stand roughly 100000 people today .',
    'States reported 1121 death small rise last Tuesday . Southern state reported 640 death .',
    'Politically Correct Woman ( Almost ) Uses Pandemic Excuse Not Reuse Plastic Bag # coronavirus # nashville',
    'If tested positive # COVID19 symptom stay home away people . Learn CDC \xe2\x80\x99 recommendation around others COVID-19 infection : .'
]

reloaded_results = tf.sigmoid(reloaded_model(tf.constant(examples)))

print('Results from the saved model:')
print_my_examples(examples, reloaded_results)

Results from the saved model:
input: The CDC currently report 99031 death . In general discrepancy death count different source small explicable . The death toll stand roughly 100000 people today . : score: 0.999953
input: States reported 1121 death small rise last Tuesday . Southern state reported 640 death . : score: 0.999985
input: Politically Correct Woman ( Almost ) Uses Pandemic Excuse Not Reuse Plastic Bag # coronavirus # nashville : score: 0.000033
input: If tested positive # COVID19 symptom stay home away people . Learn CDC â recommendation around others COVID-19 infection : . : score: 0.999991



# Evaluate Model with Validation Data

In [33]:
inputs = val_df['tweet'].tolist()
actual_results = val_df['label_encoded'].tolist()

def generate_results(inputs):
    reloaded_results = np.array(tf.sigmoid(reloaded_model(tf.constant(inputs))))
    results = []
    for i in reloaded_results:
        if (i[0] >= 0.5):
            results.append(1)
        else:
            results.append(0)
    return results

results = generate_results(inputs)
results

[0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,


In [41]:
matrix = tf.math.confusion_matrix(actual_results, results)
print('Confusion matrix \n',matrix)

Confusion matrix 
 tf.Tensor(
[[ 982   38]
 [  31 1089]], shape=(2, 2), dtype=int32)


In [40]:
tp, fn, fp, tn = np.array(matrix).reshape(-1)
print('Outcome values \nTP: {0} \nFN: {1} \nFP: {2} \nTN: {3}'.format(tp, fn, fp, tn))

Outcome values 
TP: 982 
FN: 38 
FP: 31 
TN: 1089


In [39]:
def generate_metrics(tp, fn, fp, tn):
    accuracy = (tp + tn)/(tp + fn + fp + tn)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1 = (2 * precision * recall)/(precision + recall)
    print('Metrics \nAccuracy: {0:.4f} \nPrecision: {1:.4f} \nRecall: {2:.4f} \nF1: {3:.4f}'.format(accuracy, precision, recall, f1))

generate_metrics(tp, fn, fp, tn)

Metrics 
Accuracy: 0.9678 
Precision: 0.9694 
Recall: 0.9627 
F1: 0.9661
