In [1]:
import pandas as pd
import tensorflow as tf

In [2]:
# Load the WELFake CSV. The path is relative, so the file must sit in the
# notebook's current working directory.
df = pd.read_csv("WELFake_Dataset.csv")

In [3]:
# Display the loaded frame (pandas truncates the rendering to the first and last rows).
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [4]:
# Features: the two free-text columns. Target: the `label` column
# (0/1 values in the preview above).
X = df[["title", "text"]]
y = df["label"]

In [23]:
# Inspect the first sample to see what a single title/text pair looks like.
X.iloc[0]

title    LAW ENFORCEMENT ON HIGH ALERT Following Threat...
text     No comment is expected from Barack Obama Membe...
Name: 0, dtype: object

In [5]:
# Sanity check: features and labels must have the same number of rows.
X.shape, y.shape

((72134, 2), (72134,))

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every metric computed from it)
# is reproducible after a kernel restart; stratify=y keeps the class ratio
# approximately the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
# Mean number of whitespace-separated words per article, rounded to an integer.
# str() coerces any non-string cell before splitting.
article_texts = X["text"].values.tolist()
round(sum(len(str(article).split()) for article in article_texts) / len(article_texts))

541

In [9]:
from tensorflow.keras import layers

In [10]:
# Vocabulary cap for the vectorizer; the Embedding layer's input_dim reuses this value.
max_vocab_length = 10000
# Every text is padded or truncated to this many tokens
# (541 is the mean word count per article computed in the earlier cell).
max_length = 541

# Maps raw strings to fixed-length sequences of integer token ids.
text_vector_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
)

In [11]:
# Build the vocabulary from the training texts only, so that no test-set
# tokens influence it.
text_vector_layer.adapt(X_train["text"].values.tolist())

In [13]:
# Trainable lookup table turning each token id into a 128-dimensional vector.
# input_dim matches the vectorizer's max_tokens so every id it emits has a row.
# NOTE(review): `input_length` is reported as deprecated in Keras 3 — confirm
# against the installed Keras version before upgrading.
embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    input_length=max_length,
    output_dim=128,
)

In [15]:
import random

In [19]:
# Pick one training title at random for the embedding demo in the next cell.
# Empty titles are read by pandas as float NaN (the dataset preview contains a
# row with no title); NaN is not a string and cannot be fed to the text
# vectorizer, so missing titles are dropped before sampling.
random_sentence = random.choice(X_train["title"].dropna().tolist())

In [20]:
# Show the raw sentence, then its embedded representation.
# The f-string is kept on one line: a backslash continuation inside the literal
# would embed the next line's indentation into the printed text.
print(f"Original text:\n{random_sentence}\n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation):
# the vectorizer produces a (1, max_length) batch of token ids, and the
# embedding expands it to (1, max_length, 128).
sample_embed = embedding(text_vector_layer([random_sentence]))
sample_embed

Original text:
Trump considers plan to replace Tillerson with CIA chief: U.S. officials      

Embedded version:


<tf.Tensor: shape=(1, 541, 128), dtype=float32, numpy=
array([[[ 0.02090834,  0.04200025, -0.04740032, ..., -0.00601567,
         -0.02563946,  0.02437614],
        [ 0.00599481, -0.00403114, -0.00295774, ...,  0.00190882,
          0.00103259, -0.04907707],
        [ 0.00526872, -0.04918175, -0.04387138, ..., -0.04516617,
          0.04016418,  0.02084937],
        ...,
        [-0.04712648,  0.04809398,  0.04219261, ...,  0.02478163,
         -0.02963173, -0.00751286],
        [-0.04712648,  0.04809398,  0.04219261, ...,  0.02478163,
         -0.02963173, -0.00751286],
        [-0.04712648,  0.04809398,  0.04219261, ...,  0.02478163,
         -0.02963173, -0.00751286]]], dtype=float32)>

In [None]:
# Alias for tf.data's AUTOTUNE constant.
# NOTE(review): no visible cell references it — presumably intended for a
# tf.data input pipeline (e.g. prefetch); confirm or remove.
AUTOTUNE = tf.data.AUTOTUNE

In [24]:
# End-to-end classifier: raw string -> token ids -> embeddings -> LSTM -> probability.
inputs = layers.Input(shape=(1,), dtype=tf.string)  # one raw string per example
x = text_vector_layer(inputs)  # (batch, max_length) integer token ids
x = embedding(x)  # (batch, max_length, 128) dense vectors
x = layers.LSTM(units=64)(x)  # only the final state is returned: (batch, 64)
outputs = layers.Dense(units=1, activation="sigmoid")(x)  # single probability per example

model = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_LSTM")

In [25]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "model_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 541)               0         
 ctorization)                                                    
                                                                 
 embedding_1 (Embedding)     (None, 541, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1329473 (5.07 MB)
Trainable params: 1329473 (5.07 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [31]:
# Configure training: focal variant of binary cross-entropy (default settings),
# Adam with its default hyperparameters, and accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryFocalCrossentropy(),
    metrics=["accuracy"],
)

# Train on the raw article texts for 5 epochs; the vectorizer inside the model
# handles tokenisation. The returned History object is the cell's output.
model.fit(
    x=X_train["text"].values.tolist(),
    y=y_train.values.tolist(),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1d7618c32b0>

In [32]:
# Score the held-out test texts. The result is [loss, accuracy], matching the
# loss and metric passed to compile().
model.evaluate(X_test["text"].values.tolist(), y_test.values.tolist())



[0.02650206722319126, 0.9807999134063721]

In [35]:
# Sigmoid outputs for every test text: one probability per example.
model_pred_prob = model.predict(X_test["text"].values.tolist())



In [36]:
# Round the probabilities to hard 0/1 labels, then squeeze out the size-1
# output axis so the predictions line up with the 1D label vector.
model_pred = tf.squeeze(tf.round(model_pred_prob))

In [34]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred, average="weighted"):
    """
    Calculates accuracy, precision, recall and f1 score of a classification model.

    Args:
    -----
    y_true = true labels in the form of a 1D array
    y_pred = predicted labels in the form of a 1D array
    average = averaging strategy forwarded to scikit-learn's
        precision_recall_fscore_support; defaults to "weighted"

    Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
    Mind the differing scales: accuracy is a percentage (0-100), whereas
    precision, recall and f1 are fractions (0-1).
    """
    # Accuracy is reported as a percentage, hence the * 100.
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # The fourth value returned by scikit-learn (support) is not needed here.
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average=average
    )
    model_results = {"accuracy": model_accuracy,
                     "precision": model_precision,
                     "recall": model_recall,
                     "f1": model_f1}
    return model_results

In [38]:
# Compare the thresholded test predictions with the held-out labels.
model_result = calculate_results(
    y_true=y_test.values.tolist(),
    y_pred=model_pred,
)
model_result

{'accuracy': 98.07998890968324,
 'precision': 0.9808612237629737,
 'recall': 0.9807998890968324,
 'f1': 0.9807964673023798}