# Fake News Detection Model using TensorFlow in Python

Fake news is a type of misinformation that can mislead readers, influence public opinion, and even damage reputations. Detecting fake news prevents its spread and protects individuals and organizations. Media outlets often use these models to help filter and verify content, ensuring that the news shared with the public is accurate.

In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load data for classification
data = pd.read_csv("./data/news.csv")
data = data.drop(["Unnamed: 0"], axis=1)
print(data.shape)
data.head()

(6335, 3)


Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
# Encode labels to 0 and 1
le = preprocessing.LabelEncoder()
le.fit(data['label'])
data['label'] = le.transform(data['label'])
data.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [51]:
data["label"].value_counts()

label
1    3171
0    3164
Name: count, dtype: int64

In [None]:
embedding_dim = 50
max_length = 54
padding_type = 'post'
trunc_type = 'post'
oov_tok = "<OOV>"
#training_size = 3000
training_size = data.shape[0]
test_portion = 0.1

title = []      # list to store all titles
text = []       # list to store all text bodies
labels = []     # list to store all labels

# loop through first training_size samples and extract fields
for x in range(training_size):
    title.append(data['title'][x])    # add title at index x
    text.append(data['text'][x])      # add text at index x
    labels.append(data['label'][x])   # add label at index x

tokenizer1 = Tokenizer()  # create a tokenizer for text preprocessing
tokenizer1.fit_on_texts(title)  # build word index based on all titles

word_index1 = tokenizer1.word_index   # dictionary: word -> integer index
vocab_size1 = len(word_index1)        # number of unique words in titles

sequences1 = tokenizer1.texts_to_sequences(title)   # convert each title into list of integers (token IDs)

# pad/truncate sequences so they all have same length
padded1 = pad_sequences(sequences1, padding=padding_type, truncating=trunc_type)  


In [41]:
print(vocab_size1)

11721


In [None]:
# splitting Data for Training and Testing
split = int(test_portion * training_size)
training_sequences1 = padded1[split:training_size]
training_labels = labels[split:training_size]
test_sequences1 = padded1[0:split]
test_labels = labels[0:split]

In [None]:
# pre-trained Glove embedding matrix for the model 
embedding_index = {}
with open(r"C:\Users\meisa\Downloads\Compressed\glove.6B\glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:                     # Each line: word val1 val2 val3 ... val50
        values = line.split()          # split into word + numbers
        word = values[0]               # first entry is the word
        coefs = np.asarray(values[1:], dtype='float32')  # rest are floats
        embedding_index[word] = coefs  # save in dictionary
        
embedding_matrix = np.zeros((vocab_size1 + 1, embedding_dim))

for word, i in word_index1.items():   # iterate over all words in your tokenizer
    if i < vocab_size1:
        embedding_vector = embedding_index.get(word)  # get GloVe vector if exists
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector    # place it in the right row

In [40]:
embedding_matrix.shape

(11722, 50)

In [None]:
# sequential model with LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size1 + 1, embedding_dim, input_length=max_length, 
                              weights=[embedding_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [38]:
history = model.fit(
    training_sequences1, 
    np.array(training_labels), 
    epochs=50, 
    validation_data=(test_sequences1, np.array(test_labels)), 
    verbose=2
)

Epoch 1/50
179/179 - 2s - 10ms/step - accuracy: 0.6564 - loss: 0.6198 - val_accuracy: 0.7362 - val_loss: 0.5404
Epoch 2/50
179/179 - 1s - 3ms/step - accuracy: 0.7113 - loss: 0.5529 - val_accuracy: 0.7504 - val_loss: 0.5107
Epoch 3/50
179/179 - 1s - 3ms/step - accuracy: 0.7527 - loss: 0.5006 - val_accuracy: 0.7630 - val_loss: 0.4980
Epoch 4/50
179/179 - 1s - 3ms/step - accuracy: 0.7781 - loss: 0.4599 - val_accuracy: 0.7741 - val_loss: 0.4833
Epoch 5/50
179/179 - 1s - 3ms/step - accuracy: 0.8041 - loss: 0.4213 - val_accuracy: 0.7709 - val_loss: 0.4629
Epoch 6/50
179/179 - 1s - 3ms/step - accuracy: 0.8250 - loss: 0.3938 - val_accuracy: 0.7551 - val_loss: 0.4846
Epoch 7/50
179/179 - 1s - 3ms/step - accuracy: 0.8423 - loss: 0.3569 - val_accuracy: 0.7773 - val_loss: 0.4982
Epoch 8/50
179/179 - 1s - 3ms/step - accuracy: 0.8464 - loss: 0.3448 - val_accuracy: 0.7788 - val_loss: 0.4736
Epoch 9/50
179/179 - 1s - 3ms/step - accuracy: 0.8571 - loss: 0.3290 - val_accuracy: 0.7804 - val_loss: 0.4464


In [45]:
print(data["text"][2])
print(data["label"][2])

U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.

Kerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the next day with Foreign Minister Laurent Fabius and President Francois Hollande, then return to Washington.

The visit by Kerry, who has family and childhood ties to the country and speaks fluent French, could address some of the criticism that the United States snubbed France in its darkest hour in many years.

The French press on Monday was filled with questions about why neither President Obama nor Kerry attended Sunday’s march, as about 40 leaders of other nations did. Obama was said to have stayed away because his own security needs can be taxing on a country, and Ker

In [52]:
# test the model
#X = "Karry to go to France in gesture of sympathy"
X = data['text'][1]

sequences = tokenizer1.texts_to_sequences([X])
sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
if model.predict(sequences, verbose=0)[0][0] >= 0.5:
    print("This news is True")
else:
    print("This news is False")

This news is False
