# Fake News Classification Using Neural Networks (LSTM)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim

In [5]:
# from transformers import AutoTokenizer

# # Load a pre-trained Arabic-compatible tokenizer (e.g., Arabic BERT)
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# # Tokenize the text data
# def tokenize_text(text):
#     tokens = tokenizer.tokenize(text)
#     indices = tokenizer.convert_tokens_to_ids(tokens)
#     return indices


# # Pad sequences to a fixed length
# max_len = 200
# def pad_sequence(sequence):
#     if len(sequence) < max_len:
#         return sequence + [0] * (max_len - len(sequence))
#     else:
#         return sequence[:max_len]

In [15]:
import re

def preprocess(text):
    text = re.sub(r'[\u064B-\u0652]', '', text)  # Remove diacritics
    text = text.replace('أ', 'ا').replace('إ', 'ا').replace('آ', 'ا')  # Normalize alif
    return text

def tokenize(text):
    return text.split()


from collections import Counter

def build_vocab(texts):
    all_tokens = [token for text in texts for token in tokenize(preprocess(text))]
    vocab = {word: i+1 for i, (word, _) in enumerate(Counter(all_tokens).items())}
    vocab['<PAD>'] = 0
    return vocab

def text_to_sequence(text, vocab, max_len=200):
    tokens = tokenize(preprocess(text))
    sequence = [vocab.get(token, vocab['<PAD>']) for token in tokens]
    return np.array(sequence[:max_len] + [vocab['<PAD>']] * (max_len - len(sequence)))


In [29]:
df_articles = pd.read_csv('../final_datasets/articles_dataset.csv')
df_tweets = pd.read_csv('../final_datasets/tweets_dataset.csv')
df_comb = pd.read_csv('../final_datasets/combined_dataset.csv')

# Convert labels to numerical representation (0 for 'fake', 1 for 'real')
df_articles['label'] = df_articles['label'].map({'fake': 0, 'real': 1})
df_tweets['label'] = df_tweets['label'].map({False: 0, True: 1})
df_comb['label'] = df_comb['label'].map({'fake': 0, 'real': 1})

df_articles.head()

Unnamed: 0,text,label
0,الشرق الاوسط الترقب يسود الخليل مواجهات الشرطه...,1
1,اسرائيل ضم موقعين اسلاميين الضفه الغربيه المحت...,1
2,الابراهيمي الخليل وقبر راحيل القريب بيت لحم ضم...,1
3,الكبري العالم ستفهم سريعا وتعتزم اسرائيل ضم ال...,1
4,فتاه تقبل بشقه واحده وخاتم بالماسه وحيده تقدير...,0


In [102]:
df_tweets = df_tweets.sample(frac=1, random_state=42).reset_index(drop=True)
df_tweets = df_tweets.sample(frac=1, random_state=42).reset_index(drop=True)
df_tweets.head()

Unnamed: 0,text,label
0,يعرض استلام الطرود الصين الناس لى خطر الصابة ب...,1
1,صحة للصورة المتداولة حاليا لشكل فيروس كورونا ت...,1
2,انتشار كورونا يران مرعب كثر انتشاره الصين نفسه...,0
3,كانت النتيجة جثثا ملقاة جانب الطريق وتوابيت مص...,1
4,وضحت المعلومة الغرغرة خلال الما والملح الخل قا...,0


In [30]:
df_comb.head()

Unnamed: 0,text,label
0,اليمن الحربي يسلم نفسه والمصري ينفي توافد ارها...,1
1,علي المائه وكان استطلاع مماثل اجراه المركز ثلا...,1
2,افغاني حتفهم الصراع الدائر البلاد، المائه منهم...,1
3,سيتيتمون وابدي اع احترامه لحق خصمه بم خالفته ر...,0
4,تفتتح المستشفى الجديد لمواجهة تم بناه خلال 7 يام,1


In [146]:

X_a = df_articles[['text']]
y_a = df_articles['label']


X_t = df_tweets[['text']]
y_t = df_tweets['label']

X_c = df_comb[['text']]
y_c = df_comb['label']

print(X_a.shape)
print(X_t.shape)
print(X_c.shape)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_a, y_a, test_size=0.2, random_state=42)
X_dev1, X_test1, y_dev1, y_test1 = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_t, y_t, test_size=0.2, random_state=42)
X_dev2, X_test2, y_dev2, y_test2 = train_test_split(X_test2, y_test2, test_size=0.5, random_state=42)

X_train3, X_test3, y_train3, y_test3 = train_test_split(X_c, y_c, test_size=0.2, random_state=42)
X_dev3, X_test3, y_dev3, y_test3 = train_test_split(X_test3, y_test3, test_size=0.5, random_state=42)

vocab = build_vocab(X_train1['text'])
# vocab = build_vocab(X_train2['text'])
# vocab = build_vocab(X_train3['text'])

(15881, 1)
(2812, 1)
(18693, 1)


In [125]:
# LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()


    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # x = x.view(x.size(0), 1, -1)
        # lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out[:, -1, :])
        return self.sigmoid(output)


In [147]:
# Hyperparameters
# vocab_size = tokenizer.vocab_size
vocab_size = len(vocab)
embedding_dim = 64
hidden_dim = 64
output_dim = 1
learning_rate = 0.01

print("vocab size: ", vocab_size)

vocab size:  121820


In [127]:
# Training loop
def train(model, X, y, lr=0.1, epochs=5):
  model.train()
  loss_fn = nn.BCELoss()
  optimizer = optim.Adam(model.parameters(), lr=lr)

  for epoch in range(epochs):

      X_copy = X.copy()
      tokens = X_copy['text'].apply(lambda text: text_to_sequence(text, vocab))
      X_copy = torch.tensor(np.array(tokens.tolist()), dtype=torch.long)  # Convert to tensor
      y = torch.tensor(np.array(y), dtype=torch.float32)

      optimizer.zero_grad()
      outputs = model(X_copy).squeeze()
      loss = loss_fn(outputs, y)
      loss.backward()
      optimizer.step()
      print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')
      # test(X_dev2, y_dev2, model)


In [59]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_metrics(y_true, y_pred):
    """Calculate various evaluation metrics."""
    y_pred_classes = (y_pred >= 0.5).astype(np.int64)
    accuracy = accuracy_score(y_true, y_pred_classes)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred_classes, average='binary')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [138]:
# Testing loop
def test(X, y, model):
    loss_fn = nn.BCELoss()
    model.eval()

    X_copy = X.copy()
    tokens = X_copy['text'].apply(lambda text: text_to_sequence(text, vocab))
    X_copy = torch.tensor(np.array(tokens.tolist()), dtype=torch.long)  # Convert to tensor
    y = torch.tensor(np.array(y), dtype=torch.float32)

    test_loss = 0

    with torch.no_grad():
        predictions = model(X_copy).squeeze()
        predicted_labels = (predictions > 0.5).float()  # Convert probabilities to binary labels

        test_loss += loss_fn(predictions, y).item()

        # Calculate metrics
        metrics = calculate_metrics(y, np.array(predicted_labels))

        # print(f"Test Error: \n Accuracy: {(100*correct):>0.3f}%, Avg loss: {test_loss:>8f}")
        print(f"accuracy: {metrics['accuracy']}, precision: {metrics['precision']}, recall: {metrics['recall']}, f1: {metrics['f1']} \n")

In [148]:
model1 = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
epochs = 5

In [149]:
#train on articles
train(model1, X_train1, y_train1, learning_rate, epochs)

Epoch 1/5, Loss: 0.7266
Epoch 2/5, Loss: 0.6774
Epoch 3/5, Loss: 0.6514
Epoch 4/5, Loss: 0.6370
Epoch 5/5, Loss: 0.6376


In [150]:
# for model trained on articles, test on articles
test(X_test1, y_test1, model1)

accuracy: 0.6702328508495909, precision: 0.6702328508495909, recall: 1.0, f1: 0.8025621703089676 



In [151]:
# for model trained on articles, test on tweets
test(X_test2, y_test2, model1)

accuracy: 0.45390070921985815, precision: 0.45390070921985815, recall: 1.0, f1: 0.624390243902439 



In [143]:
# change vocab, rerun hyperparameters cell, 
# change dev set in train function to X_dev2, y_dev2 before running
# train on tweets
model2 = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
train(model2, X_train2, y_train2, learning_rate, 5)

Epoch 1/5, Loss: 0.6969
Epoch 2/5, Loss: 0.8448
Epoch 3/5, Loss: 0.7218
Epoch 4/5, Loss: 0.6933
Epoch 5/5, Loss: 0.6976


In [144]:
# for model trained on tweets, test on articles
test(X_test1, y_test1, model2)

accuracy: 0.6702328508495909, precision: 0.6702328508495909, recall: 1.0, f1: 0.8025621703089676 



In [145]:
# for model trained on tweets, test on tweets
test(X_test2, y_test2, model2)

accuracy: 0.45390070921985815, precision: 0.45390070921985815, recall: 1.0, f1: 0.624390243902439 



In [62]:
# change vocab, rerun hyperparameters cell, 
# change dev set in train function to X_dev3, y_dev3 before running
# train model on both tweets and articles
model3 = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
train(model3, X_train3, y_train3, learning_rate, epochs)

Epoch 1/5, Loss: 0.7238
accuracy: 0.6324237560192616, precision: 0.6324237560192616, recall: 1.0, f1: 0.7748279252704031 

Epoch 2/5, Loss: 0.6945
accuracy: 0.6324237560192616, precision: 0.6324237560192616, recall: 1.0, f1: 0.7748279252704031 

Epoch 3/5, Loss: 0.6640
accuracy: 0.6324237560192616, precision: 0.6324237560192616, recall: 1.0, f1: 0.7748279252704031 

Epoch 4/5, Loss: 0.6518
accuracy: 0.6324237560192616, precision: 0.6324237560192616, recall: 1.0, f1: 0.7748279252704031 

Epoch 5/5, Loss: 0.6555
accuracy: 0.6324237560192616, precision: 0.6324237560192616, recall: 1.0, f1: 0.7748279252704031 



In [63]:
# for model trained on both articles and tweets, test on both articles and tweets
test(X_test3, y_test3, model3)

accuracy: 0.6358288770053476, precision: 0.6358288770053476, recall: 1.0, f1: 0.7773782281791436 



trained and tested on only articles: 

accuracy: 0.6702328508495909, precision: 0.6702328508495909, recall: 1.0, f1: 0.8025621703089676


trained on articles, tested on tweets:

accuracy: 0.45390070921985815, precision: 0.45390070921985815, recall: 1.0, f1: 0.624390243902439 


trained and tested on only tweets:

accuracy: 0.6702328508495909, precision: 0.6702328508495909, recall: 1.0, f1: 0.8025621703089676 


trained on tweets, tested on articles:

accuracy: 0.45390070921985815, precision: 0.45390070921985815, recall: 1.0, f1: 0.624390243902439 


trained on combination, tested on combination:

accuracy: 0.6358288770053476, precision: 0.6358288770053476, recall: 1.0, f1: 0.7773782281791436