# Fake News Classification Using Neural Networks (LSTM)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from nltk.corpus import stopwords

# import nltk
# nltk.download('stopwords')

In [2]:
from transformers import AutoTokenizer

# Load a pre-trained Arabic-compatible tokenizer (e.g., Arabic BERT)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize the text data
def tokenize_text(text):
    tokens = tokenizer.tokenize(text)
    indices = tokenizer.convert_tokens_to_ids(tokens)
    return indices


# Pad sequences to a fixed length
max_len = 200
def pad_sequence(sequence):
    if len(sequence) < max_len:
        return sequence + [0] * (max_len - len(sequence))
    else:
        return sequence[:max_len]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



In [3]:
import re

def preprocess(text):
    text = re.sub(r'[\u064B-\u0652]', '', text)  # Remove diacritics
    text = text.replace('أ', 'ا').replace('إ', 'ا').replace('آ', 'ا')  # Normalize alif
    return text

def tokenize(text):
    return text.split()


from collections import Counter

def build_vocab(texts):
    all_tokens = [token for text in texts for token in tokenize(preprocess(text))]
    vocab = {word: i+1 for i, (word, _) in enumerate(Counter(all_tokens).items())}
    vocab['<PAD>'] = 0
    return vocab

def text_to_sequence(text, vocab, max_len=200):
    tokens = tokenize(preprocess(text))
    sequence = [vocab.get(token, vocab['<PAD>']) for token in tokens]
    return np.array(sequence[:max_len] + [vocab['<PAD>']] * (max_len - len(sequence)))


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv('compiled_real_fake_news_dataset.csv')

# remove stop words
# stop_words = set(stopwords.words('arabic'))
# df['text'] = df['text'].apply(lambda text: ' '.join([word for word in text.split() if word not in stop_words]))


# df['text_indices'] = df['text'].apply(tokenize_text) # Apply tokenization to each text
# df['padded_indices'] = df['text_indices'].apply(pad_sequence)


# # Initialize TF-IDF Vectorizer
# tfidf = TfidfVectorizer(max_features=300)  # Set max_features for dimensionality control
# # Fit and transform the text data
# tfidf_matrix = tfidf.fit_transform(df['text'])
# embeddings = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
# print(embeddings.shape)

# Convert labels to numerical representation (0 for 'fake', 1 for 'real')
df['label'] = df['label'].map({'fake': 0, 'real': 1})



df.head()

Unnamed: 0,text,label
0,الشرق الأوسط الترقب يسود في الخليل بعد مواجها...,1
1,فتاة تقبل بشقة واحدة وخاتم بألماسة وحيدة تقدير...,0
2,محافظ القليوبية آلاف جنيه لكل مدينة وقرية ...,0
3,السيسي يؤك د أن ه يعتقل منافسيه على الرئاسة لي...,0
4,الشرق الأوسط العراق مقتل واصابة شخصا في حوادث...,1


In [11]:
# Sample data (padded sequences and labels assumed)
# X = torch.tensor(df['padded_indices'], dtype=torch.long)
# # X = torch.tensor(embeddings.values, dtype=torch.float32)
# y = torch.tensor(df['label'].values, dtype=torch.float32)

X = df[['text']]
y = df['label']

print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

vocab = build_vocab(X_train['text'])
# df['padded_indices'] = X_train['text'].apply(lambda text: text_to_sequence(text, vocab))

# torch.manual_seed(42)

# # Dataset and DataLoader
# train_dataset = TensorDataset(X_train, y_train)
# dev_dataset = TensorDataset(X_dev, y_dev)
# test_dataset = TensorDataset(X_test, y_test)
# train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# dev_dataloader = DataLoader(dev_dataset, batch_size=32)
# test_dataloader = DataLoader(test_dataset, batch_size=32)

(6370, 1)


In [7]:
# LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()


    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # x = x.view(x.size(0), 1, -1)
        # lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out[:, -1, :])
        return self.sigmoid(output)


In [12]:
# Hyperparameters
# vocab_size = tokenizer.vocab_size
vocab_size = len(vocab)
embedding_dim = 64
hidden_dim = 64
output_dim = 1
learning_rate = 0.01

In [9]:
# Training loop
def train(model, X, y, lr=0.1, epochs=5):
  loss_fn = nn.BCELoss()
  optimizer = optim.Adam(model.parameters(), lr=lr)

  for epoch in range(epochs):

      X_copy = X.copy()
      tokens = X_copy['text'].apply(lambda text: text_to_sequence(text, vocab))
      X_copy = torch.tensor(np.array(tokens.tolist()), dtype=torch.long)  # Convert to tensor
      y = torch.tensor(np.array(y), dtype=torch.float32)

      optimizer.zero_grad()
      outputs = model(X_copy).squeeze()
      loss = loss_fn(outputs, y)
      loss.backward()
      optimizer.step()
      print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')
      test(X_dev, y_dev, model)


In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_metrics(y_true, y_pred):
    """Calculate various evaluation metrics."""
    y_pred_classes = (y_pred >= 0.5).astype(np.int64)
    accuracy = accuracy_score(y_true, y_pred_classes)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred_classes, average='binary')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [14]:
# Testing loop
def test(X, y, model):
    loss_fn = nn.BCELoss()
    model.eval()

    X_copy = X.copy()
    tokens = X_copy['text'].apply(lambda text: text_to_sequence(text, vocab))
    X_copy = torch.tensor(np.array(tokens.tolist()), dtype=torch.long)  # Convert to tensor
    y = torch.tensor(np.array(y), dtype=torch.float32)

    # size = len(dataloader.dataset)
    # num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # true_labels = []
    # predict_labels = []

    with torch.no_grad():
            predictions = model(X_copy).squeeze()
            predicted_labels = (predictions > 0.5).float()  # Convert probabilities to binary labels

            test_loss += loss_fn(predictions, y).item()
            # correct += (predicted_labels == y).type(torch.float).sum().item()

            # true_labels.extend(y.tolist())
            # predict_labels.extend(predicted_labels.tolist())

    # true_labels = np.array(true_labels)
    # predict_labels = np.array(predict_labels)
            # Calculate metrics
            metrics = calculate_metrics(y, np.array(predicted_labels))


    # test_loss /= num_batches
    # correct /= size
    # print(f"Test Error: \n Accuracy: {(100*correct):>0.3f}%, Avg loss: {test_loss:>8f}")
    print(f"accuracy: {metrics['accuracy']}, precision: {metrics['precision']}, recall: {metrics['recall']}, f1: {metrics['f1']} \n")

In [17]:
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
epochs = 5
train(model, X_train, y_train, learning_rate, epochs)

Epoch 1/5, Loss: 0.7011
accuracy: 0.543171114599686, precision: 0.8055555555555556, recall: 0.17313432835820897, f1: 0.28501228501228504 

Epoch 2/5, Loss: 0.6605
accuracy: 0.5965463108320251, precision: 0.7468354430379747, recall: 0.3522388059701492, f1: 0.4787018255578093 

Epoch 3/5, Loss: 0.6381
accuracy: 0.6593406593406593, precision: 0.7153284671532847, recall: 0.5850746268656717, f1: 0.6436781609195402 

Epoch 4/5, Loss: 0.6160
accuracy: 0.6923076923076923, precision: 0.6853333333333333, recall: 0.7671641791044777, f1: 0.723943661971831 

Epoch 5/5, Loss: 0.5985
accuracy: 0.7142857142857143, precision: 0.6843373493975904, recall: 0.8477611940298507, f1: 0.7573333333333333 



In [18]:
test(X_test, y_test, model)

accuracy: 0.7158555729984302, precision: 0.691415313225058, recall: 0.861271676300578, f1: 0.767052767052767 

