# Using a CNN for Fake News Detection

## Libraries needed

In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the data files available to the notebook are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/all-datasets/tweets_dataset.csv
/kaggle/input/all-datasets/combined_dataset.csv
/kaggle/input/all-datasets/articles_dataset.csv


In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re # We use regular expressions for data cleaning
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings and scikit-learn's
# undefined-metric warnings — confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')

import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,f1_score, roc_auc_score, roc_curve, # evaluation metrics
                             confusion_matrix,classification_report)
import tensorflow as tf
import transformers
from transformers import TFAutoModel, AutoTokenizer

# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8') — verify this name is accepted by the installed version.
plt.style.use('seaborn')

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
import re
from datasets import load_dataset
from camel_tools.tokenizers.word import simple_word_tokenize
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
from collections import Counter

## Dataset preprocessing and model implementation

In [23]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
from collections import Counter

# Preprocessing function
def preprocess_text(text):
    """Clean and tokenize text.

    Lower-cases the input, removes every character that is neither a word
    character nor whitespace, and splits the result on whitespace.

    Args:
        text: The raw document. ``None`` and float ``NaN`` (what pandas yields
            for a missing CSV cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        list[str]: The tokens in their original order; empty for empty or
        missing input.
    """
    # A missing value would otherwise crash on ``.lower()``. NaN is the only
    # float that compares unequal to itself, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()  # Tokenize by splitting on whitespace

# Vocabulary building
def build_vocabulary(texts, max_vocab_size=50000):
    """Map the most frequent tokens of ``texts`` to integer ids.

    Ids 0 and 1 are reserved for '<pad>' and '<unk>'. The remaining
    ``max_vocab_size - 2`` slots are handed out to tokens in descending
    frequency order. Pass training texts only, so that no test vocabulary
    leaks into the model.

    Returns:
        dict[str, int]: token -> id.
    """
    # Count every token of every document in a single pass.
    frequencies = Counter(
        token
        for document in texts
        for token in preprocess_text(document)
    )

    vocab = {'<pad>': 0, '<unk>': 1}
    for token, _count in frequencies.most_common(max_vocab_size - 2):
        # The next free id is always the current vocabulary size.
        vocab[token] = len(vocab)
    return vocab

# Dataset class
class TextDataset(Dataset):
    """Map-style dataset that turns raw texts into fixed-length id sequences.

    Each item is tokenized with ``preprocess_text`` and mapped through
    ``vocab`` (tokens missing from it become '<unk>'). The id list is then
    right-padded with '<pad>' or truncated so that it holds exactly
    ``max_length`` entries.
    """

    def __init__(self, texts, labels, vocab, max_length=200):
        self.texts, self.labels = texts, labels
        self.vocab, self.max_length = vocab, max_length

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for sample ``idx``."""
        token_ids = []
        for token in preprocess_text(self.texts[idx]):
            token_ids.append(self.vocab.get(token, self.vocab['<unk>']))

        shortfall = self.max_length - len(token_ids)
        if shortfall > 0:
            # Too short: right-pad up to max_length.
            token_ids = token_ids + [self.vocab['<pad>']] * shortfall
        else:
            # Long enough: keep only the first max_length ids.
            token_ids = token_ids[:self.max_length]

        ids_tensor = torch.tensor(token_ids)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float)
        return ids_tensor, label_tensor

# Define the TextCNN model
class TextCNN(nn.Module):
    """Convolutional text classifier with several parallel filter heights.

    Token ids are embedded, each convolution slides over the sequence with its
    own window height (spanning the full embedding width), the strongest
    activation per filter is kept, and the concatenated features go through
    dropout and a linear layer followed by a sigmoid.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # One convolution per window height; the kernel covers the whole
        # embedding dimension, so it only moves along the sequence axis.
        branches = []
        for window in filter_sizes:
            branches.append(
                nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(window, embedding_dim))
            )
        self.convs = nn.ModuleList(branches)

        self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return sigmoid outputs of shape (batch_size, output_dim) for a batch of token ids."""
        # (batch_size, 1, max_length, embedding_dim): add the single input channel.
        embedded = self.embedding(text).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # Drop the collapsed embedding axis, then max-pool over sequence positions.
            feature_map = torch.relu(conv(embedded)).squeeze(3)
            pooled.append(feature_map.max(dim=2).values)

        features = self.dropout(torch.cat(pooled, dim=1))
        return torch.sigmoid(self.fc(features))

# Training function
def train_model(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and report the mean loss plus classification metrics.

    Every batch goes through a forward pass, a backward pass and an optimizer
    step. The outputs produced during those forward passes are collected and
    scored against the labels once the epoch is over.

    Returns:
        tuple: (mean batch loss, accuracy, precision, recall, f1).
    """
    model.train()
    running_loss = 0
    scores, targets = [], []

    for batch_texts, batch_labels in train_loader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_texts).squeeze(1)
        batch_loss = criterion(outputs, batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Detach before converting: these outputs still carry gradient history.
        scores.extend(outputs.detach().cpu().numpy())
        targets.extend(batch_labels.cpu().numpy())

    accuracy, precision, recall, f1 = compute_metrics(targets, scores)
    mean_loss = running_loss / len(train_loader)
    return mean_loss, accuracy, precision, recall, f1

# Evaluation function
def evaluate_model(model, test_loader, criterion, device):
    """Score the model on a data loader without updating its weights.

    The model is switched to eval mode and gradients are disabled; losses are
    averaged over batches and all outputs are scored against the labels.

    Returns:
        tuple: (mean batch loss, accuracy, precision, recall, f1).
    """
    model.eval()
    running_loss = 0
    scores, targets = [], []

    with torch.no_grad():
        for batch_texts, batch_labels in test_loader:
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_texts).squeeze(1)
            running_loss += criterion(outputs, batch_labels).item()

            scores.extend(outputs.cpu().numpy())
            targets.extend(batch_labels.cpu().numpy())

    accuracy, precision, recall, f1 = compute_metrics(targets, scores)
    mean_loss = running_loss / len(test_loader)
    return mean_loss, accuracy, precision, recall, f1

# Compute evaluation metrics
def compute_metrics(y_true, y_pred):
    """Binarize predicted scores at 0.5 and score them against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores; values >= 0.5 count as class 1.

    Returns:
        tuple: (accuracy, precision, recall, f1) from the scikit-learn scorers.
    """
    hard_labels = (np.array(y_pred) >= 0.5).astype(int)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(score(y_true, hard_labels) for score in scorers)

# Experiment runner
def run_experiment(train_texts, train_labels, test_texts, test_labels, max_length=200, epochs=10,
                   batch_size=64, seed=None):
    """Train a TextCNN on one text collection and report metrics on another.

    The vocabulary is built from ``train_texts`` only. The model is trained
    for ``epochs`` epochs, printing training metrics after each one, and is
    then evaluated once on the test data.

    Args:
        train_texts: Training documents.
        train_labels: Binary labels (0/1) aligned with ``train_texts``.
        test_texts: Evaluation documents.
        test_labels: Binary labels (0/1) aligned with ``test_texts``.
        max_length: Number of token ids each document is padded/truncated to.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for both loaders.
        seed: Optional integer passed to ``torch.manual_seed`` before any
            randomness is used. ``None`` (the default) leaves the global
            random state untouched, as before.

    Returns:
        None. Results are printed.
    """
    if seed is not None:
        # Weight initialisation, dropout masks and DataLoader shuffling all
        # draw from torch's global generator, so seeding here fixes all three.
        # NOTE(review): some GPU kernels may still be non-deterministic.
        torch.manual_seed(seed)

    vocab = build_vocabulary(train_texts)
    train_dataset = TextDataset(train_texts, train_labels, vocab, max_length=max_length)
    test_dataset = TextDataset(test_texts, test_labels, vocab, max_length=max_length)

    # Only the training data is shuffled; evaluation order does not matter.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TextCNN(
        vocab_size=len(vocab),
        embedding_dim=100,
        n_filters=100,
        filter_sizes=[3, 4, 5],
        output_dim=1,
        dropout=0.5,
        pad_idx=vocab['<pad>']
    ).to(device)

    optimizer = optim.Adam(model.parameters())
    # The model already applies a sigmoid, so plain BCE is the matching loss.
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        train_loss, train_acc, train_prec, train_rec, train_f1 = train_model(model, train_loader, optimizer, criterion, device)
        print(f'Epoch {epoch + 1} | Train Loss: {train_loss:.3f} | Acc: {train_acc:.3f} | Prec: {train_prec:.3f} | Rec: {train_rec:.3f} | F1: {train_f1:.3f}')

    test_loss, test_acc, test_prec, test_rec, test_f1 = evaluate_model(model, test_loader, criterion, device)
    print(f'\nTest Results | Loss: {test_loss:.3f} | Acc: {test_acc:.3f} | Prec: {test_prec:.3f} | Rec: {test_rec:.3f} | F1: {test_f1:.3f}')



## Running the experiments

In [25]:
# Load datasets
# Read the three CSV files from the Kaggle input directory.
df_articles = pd.read_csv('/kaggle/input/all-datasets/articles_dataset.csv')
df_tweets = pd.read_csv('/kaggle/input/all-datasets/tweets_dataset.csv')
df_combined = pd.read_csv('/kaggle/input/all-datasets/combined_dataset.csv')

# Extract the raw text and encode the label as 1 = 'real', 0 = 'fake'.
# Series.map yields NaN for any label value that is not a key of the dict.
articles_texts = df_articles['text'].tolist()
articles_labels = df_articles['label'].map({'real': 1, 'fake': 0}).tolist()

# The tweets mapping is keyed on booleans instead of 'real'/'fake' strings.
# NOTE(review): presumably True marks a real tweet — confirm against the dataset.
tweets_texts = df_tweets['text'].tolist()
tweets_labels = df_tweets['label'].map({True: 1, False: 0}).tolist()

# The combined file uses the same string labels as the articles file.
combined_texts = df_combined['text'].tolist()
combined_labels = df_combined['label'].map({'real': 1, 'fake': 0}).tolist()

In [26]:
# Run experiments
# Cross-domain run: the vocabulary and model are fitted on all tweets and
# evaluated on all articles.
print("Experiment 1: Train on Tweets, Test on Articles")
run_experiment(tweets_texts, tweets_labels, articles_texts, articles_labels)

Experiment 1: Train on Tweets, Test on Articles
Epoch 1 | Train Loss: 0.629 | Acc: 0.631 | Prec: 0.633 | Rec: 0.628 | F1: 0.630
Epoch 2 | Train Loss: 0.404 | Acc: 0.832 | Prec: 0.820 | Rec: 0.852 | F1: 0.835
Epoch 3 | Train Loss: 0.278 | Acc: 0.906 | Prec: 0.912 | Rec: 0.901 | F1: 0.906
Epoch 4 | Train Loss: 0.196 | Acc: 0.941 | Prec: 0.942 | Rec: 0.939 | F1: 0.941
Epoch 5 | Train Loss: 0.132 | Acc: 0.963 | Prec: 0.964 | Rec: 0.962 | F1: 0.963
Epoch 6 | Train Loss: 0.099 | Acc: 0.973 | Prec: 0.974 | Rec: 0.972 | F1: 0.973
Epoch 7 | Train Loss: 0.067 | Acc: 0.987 | Prec: 0.986 | Rec: 0.988 | F1: 0.987
Epoch 8 | Train Loss: 0.054 | Acc: 0.988 | Prec: 0.988 | Rec: 0.988 | F1: 0.988
Epoch 9 | Train Loss: 0.041 | Acc: 0.993 | Prec: 0.993 | Rec: 0.994 | F1: 0.993
Epoch 10 | Train Loss: 0.039 | Acc: 0.989 | Prec: 0.991 | Rec: 0.987 | F1: 0.989

Test Results | Loss: 1.308 | Acc: 0.665 | Prec: 0.666 | Rec: 0.996 | F1: 0.798


In [27]:
# Cross-domain run in the opposite direction: fit on all articles, evaluate on
# all tweets.
print("\nExperiment 2: Train on Articles, Test on Tweets")
run_experiment(articles_texts, articles_labels, tweets_texts, tweets_labels)


Experiment 2: Train on Articles, Test on Tweets
Epoch 1 | Train Loss: 0.367 | Acc: 0.826 | Prec: 0.853 | Rec: 0.893 | F1: 0.872
Epoch 2 | Train Loss: 0.174 | Acc: 0.930 | Prec: 0.948 | Rec: 0.947 | F1: 0.948
Epoch 3 | Train Loss: 0.082 | Acc: 0.971 | Prec: 0.978 | Rec: 0.978 | F1: 0.978
Epoch 4 | Train Loss: 0.041 | Acc: 0.985 | Prec: 0.989 | Rec: 0.988 | F1: 0.989
Epoch 5 | Train Loss: 0.022 | Acc: 0.994 | Prec: 0.996 | Rec: 0.995 | F1: 0.995
Epoch 6 | Train Loss: 0.016 | Acc: 0.995 | Prec: 0.996 | Rec: 0.996 | F1: 0.996
Epoch 7 | Train Loss: 0.010 | Acc: 0.997 | Prec: 0.998 | Rec: 0.998 | F1: 0.998
Epoch 8 | Train Loss: 0.007 | Acc: 0.998 | Prec: 0.998 | Rec: 0.998 | F1: 0.998
Epoch 9 | Train Loss: 0.004 | Acc: 0.999 | Prec: 0.999 | Rec: 0.999 | F1: 0.999
Epoch 10 | Train Loss: 0.004 | Acc: 0.999 | Prec: 0.999 | Rec: 0.999 | F1: 0.999

Test Results | Loss: 4.119 | Acc: 0.462 | Prec: 0.423 | Rec: 0.199 | F1: 0.271


In [28]:
# In-domain run on articles: hold out 20% of the articles for testing, with a
# fixed random_state so the split itself is repeatable.
print("\nExperiment 3: Train and Test on Articles")
train_texts, test_texts, train_labels, test_labels = train_test_split(articles_texts, articles_labels, test_size=0.2, random_state=42)
run_experiment(train_texts, train_labels, test_texts, test_labels)


Experiment 3: Train and Test on Articles
Epoch 1 | Train Loss: 0.406 | Acc: 0.801 | Prec: 0.832 | Rec: 0.880 | F1: 0.855
Epoch 2 | Train Loss: 0.210 | Acc: 0.913 | Prec: 0.931 | Rec: 0.940 | F1: 0.935
Epoch 3 | Train Loss: 0.122 | Acc: 0.952 | Prec: 0.965 | Rec: 0.963 | F1: 0.964
Epoch 4 | Train Loss: 0.066 | Acc: 0.975 | Prec: 0.982 | Rec: 0.981 | F1: 0.982
Epoch 5 | Train Loss: 0.041 | Acc: 0.985 | Prec: 0.989 | Rec: 0.989 | F1: 0.989
Epoch 6 | Train Loss: 0.022 | Acc: 0.994 | Prec: 0.995 | Rec: 0.995 | F1: 0.995
Epoch 7 | Train Loss: 0.015 | Acc: 0.996 | Prec: 0.997 | Rec: 0.997 | F1: 0.997
Epoch 8 | Train Loss: 0.012 | Acc: 0.997 | Prec: 0.997 | Rec: 0.998 | F1: 0.998
Epoch 9 | Train Loss: 0.008 | Acc: 0.998 | Prec: 0.998 | Rec: 0.998 | F1: 0.998
Epoch 10 | Train Loss: 0.005 | Acc: 0.999 | Prec: 0.999 | Rec: 0.999 | F1: 0.999

Test Results | Loss: 0.044 | Acc: 0.986 | Prec: 0.996 | Rec: 0.983 | F1: 0.989


In [29]:
# In-domain run on tweets: hold out 20% of the tweets for testing, with a fixed
# random_state so the split itself is repeatable.
print("\nExperiment 4: Train and Test on Tweets")
train_texts, test_texts, train_labels, test_labels = train_test_split(tweets_texts, tweets_labels, test_size=0.2, random_state=42)
run_experiment(train_texts, train_labels, test_texts, test_labels)


Experiment 4: Train and Test on Tweets
Epoch 1 | Train Loss: 0.646 | Acc: 0.604 | Prec: 0.603 | Rec: 0.623 | F1: 0.613
Epoch 2 | Train Loss: 0.430 | Acc: 0.820 | Prec: 0.813 | Rec: 0.836 | F1: 0.824
Epoch 3 | Train Loss: 0.307 | Acc: 0.898 | Prec: 0.893 | Rec: 0.905 | F1: 0.899
Epoch 4 | Train Loss: 0.220 | Acc: 0.935 | Prec: 0.935 | Rec: 0.935 | F1: 0.935
Epoch 5 | Train Loss: 0.164 | Acc: 0.954 | Prec: 0.956 | Rec: 0.952 | F1: 0.954
Epoch 6 | Train Loss: 0.121 | Acc: 0.974 | Prec: 0.974 | Rec: 0.973 | F1: 0.974
Epoch 7 | Train Loss: 0.080 | Acc: 0.983 | Prec: 0.981 | Rec: 0.986 | F1: 0.983
Epoch 8 | Train Loss: 0.065 | Acc: 0.987 | Prec: 0.987 | Rec: 0.988 | F1: 0.987
Epoch 9 | Train Loss: 0.047 | Acc: 0.994 | Prec: 0.996 | Rec: 0.992 | F1: 0.994
Epoch 10 | Train Loss: 0.036 | Acc: 0.995 | Prec: 0.996 | Rec: 0.995 | F1: 0.995

Test Results | Loss: 0.723 | Acc: 0.787 | Prec: 0.959 | Rec: 0.594 | F1: 0.733


In [30]:
# In-domain run on the combined dataset: hold out 20% for testing, with a fixed
# random_state so the split itself is repeatable.
print("\nExperiment 5: Train and Test on Combined")
train_texts, test_texts, train_labels, test_labels = train_test_split(combined_texts, combined_labels, test_size=0.2, random_state=42)
run_experiment(train_texts, train_labels, test_texts, test_labels)


Experiment 5: Train and Test on Combined
Epoch 1 | Train Loss: 0.462 | Acc: 0.772 | Prec: 0.801 | Rec: 0.858 | F1: 0.829
Epoch 2 | Train Loss: 0.257 | Acc: 0.891 | Prec: 0.914 | Rec: 0.917 | F1: 0.915
Epoch 3 | Train Loss: 0.150 | Acc: 0.941 | Prec: 0.954 | Rec: 0.954 | F1: 0.954
Epoch 4 | Train Loss: 0.084 | Acc: 0.971 | Prec: 0.976 | Rec: 0.978 | F1: 0.977
Epoch 5 | Train Loss: 0.051 | Acc: 0.983 | Prec: 0.987 | Rec: 0.986 | F1: 0.987
Epoch 6 | Train Loss: 0.032 | Acc: 0.989 | Prec: 0.992 | Rec: 0.992 | F1: 0.992
Epoch 7 | Train Loss: 0.020 | Acc: 0.994 | Prec: 0.994 | Rec: 0.996 | F1: 0.995
Epoch 8 | Train Loss: 0.015 | Acc: 0.995 | Prec: 0.996 | Rec: 0.996 | F1: 0.996
Epoch 9 | Train Loss: 0.012 | Acc: 0.996 | Prec: 0.997 | Rec: 0.997 | F1: 0.997
Epoch 10 | Train Loss: 0.008 | Acc: 0.997 | Prec: 0.997 | Rec: 0.998 | F1: 0.998

Test Results | Loss: 0.135 | Acc: 0.965 | Prec: 0.971 | Rec: 0.973 | F1: 0.972
