# Fake News Detection With Transformer Models BERT

In [1]:
# importing required libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle
from transformers import XLNetConfig, XLNetModel
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import re
import pickle

In [2]:
import os
from google.colab import drive
MOUNTPOINT = '/content/gdrive'
DATADIR = os.path.join(MOUNTPOINT, 'My Drive', 'myfolder')
drive.mount(MOUNTPOINT)

Mounted at /content/gdrive


In [4]:

# Initializing a XLNet configuration

# Initializing a model (with random weights) from the configuration
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Accessing the model configuration


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
class FakeNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
       text = str(self.texts[idx])
       label = self.labels[idx]

       # tokenize the text
       encoding = self.tokenizer.encode_plus(
           text,
           add_special_tokens=True,
           max_length=self.max_length,
           return_token_type_ids=False,
           padding='max_length',
           truncation=True,
           return_tensors='pt'
       )

       # get the input ids
       input_ids = encoding['input_ids'].flatten()

       # create the attention mask
       attention_mask = torch.ones_like(input_ids)

       # if 'attention_mask' exists in encoding, use it
       if 'attention_mask' in encoding:
           attention_mask = encoding['attention_mask'].flatten()
           return {
           'input_ids': input_ids,
           'attention_mask': attention_mask,
           'label': torch.tensor(label, dtype=torch.long)
       }

In [6]:
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(train_loader)

# Example function for evaluating the model
def evaluate(model, eval_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(eval_loader), accuracy


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
from sklearn.model_selection import train_test_split

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
! pip install -q kaggle
! mkdir ~/.kaggle
! touch ~/.kaggle/kaggle.json
!echo '{"username":"akselsaatci","key":"asdasdasd"}' > ~/.kaggle/kaggle.json
! kaggle datasets download -d vishakhdapat/fake-news-detection

Downloading fake-news-detection.zip to /content
 53% 5.00M/9.37M [00:00<00:00, 47.6MB/s]
100% 9.37M/9.37M [00:00<00:00, 58.9MB/s]


In [9]:
! unzip fake-news-detection.zip

Archive:  fake-news-detection.zip
  inflating: fake_and_real_news.csv  


In [10]:
data = pd.read_csv('./fake_and_real_news.csv')
# handle duplicated values
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)  # Remove rows with missing values


data['is_fake'] = 0

data['is_fake'] = (data['label'] == 'Fake').astype(int)

def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()

data['Text'] = data['Text'].apply(preprocess)


x_train, x_test, y_train, y_test = train_test_split(data['Text'], data['is_fake'], test_size=0.3, random_state=0)

x_train.reset_index(drop=True, inplace=True)
x_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [None]:
filtered_y_test = [y_test[i] for i in range(len(y_test)) if y_test[i] == 1]
print(filtered_y_test)

filtered_y_train = [y_train[i] for i in range(len(y_train)) if y_train[i] == 1]
print(filtered_y_train)


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [11]:
BATCH_SIZE = 8
MAX_LENGTH = 32
LEARNING_RATE = 2e-5
EPOCHS = 8



In [12]:
train_dataset = FakeNewsDataset(x_train, y_train, tokenizer, MAX_LENGTH)
eval_dataset = FakeNewsDataset(x_test, y_test, tokenizer, MAX_LENGTH)

    # Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE)

    # Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

    # Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer =  torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # Training loop
for epoch in range(EPOCHS):
  train_loss = train(model, train_loader, optimizer, criterion, device)
  eval_loss, eval_accuracy = evaluate(model, eval_loader, criterion, device)
  print(f'Epoch {epoch+1}/{EPOCHS}:')
  print(f'Training Loss: {train_loss:.4f}')
  print(f'Evaluation Loss: {eval_loss:.4f} | Evaluation Accuracy: {eval_accuracy:.4f}')


with open("./gdrive/MyDrive/changed_with_eight_epocs_xlnet.pkl", "wb") as file: # file is a variable for storing the newly created file, it can be anything.
    pickle.dump(model, file) # Dump function is used to write the object into the created file in byte format.


Epoch 1/8:
Training Loss: 0.0278
Evaluation Loss: 0.0058 | Evaluation Accuracy: 0.9993
Epoch 2/8:
Training Loss: 0.0003
Evaluation Loss: 0.0067 | Evaluation Accuracy: 0.9993
Epoch 3/8:
Training Loss: 0.0001
Evaluation Loss: 0.0071 | Evaluation Accuracy: 0.9993
Epoch 4/8:
Training Loss: 0.0000
Evaluation Loss: 0.0074 | Evaluation Accuracy: 0.9993
Epoch 5/8:
Training Loss: 0.0000
Evaluation Loss: 0.0079 | Evaluation Accuracy: 0.9993
Epoch 6/8:
Training Loss: 0.0090
Evaluation Loss: 0.0062 | Evaluation Accuracy: 0.9993
Epoch 7/8:
Training Loss: 0.0016
Evaluation Loss: 0.0067 | Evaluation Accuracy: 0.9993
Epoch 8/8:
Training Loss: 0.0000
Evaluation Loss: 0.0071 | Evaluation Accuracy: 0.9993


In [13]:
with open('./gdrive/MyDrive/changed_with_eight_epocs_xlnet.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Ensure to move the model to the appropriate device if necessary
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loaded_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [18]:

# Load your XLNet model and tokenizer
model = loaded_model

input_text = "Trump Gets Tired Of ‘Hamilton’ Feud, Reignites Attacks On The New York Times Instead (TWEETS) It is not even 7 AM on the American east coast as of this writing, and Donald Trump is already up and tweeting. As per usual when we get these early morning, unhinged rants from Trump, he is on the attack   this time with an old foe, The New York Times. As we all know, Trump has long had a frosty relationship with the media, and even invited them to Trump Tower only to rant at them about how he doesn t like how they covered him. Now, he has cancelled a meeting with the Times, the news organization that he had the most criticism for. Without further ado, here is all the craziness in its full glory:I cancelled today's meeting with the failing @nytimes when the terms and conditions of the meeting were changed at the last moment. Not nice  Donald J. Trump (@realDonaldTrump) November 22, 2016Perhaps a new meeting will be set up with the @nytimes. In the meantime they continue to cover me inaccurately and with a nasty tone!  Donald J. Trump (@realDonaldTrump) November 22, 2016The failing @nytimes just announced that complaints about them are at a 15 year high. I can fully understand that   but why announce?  Donald J. Trump (@realDonaldTrump) November 22, 2016This man is literally nuts. Instead of planning his transition, he is scolding the media, picking fights with Saturday Night Live and Hamilton, and deliberately trying to undermine the free press, which is one of our most vital American institutions and is especially important when the person about to take office is an unhinged serial liar.Hopefully, the press resists the temptation to normalize Trump, and they call him out on every lie, every crazy, unAmerican idea, and every corrupt thing he does. If they don t, we can kiss our democracy goodbye, because we just elected a literal fascist. Remember, folks   sowing distrust in institutions such as the free press is the first thing that happens when fascism begins to take hold. Trump is already neck-deep in that one. I shudder to think what s next.Featured image via Scott Eisen/Getty Images "

inputs = tokenizer(input_text, return_tensors='pt', max_length=32 ,truncation=True)

inputs.to(device)

with torch.no_grad():
    outputs = model(**inputs)

predicted_label = torch.argmax(outputs.logits).item()

label_map = {1: 'fake', 0: 'real'}
predicted_class = label_map[predicted_label]

print("Predicted class:", predicted_class)

Predicted class: fake
