# Fake News Detection With Transformer Models XLNET

In [48]:
# importing required libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle
from transformers import XLNetConfig, XLNetModel
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
import re
import pickle

In [30]:
import os
from google.colab import drive
MOUNTPOINT = '/content/gdrive'
DATADIR = os.path.join(MOUNTPOINT, 'My Drive', 'myfolder')
drive.mount(MOUNTPOINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [49]:

# Initializing a XLNet configuration
configuration = XLNetConfig()

# Initializing a model (with random weights) from the configuration
model = XLNetModel(configuration)

# Accessing the model configuration
configuration = model.config

In [50]:
class FakeNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
       text = str(self.texts[idx])
       label = self.labels[idx]

       # tokenize the text
       encoding = self.tokenizer.encode_plus(
           text,
           add_special_tokens=True,
           max_length=self.max_length,
           return_token_type_ids=False,
           padding='max_length',
           truncation=True,
           return_tensors='pt'
       )

       # get the input ids
       input_ids = encoding['input_ids'].flatten()

       # create the attention mask
       attention_mask = torch.ones_like(input_ids)

       # if 'attention_mask' exists in encoding, use it
       if 'attention_mask' in encoding:
           attention_mask = encoding['attention_mask'].flatten()
           return {
           'input_ids': input_ids,
           'attention_mask': attention_mask,
           'label': torch.tensor(label, dtype=torch.long)
       }

In [33]:
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(train_loader)

# Example function for evaluating the model
def evaluate(model, eval_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(eval_loader), accuracy


In [51]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
from sklearn.model_selection import train_test_split

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
! pip install -q kaggle
! mkdir ~/.kaggle
! touch ~/.kaggle/kaggle.json
!echo '{"username":"akselsaatci","key":"asdasdasd"}' > ~/.kaggle/kaggle.json
! kaggle datasets download -d vishakhdapat/fake-news-detection

mkdir: cannot create directory ‘/root/.kaggle’: File exists
fake-news-detection.zip: Skipping, found more recently modified local copy (use --force to force download)


In [36]:
! unzip fake-news-detection.zip

Archive:  fake-news-detection.zip
replace fake_and_real_news.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [52]:
data = pd.read_csv('./fake_and_real_news.csv')
# handle duplicated values
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)  # Remove rows with missing values


data['is_fake'] = 0

data['is_fake'] = (data['label'] == 'Fake').astype(int)

def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()

data['Text'] = data['Text'].apply(preprocess)


x_train, x_test, y_train, y_test = train_test_split(data['Text'], data['is_fake'], test_size=0.3, random_state=0)

x_train.reset_index(drop=True, inplace=True)
x_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [55]:
filtered_y_test = [y_test[i] for i in range(len(y_test)) if y_test[i] == 1]
print(filtered_y_test)

filtered_y_train = [y_train[i] for i in range(len(y_train)) if y_train[i] == 1]
print(filtered_y_train)


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [61]:
BATCH_SIZE = 8
MAX_LENGTH = 32
LEARNING_RATE = 2e-5
EPOCHS = 8



In [62]:
train_dataset = FakeNewsDataset(x_train, y_train, tokenizer, MAX_LENGTH)
eval_dataset = FakeNewsDataset(x_test, y_test, tokenizer, MAX_LENGTH)

    # Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE)

    # Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

    # Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer =  torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # Training loop
for epoch in range(EPOCHS):
  train_loss = train(model, train_loader, optimizer, criterion, device)
  eval_loss, eval_accuracy = evaluate(model, eval_loader, criterion, device)
  print(f'Epoch {epoch+1}/{EPOCHS}:')
  print(f'Training Loss: {train_loss:.4f}')
  print(f'Evaluation Loss: {eval_loss:.4f} | Evaluation Accuracy: {eval_accuracy:.4f}')


with open("./gdrive/MyDrive/changed_with_eight_epocs_xlnet.pkl", "wb") as file: # file is a variable for storing the newly created file, it can be anything.
    pickle.dump(model, file) # Dump function is used to write the object into the created file in byte format.


Epoch 1/8:
Training Loss: 0.0732
Evaluation Loss: 0.0122 | Evaluation Accuracy: 0.9976
Epoch 2/8:
Training Loss: 0.0033
Evaluation Loss: 0.0110 | Evaluation Accuracy: 0.9983
Epoch 3/8:
Training Loss: 0.0000
Evaluation Loss: 0.0119 | Evaluation Accuracy: 0.9990
Epoch 4/8:
Training Loss: 0.0136
Evaluation Loss: 0.0112 | Evaluation Accuracy: 0.9983
Epoch 5/8:
Training Loss: 0.0026
Evaluation Loss: 0.0103 | Evaluation Accuracy: 0.9990
Epoch 6/8:
Training Loss: 0.0009
Evaluation Loss: 0.0114 | Evaluation Accuracy: 0.9990
Epoch 7/8:
Training Loss: 0.0000
Evaluation Loss: 0.0120 | Evaluation Accuracy: 0.9990
Epoch 8/8:
Training Loss: 0.0000
Evaluation Loss: 0.0125 | Evaluation Accuracy: 0.9990


In [77]:
with open('./gdrive/MyDrive/changed_with_eight_epocs_xlnet.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Ensure to move the model to the appropriate device if necessary
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loaded_model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [78]:

# Load your XLNet model and tokenizer
model = loaded_model

input_text = "Anthem to pare back Obamacare offerings in Nevada and Georgia (Reuters) - U.S. health insurer Anthem Inc (ANTM.N) said on Monday it will no longer offer Obamacare plans in Nevada’s state exchange and will stop offering the plans in nearly half of Georgia’s counties next year. The moves come after Republican senators last month failed to repeal and replace Obamacare, former President Barack Obama’s signature healthcare reform law, creating uncertainty over how the program providing health benefits to 20 million Americans will be funded and managed in 2018. Hundreds of U.S. counties are at risk of losing access to private health coverage in 2018 as insurers consider pulling out of those markets in the coming months. Nevada had said in June that residents in 14 counties out of 17 in the state would not have access to qualified health plans on the state exchanges. Anthem’s decision to leave the state entirely does not increase the number of “bare counties” in the state, Nevada Insurance Commissioner Barbara Richardson said in a statement. The insurer will still offer “catastrophic plans,” which can be purchased outside the state’s exchange and are only available to consumers under 30 years old or with a low income. Anthem also said it will only offer Obamacare plans in 85 of Georgia’s 159 counties. It said the counties it will continue to offer the plans in are mostly rural counties that would otherwise not have health insurance coverage for their residents.  It said these changes do not impact Anthem’s Medicare Advantage, Medicaid or employer-based plans in either state.   The company said last week that it will pull out of 16 of 19 pricing regions in California in 2018 where it offered Obamacare options this year.  Anthem blamed the moves in part on uncertainty over whether the Trump administration would maintain subsidies that keep costs down.  U.S. President Donald Trump last week threatened to cut off subsidy payments that make the plans affordable for lower-income Americans and help insurers to keep premiums down, after efforts to repeal the law signed by his predecessor, President Barack Obama, failed in Congress.  Trump has repeatedly urged Republican lawmakers to keep working to undo Obama’s Affordable Care Act."

inputs = tokenizer(input_text, return_tensors='pt', max_length=32 ,truncation=True)

inputs.to(device)

with torch.no_grad():
    outputs = model(**inputs)

predicted_label = torch.argmax(outputs.logits).item()

label_map = {1: 'fake', 0: 'real'}
predicted_class = label_map[predicted_label]

print("Predicted class:", predicted_class)

Predicted class: fake
