# Fake News Detection with the XLNet Transformer Model

In [9]:
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle
from transformers import XLNetConfig, XLNetModel
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
import re
import pickle

In [10]:
import os
from google.colab import drive
# Directory at which Google Drive is mounted for this session.
MOUNTPOINT = '/content/gdrive'
# Folder path inside the mounted Drive.
# NOTE(review): DATADIR is not referenced by any other cell visible in this notebook.
DATADIR = os.path.join(MOUNTPOINT, 'My Drive', 'myfolder')
# Mount Google Drive at MOUNTPOINT via the google.colab helper.
drive.mount(MOUNTPOINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:

# Initializing a XLNet configuration (library defaults, no arguments passed)
configuration = XLNetConfig()

# Initializing a model (with random weights) from the configuration
# NOTE(review): `model` is rebound further down when the pretrained
# sequence-classification model is loaded, so this randomly initialized
# instance appears to be unused by the rest of the notebook.
model = XLNetModel(configuration)

# Accessing the model configuration (rebinds `configuration` to the model's own config object)
configuration = model.config

In [12]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one (text, label) pair per item.

    Args:
        texts: Indexable collection of raw texts; ``texts[idx]`` is looked up
            for each requested item and converted with ``str()``.
        labels: Indexable collection of integer class labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_length: Length passed to the tokenizer for padding and truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the example at ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` and a
            scalar ``torch.long`` tensor ``label``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize the text; the attention mask is requested explicitly so that
        # padding positions are reported by the tokenizer itself.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension; flatten it away.
        input_ids = encoding['input_ids'].flatten()

        # Prefer the tokenizer's mask; fall back to "attend to everything" only
        # when the tokenizer did not provide one.
        if 'attention_mask' in encoding:
            attention_mask = encoding['attention_mask'].flatten()
        else:
            attention_mask = torch.ones_like(input_ids)

        # The return must live outside the branch above: previously it was
        # nested inside the `if`, so a missing mask made this method return None.
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long),
        }

In [13]:
def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        train_loader: Sized iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Accepted for signature compatibility; not used here because
            the loss is taken from the model output.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        # Move the three tensors of this batch onto the target device.
        model_inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['label'].to(device),
        }

        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        batch_loss = model(**model_inputs).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    return running_loss / len(train_loader)

# Example function for evaluating the model
def evaluate(model, eval_loader, criterion, device):
    """Score ``model`` on ``eval_loader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``
            and ``.logits``.
        eval_loader: Sized iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Accepted for signature compatibility; not used here because
            the loss is taken from the model output.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_batch_loss, accuracy)`` where the loss is the sum of the
        per-batch losses divided by ``len(eval_loader)`` and the accuracy is
        correct predictions divided by examples seen.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in eval_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            result = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss_sum += result.loss.item()

            # Predicted class = index of the largest logit along dim 1.
            predicted = torch.argmax(result.logits, dim=1)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    accuracy = n_correct / n_seen
    mean_loss = loss_sum / len(eval_loader)
    return mean_loss, accuracy


In [14]:
# Load the tokenizer published under the 'xlnet-base-cased' checkpoint name.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# Load the same checkpoint with a 2-label sequence-classification head.
# This rebinds `model`, replacing the XLNetModel instance created earlier.
# The classification-head weights are not part of the checkpoint and are newly
# initialized (see the warning printed below), so the model must be fine-tuned
# before its predictions are meaningful.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
from sklearn.model_selection import train_test_split

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
! pip install -q kaggle
! mkdir ~/.kaggle
! touch ~/.kaggle/kaggle.json
!echo '{"username":"akselsaatci","key":"asdasdasd"}' > ~/.kaggle/kaggle.json
! kaggle datasets download -d vishakhdapat/fake-news-detection

Downloading fake-news-detection.zip to /content
  0% 0.00/9.37M [00:00<?, ?B/s]
100% 9.37M/9.37M [00:00<00:00, 186MB/s]


In [16]:
! unzip fake-news-detection.zip

Archive:  fake-news-detection.zip
  inflating: fake_and_real_news.csv  


In [17]:
data = pd.read_csv('./fake_and_real_news.csv')
# handle duplicated values
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)  # Remove rows with missing values

# Build the binary target from the ground-truth column. Previously `is_fake`
# was set to the constant 0 for every row, which gives the classifier a single
# class to learn and makes the reported accuracy meaningless.
# NOTE(review): assumes the CSV stores the ground truth in a `label` column with
# the values 'Fake' / 'Real' -- confirm with `data['label'].unique()`. The checks
# below fail loudly if that assumption does not hold.
LABEL_COLUMN = 'label'
if LABEL_COLUMN not in data.columns:
    raise KeyError(
        f"Expected a '{LABEL_COLUMN}' column holding the ground truth; "
        f"found columns {list(data.columns)}"
    )

label_text = data[LABEL_COLUMN].astype(str).str.strip().str.lower()
unknown_labels = sorted(set(label_text) - {'fake', 'real'})
if unknown_labels:
    raise ValueError(f"Unrecognised label values: {unknown_labels}")

# 1 = fake, 0 = real
data['is_fake'] = (label_text == 'fake').astype(int)
def preprocess(text):
    """Normalize a text string for tokenization.

    Every character that is not a word character, whitespace or an apostrophe
    is replaced by a space, runs of spaces are collapsed to one space, and the
    result is stripped of surrounding whitespace and lower-cased. Tabs and
    newlines inside the text are left as they are.
    """
    cleaned = text
    # Substitutions are applied in order: punctuation -> space, then squeeze spaces.
    for pattern, replacement in ((r'[^\w\s\']', ' '), (r' +', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

# Normalize every text with preprocess() before splitting.
data['Text'] = data['Text'].apply(preprocess)

# Hold out 30% of the rows for evaluation (fixed random_state for a repeatable
# split) and give each resulting Series a fresh 0..n-1 index, discarding the
# row labels inherited from `data`.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        data['Text'], data['is_fake'], test_size=0.3, random_state=0
    )
)

In [18]:
# Hyperparameters consumed by the training cell below.
# Number of examples per batch handed to both DataLoaders.
BATCH_SIZE = 8
# Tokenizer max_length: every text is padded or truncated to this many tokens.
# NOTE(review): 32 tokens keeps only the very beginning of each text -- confirm this is intended.
MAX_LENGTH = 32
# Learning rate passed to the AdamW optimizer.
LEARNING_RATE = 2e-5
# Number of passes over the training set.
EPOCHS = 1



In [None]:
# Wrap the train/test splits so each item is tokenized on access.
train_dataset = FakeNewsDataset(x_train, y_train, tokenizer, MAX_LENGTH)
eval_dataset = FakeNewsDataset(x_test, y_test, tokenizer, MAX_LENGTH)

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define loss function and optimizer
# `criterion` is passed through to train()/evaluate() to satisfy their
# signatures; those functions read the loss from the model output instead.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop: one training pass followed by one evaluation pass per epoch
for epoch in range(EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    eval_loss, eval_accuracy = evaluate(model, eval_loader, criterion, device)
    print(f'Epoch {epoch+1}/{EPOCHS}:')
    print(f'Training Loss: {train_loss:.4f}')
    print(f'Evaluation Loss: {eval_loss:.4f} | Evaluation Accuracy: {eval_accuracy:.4f}')

# Persist the fine-tuned model on the mounted Drive. The path is derived from
# MOUNTPOINT so it no longer depends on the current working directory (the
# previous "./gdrive/..." form only resolved when the cwd was /content).
# NOTE(review): pickling the whole model ties the file to the installed
# library versions and to the device the model lives on; only load this file
# from a trusted source. The pickle format is kept so existing loaders still work.
checkpoint_path = os.path.join(MOUNTPOINT, 'MyDrive', 'first_epoch_xlnet.pkl')
with open(checkpoint_path, "wb") as file:
    pickle.dump(model, file)
