# Step 1: Classification Model Development


This code trains a BERT-based model to classify news text, focusing on reliable, accurate fake news detection.

## Imports and Setup

In [1]:
# Install required libraries
!pip install transformers
!pip install tweet-preprocessor
!pip install textblob textstat

# Standard libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# Sklearn libraries for preprocessing and evaluation
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

# Transformers library for BERT and tokenizer
import transformers
from transformers import AutoModel, BertTokenizerFast, AdamW

# Tweet-preprocessor for text cleaning
import preprocessor as p

# PyTorch utilities for data handling
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# TextBlob and textstat for sentiment analysis and readability scores
from textblob import TextBlob
import textstat

# Scipy for hypothesis testing
import scipy.stats as stats

# Specify GPU if available, else default to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Enable access to Google Drive for file storage
from google.colab import drive
drive.mount('/content/drive')

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl.metadata (5.9 kB)
Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.0 textstat-0.7.4
Mounted at /content/drive


## Data Loading and Pre-Processing

This code loads the training and validation datasets, applies preprocessing to clean up the text data (tweets), and encodes the labels (real/fake) into a numerical format for modeling.

In [5]:
# Function to load data from TSV file into a DataFrame
def getData(file):
    """Load TSV file and return DataFrame."""
    return pd.read_csv(file, delimiter="\t")

# Define file paths for the training and validation datasets
trainFilename = "/content/drive/MyDrive/Fake News Detection Data/Constrain AI/Constraint_English_Train - Sheet1.tsv"
testFilename = "/content/drive/MyDrive/Fake News Detection Data/Constrain AI/Constraint_English_Test - Sheet1.tsv"

# Load training and validation datasets
trainDF = getData(trainFilename)
testDF = getData(testFilename)
print("Train Data Shape: ", trainDF.shape)
print("Test Data Shape: ", testDF.shape)

# Function to preprocess tweets: removes special characters, converts to lowercase, removes hashtags and mentions
def preprocessTweet(row):
    text = row['tweet']
    text = p.clean(text)
    text = text.lower().replace(r'[^\w\s]', ' ').replace(r'\s\s+', ' ').replace("#", "").replace("@", "")
    return text

# Apply preprocessing function to each row in the dataset
trainDF['processedTweet'] = trainDF.apply(preprocessTweet, axis=1)
testDF['processedTweet'] = testDF.apply(preprocessTweet, axis=1)

# Encode labels into numerical format for model compatibility
labelEncoder = preprocessing.LabelEncoder()
labelEncoder.fit(['real', 'fake'])
trainDF['numericalLabels'] = labelEncoder.transform(trainDF['label'])
testDF['numericalLabels'] = labelEncoder.transform(testDF['label'])

Train Data Shape:  (6420, 3)
Test Data Shape:  (2140, 3)


## Data Splitting and Tokenization

The code splits the data into training and validation sets and uses BERT’s tokenizer to convert the text into a format suitable for model input (token IDs and attention masks).

In [3]:
# Split training data for internal validation
trainText, validText, trainLabels, validLabels = train_test_split(
    trainDF['processedTweet'], trainDF['numericalLabels'],
    random_state=2018,
    test_size=0.04,
    stratify=trainDF['numericalLabels']
)

# Load pretrained BERT model and tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Function to tokenize and encode sequences for input to BERT model
def tokenize_text(text_data, tokenizer, max_length=200):
    return tokenizer.batch_encode_plus(
        text_data.tolist(),
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

# Tokenize and encode train, validation, and test datasets
tokens_train = tokenize_text(trainText, tokenizer)
tokens_val = tokenize_text(validText, tokenizer)
tokens_test = tokenize_text(testDF['processedTweet'], tokenizer)

# Prepare tensors for model input, including sequence, mask, and labels
train_seq, train_mask, train_y = tokens_train['input_ids'], tokens_train['attention_mask'], torch.tensor(trainLabels.tolist())
val_seq, val_mask, val_y = tokens_val['input_ids'], tokens_val['attention_mask'], torch.tensor(validLabels.tolist())
test_seq, test_mask, test_y = tokens_test['input_ids'], tokens_test['attention_mask'], torch.tensor(testDF['numericalLabels'].tolist())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Data Loader Creation

This code cell creates DataLoader objects to efficiently batch and load data during training and validation, helping with faster processing

In [6]:
# Define batch size
batch_size = 32

# Create DataLoader for train, validation, and test sets
train_data = TensorDataset(train_seq, train_mask, train_y)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)

test_data = TensorDataset(test_seq, test_mask, test_y)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)

## Model Definition

The code sets up a custom BERT-based architecture, which uses BERT as a base model with additional fully connected layers for classification. BERT’s parameters are frozen to reduce training time.

In [7]:
# Load pretrained BERT model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Freeze BERT parameters to prevent updating during training
for param in bert.parameters():
    param.requires_grad = False

# Define custom BERT model architecture with added layers
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        outputs = self.bert(sent_id, attention_mask=mask)
        cls_hs = outputs.pooler_output
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

# Initialize model and send to device
model = BERT_Arch(bert)
model = model.to(device)

## Optimizer, Loss Function, and Class Weights

The code configures an AdamW optimizer and a cross-entropy loss function, adjusting for class imbalance by assigning weights to classes.

In [8]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(trainLabels), y=trainLabels)
weights = torch.tensor(class_weights, dtype=torch.float).to(device)
cross_entropy = nn.NLLLoss(weight=weights)  # Weighted loss function



## Training and Evaluation Functions

This code defines functions to handle model training and validation, calculating loss and updating the model’s parameters in each epoch.

In [9]:
# Define training function
def train():
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = [item.to(device) for item in batch]
        sent_id, mask, labels = batch
        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    return total_loss / len(train_dataloader)

# Define evaluation function
def evaluate():
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = [item.to(device) for item in batch]
            sent_id, mask, labels = batch
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()
    return total_loss / len(val_dataloader)

## Training Loop

The model is trained over multiple epochs, saving the model whenever validation loss improves, indicating better performance on unseen data.

In [10]:
# Saving is based on best validation loss
epochs = 15
best_valid_loss = float('inf')

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    train_loss = train()
    valid_loss = evaluate()

    # Save model if validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/MyDrive/Fake News Detection Data/Constrain AI/saved_weights.pt')

    print(f"Training Loss: {train_loss:.3f}")
    print(f"Validation Loss: {valid_loss:.3f}")


Epoch 1/15
Training Loss: 0.668
Validation Loss: 0.620

Epoch 2/15
Training Loss: 0.635
Validation Loss: 0.587

Epoch 3/15
Training Loss: 0.610
Validation Loss: 0.551

Epoch 4/15
Training Loss: 0.587
Validation Loss: 0.524

Epoch 5/15
Training Loss: 0.565
Validation Loss: 0.501

Epoch 6/15
Training Loss: 0.544
Validation Loss: 0.483

Epoch 7/15
Training Loss: 0.524
Validation Loss: 0.466

Epoch 8/15
Training Loss: 0.507
Validation Loss: 0.449

Epoch 9/15
Training Loss: 0.489
Validation Loss: 0.434

Epoch 10/15
Training Loss: 0.471
Validation Loss: 0.423

Epoch 11/15
Training Loss: 0.456
Validation Loss: 0.411

Epoch 12/15
Training Loss: 0.444
Validation Loss: 0.401

Epoch 13/15
Training Loss: 0.436
Validation Loss: 0.394

Epoch 14/15
Training Loss: 0.417
Validation Loss: 0.385

Epoch 15/15
Training Loss: 0.411
Validation Loss: 0.378


## Model Testing and Evaluation

The best-performing model is loaded, and predictions are made on the test set. Finally, it generates a classification report showing the model’s performance metrics.

In [11]:
# Load best model
model.load_state_dict(torch.load('/content/drive/MyDrive/Fake News Detection Data/Constrain AI/saved_weights.pt'))
model.eval()

# Perform predictions on test data
preds = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = [item.to(device) for item in batch]
        sent_id, mask, labels = batch
        batch_preds = model(sent_id, mask)
        preds.extend(batch_preds.detach().cpu().numpy())

# Convert predictions to class labels
preds = np.argmax(np.array(preds), axis=1)
print(classification_report(test_y, preds))

  model.load_state_dict(torch.load('/content/drive/MyDrive/Fake News Detection Data/Constrain AI/saved_weights.pt'))


              precision    recall  f1-score   support

           0       0.81      0.86      0.84      1020
           1       0.87      0.82      0.84      1120

    accuracy                           0.84      2140
   macro avg       0.84      0.84      0.84      2140
weighted avg       0.84      0.84      0.84      2140



# Step 2: Sentiment, Length, and Readability Analysis



This code creates these features and modifies the output of the BERT model.

In [12]:
# Prepare DataFrame with test tweets, predictions, and labels
classified_tweets = pd.DataFrame({
    'text': testDF['processedTweet'],  # Use the processed tweet text from validation set
    'label': preds  # Predictions from BERT model, 1 for real, 0 for fake
})

# Function to determine sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

# Apply sentiment analysis to each tweet
classified_tweets['sentiment'] = classified_tweets['text'].apply(get_sentiment)

# Calculate the length of each tweet (e.g., by word count)
classified_tweets['length'] = classified_tweets['text'].apply(lambda x: len(x.split()))

# Calculate the readability score (Flesch Reading Ease)
classified_tweets['readability'] = classified_tweets['text'].apply(lambda x: textstat.flesch_reading_ease(x))

# Display the DataFrame to confirm sentiment analysis, length, and readability results
classified_tweets[['text', 'label', 'sentiment', 'length', 'readability']].head()

Unnamed: 0,text,label,sentiment,length,readability
0,chinese converting to islam after realising th...,0,neutral,15,39.33
1,out of people (from the diamond princess cruis...,0,positive,28,51.52
2,"covid-19 is caused by a bacterium, not virus a...",0,neutral,14,82.65
3,mike pence in rnc speech praises donald trumps...,0,positive,20,59.64
4,/10 sky's explains the latest data and governm...,1,positive,15,72.32


# Step 3: Hypothesis Testing

This code checks if there is statistical significance between the three features and news type.

In [13]:
# Convert sentiment to numerical values for chi-square test
sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
classified_tweets['sentiment_numeric'] = classified_tweets['sentiment'].map(sentiment_mapping)

# Separate data into real and fake news for length and readability analysis
real_news = classified_tweets[classified_tweets['label'] == 1]
fake_news = classified_tweets[classified_tweets['label'] == 0]

# 1. Chi-Square Test of Independence for Sentiment
contingency_table = pd.crosstab(classified_tweets['label'], classified_tweets['sentiment'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print("Chi-square Test for Sentiment and News Type")
print(f"Chi-square statistic: {chi2}, p-value: {p}")

# 2. Independent Samples T-Test for Length
t_stat, p_val = stats.ttest_ind(real_news['length'], fake_news['length'], equal_var=False)
print("\nT-Test for Length and News Type")
print(f"T-statistic (length): {t_stat}, p-value: {p_val}")

# 3. Independent Samples T-Test for Readability
t_stat_readability, p_val_readability = stats.ttest_ind(real_news['readability'], fake_news['readability'], equal_var=False)
print("\nT-Test for Readability and News Type")
print(f"T-statistic (readability): {t_stat_readability}, p-value: {p_val_readability}")

Chi-square Test for Sentiment and News Type
Chi-square statistic: 184.26869661668667, p-value: 9.695292945811067e-41

T-Test for Length and News Type
T-statistic (length): 17.702764972593126, p-value: 2.0587104927984265e-65

T-Test for Readability and News Type
T-statistic (readability): -1.283051259240334, p-value: 0.19961347486386288
