# AML Final -- Fake News Detection

Note: This NB was created in Colab, so there might be issues with metadata when viewing in a jupyter NB. I am also including the "printed" pdf version in case the notebook can not be viewed.

In [5]:
# import packages

import sklearn
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.mixture import GaussianMixture
import scipy
import contractions

#Pre-Processing

In [6]:
train = pd.read_csv('train (2).csv', delimiter=';')
train = train[['title', 'text', 'label']]
test = pd.read_csv('test (1).csv', delimiter=';')
test = test[['title', 'text', 'label']]
val = pd.read_csv('evaluation.csv', delimiter=';')
val = val[['title', 'text', 'label']]

In [9]:
# Convert to lowercase
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
val['text'] = val['text'].str.lower()

# drop na
train = train.dropna(subset=['text'])
test = test.dropna(subset=['text'])
val = val.dropna(subset=['text'])

# lemmatize
wnl = WordNetLemmatizer()
def lemmatize_text(text):
    return ' '.join([wnl.lemmatize(word) for word in text.split()])

train['text'] = train['text'].apply(lemmatize_text)
test['text'] = test['text'].apply(lemmatize_text)
val['text'] = val['text'].apply(lemmatize_text)

# remove punctuation
train['text'] = train['text'].str.replace(f"[{string.punctuation}]", "", regex=True)
test['text'] = test['text'].str.replace(f"[{string.punctuation}]", "", regex=True)
val['text'] = val['text'].str.replace(f"[{string.punctuation}]", "", regex=True)

train['text'] = train['text'].apply(lambda x: contractions.fix(x))
test['text'] = test['text'].apply(lambda x: contractions.fix(x))
val['text'] = val['text'].apply(lambda x: contractions.fix(x))




In [10]:
def clean(text):
    """
    From assignment template code
    """
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    #added substitutions

    #***********added substitutions***********
    # remove all the special characters
    texter = re.sub(r'\W', ' ', texter)
    # remove all single characters
    texter = re.sub(r'\s+[a-zA-Z]\s+', ' ', texter)
    # Remove single characters from the start
    texter = re.sub(r'\^[a-zA-Z]\s+', ' ', texter)
    # Remove numbers
    texter = re.sub(r'\d+', ' ', texter)
    # Converting to Lowercase
    texter = texter.lower()
    # Remove punctuation
    texter = re.sub(r'[^\w\s]', ' ', texter)
    # Remove parentheses
    texter = re.sub(r'\([^)]*\)', ' ', texter)
    # Remove single quotes
    texter = re.sub(r'\'', ' ', texter)
    # Substituting multiple spaces with single space
    texter = re.sub(r'\s+', ' ', texter, flags=re.I)

    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    if texter == "":
        texter = ""
    return texter

In [11]:
train['text'] = train['text'].apply(clean)
test['text'] = test['text'].apply(clean)
val['text'] = val['text'].apply(clean)

In [12]:
train['text']

Unnamed: 0,text
0,ramallah west bank reuters palestinian switche...
1,beijing reuters us presidentelect donald trump...
2,while the controversy over trump personal tax ...
3,beijing reuters trip to beijing last week by z...
4,there ha never been more uncourageous person i...
...,...
24348,mexico city reuters key committee in mexico se...
24349,if she not toast now then we re in bigger trou...
24350,kremlin nato wa created for agression russia t...
24351,dallas cowboy star wide receiver dez bryant to...


# Logistic Regression Baseline

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# vectorize
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # tfidf
x_train = tfidf_vectorizer.fit_transform(train['text'])  # fit/transform on training data
x_test = tfidf_vectorizer.transform(test['text'])  # only transform test data

# train logreg
log_reg = LogisticRegression(random_state=42)
y_train = train['label']
log_reg.fit(x_train, y_train)

# predict
y_test = test['label']
y_pred = log_reg.predict(x_test)

# eval
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)



Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3753
           1       0.98      0.97      0.97      4364

    accuracy                           0.97      8117
   macro avg       0.97      0.97      0.97      8117
weighted avg       0.97      0.97      0.97      8117


Accuracy Score: 0.9726499938400887

Confusion Matrix:
[[3646  107]
 [ 115 4249]]


# Embedding / Tokenization

In [14]:
from transformers import DistilBertModel, DistilBertTokenizer

# load distilbert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [15]:
def tokenize_text(text_list, tokenizer, max_length=128):
    """Tokenizes and pads text data."""
    return tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )


In [16]:
# tokenize all datasets
train_tokenized = tokenize_text(train['text'].tolist(), tokenizer)
val_tokenized = tokenize_text(val['text'].tolist(), tokenizer)
test_tokenized = tokenize_text(test['text'].tolist(), tokenizer)

# Neural Network

In [17]:
import torch
import torch.nn as nn

In [18]:
device = torch.device("cuda")

class DistilBertForFakeNewsClassification(nn.Module):
    def __init__(self, bert_model, num_labels):
        super(DistilBertForFakeNewsClassification, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(cls_embedding)
        return logits

# init model
bert_classifier = DistilBertForFakeNewsClassification(distilbert_model, num_labels=1).to(device)


# Training

In [19]:
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

# optimzier/loss
optimizer = AdamW(bert_classifier.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()

epochs = 5
batch_size = 32



In [20]:
# preprocess labels before  loop
train_labels_tensor = torch.tensor(train['label'].values).unsqueeze(1).float().to(device)
val_labels_tensor = torch.tensor(val['label'].values).unsqueeze(1).float().to(device)

# training loop
for epoch in range(epochs):
    bert_classifier.train()
    total_loss = 0

    for i in range(0, len(train_tokenized['input_ids']), batch_size):
        batch_input_ids = train_tokenized['input_ids'][i:i+batch_size].to(device)
        batch_attention_mask = train_tokenized['attention_mask'][i:i+batch_size].to(device)
        batch_labels = train_labels_tensor[i:i+batch_size]  # Use preprocessed labels

        logits = bert_classifier(batch_input_ids, batch_attention_mask)
        loss = loss_fn(logits, batch_labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / (len(train_tokenized['input_ids']) / batch_size)

    # validation
    bert_classifier.eval()
    val_logits_list = []
    val_labels_list = []

    with torch.no_grad():
        for i in range(0, len(val_tokenized['input_ids']), batch_size):
            val_input_ids = val_tokenized['input_ids'][i:i+batch_size].to(device)
            val_attention_mask = val_tokenized['attention_mask'][i:i+batch_size].to(device)
            val_labels = val_labels_tensor[i:i+batch_size]  #  preprocessed labels

            logits = bert_classifier(val_input_ids, val_attention_mask)
            val_logits_list.extend(logits.cpu().numpy())
            val_labels_list.extend(val_labels.cpu().numpy())  # to numpy for evaluation

    val_predictions = (torch.tensor(val_logits_list).squeeze() > 0).float().numpy()
    val_accuracy = accuracy_score(val_labels_list, val_predictions)

    print(f"Epoch {epoch+1}/{epochs} | Training Loss: {avg_train_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")


  val_predictions = (torch.tensor(val_logits_list).squeeze() > 0).float().numpy()


Epoch 1/5 | Training Loss: 0.0650 | Validation Accuracy: 0.9882
Epoch 2/5 | Training Loss: 0.0280 | Validation Accuracy: 0.9880
Epoch 3/5 | Training Loss: 0.0144 | Validation Accuracy: 0.9878
Epoch 4/5 | Training Loss: 0.0077 | Validation Accuracy: 0.9882
Epoch 5/5 | Training Loss: 0.0036 | Validation Accuracy: 0.9869


In [21]:
from sklearn.metrics import confusion_matrix, classification_report

bert_classifier.eval()
test_logits_list = []
test_labels_list = []

with torch.no_grad():
    for i in range(0, len(test_tokenized['input_ids']), batch_size):
        test_input_ids = test_tokenized['input_ids'][i:i+batch_size].to(device)
        test_attention_mask = test_tokenized['attention_mask'][i:i+batch_size].to(device)
        test_labels = test['label'][i:i+batch_size].values

        logits = bert_classifier(test_input_ids, test_attention_mask)
        test_logits_list.extend(logits.cpu().numpy())
        test_labels_list.extend(test_labels)

# logits to predictions
test_predictions = (torch.tensor(test_logits_list).squeeze() > 0).float().numpy()

# test accuracy
test_accuracy = accuracy_score(test_labels_list, test_predictions)

# confusion matrix
conf_matrix = confusion_matrix(test_labels_list, test_predictions)

# classification report
clf_report = classification_report(test_labels_list, test_predictions, target_names=['Fake News', 'Real News'])

print(f"Test Accuracy: {test_accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(clf_report)


Test Accuracy: 0.986694591597881

Confusion Matrix:
[[3693   60]
 [  48 4316]]

Classification Report:
              precision    recall  f1-score   support

   Fake News       0.99      0.98      0.99      3753
   Real News       0.99      0.99      0.99      4364

    accuracy                           0.99      8117
   macro avg       0.99      0.99      0.99      8117
weighted avg       0.99      0.99      0.99      8117



# Combine Train + Val and Retrain

In [22]:
# combine train and val set for final model
final_train = pd.concat([train, val], ignore_index=True)

final_tokenized = tokenize_text(final_train['text'].tolist(), tokenizer)
final_labels_tensor = torch.tensor(final_train['label'].values).unsqueeze(1).float().to(device)

# re init model
bert_classifier = DistilBertForFakeNewsClassification(distilbert_model, num_labels=1).to(device)



In [26]:
final_labels_tensor = torch.tensor(final_train['label'].values).unsqueeze(1).float().to(device)

# training loop
for epoch in range(epochs):
    bert_classifier.train()
    total_loss = 0

    for i in range(0, len(final_tokenized['input_ids']), batch_size):
        batch_input_ids = final_tokenized['input_ids'][i:i+batch_size].to(device)
        batch_attention_mask = final_tokenized['attention_mask'][i:i+batch_size].to(device)
        batch_labels = final_labels_tensor[i:i+batch_size]  #  combined labels

        logits = bert_classifier(batch_input_ids, batch_attention_mask)
        loss = loss_fn(logits, batch_labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / (len(final_tokenized['input_ids']) / batch_size)
    print(f"Epoch {epoch+1}/{epochs} | Training Loss: {avg_train_loss:.4f}")


Epoch 1/5 | Training Loss: 0.0016
Epoch 2/5 | Training Loss: 0.0019
Epoch 3/5 | Training Loss: 0.0025
Epoch 4/5 | Training Loss: 0.0008
Epoch 5/5 | Training Loss: 0.0017


# Evaluation

In [27]:
from sklearn.metrics import confusion_matrix, classification_report

# eval on test set
bert_classifier.eval()
test_logits_list = []
test_labels_list = []

with torch.no_grad():
    for i in range(0, len(test_tokenized['input_ids']), batch_size):
        test_input_ids = test_tokenized['input_ids'][i:i+batch_size].to(device)
        test_attention_mask = test_tokenized['attention_mask'][i:i+batch_size].to(device)
        test_labels = test['label'][i:i+batch_size].values

        logits = bert_classifier(test_input_ids, test_attention_mask)
        test_logits_list.extend(logits.cpu().numpy())
        test_labels_list.extend(test_labels)

# logits to predictions
test_predictions = (torch.tensor(test_logits_list).squeeze() > 0).float().numpy()

# test accuracy
test_accuracy = accuracy_score(test_labels_list, test_predictions)
print(f"Final Test Accuracy: {test_accuracy:.4f}")

# confusion matrix
conf_matrix = confusion_matrix(test_labels_list, test_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# classification report
clf_report = classification_report(test_labels_list, test_predictions, target_names=['Fake News', 'Real News'])
print("\nClassification Report:")
print(clf_report)


Final Test Accuracy: 0.9887

Confusion Matrix:
[[3707   46]
 [  46 4318]]

Classification Report:
              precision    recall  f1-score   support

   Fake News       0.99      0.99      0.99      3753
   Real News       0.99      0.99      0.99      4364

    accuracy                           0.99      8117
   macro avg       0.99      0.99      0.99      8117
weighted avg       0.99      0.99      0.99      8117



In [30]:
torch.save(bert_classifier.state_dict(), "distilbert_fakenews_model.pth")

tokenizer.save_pretrained("distilbert_fakenews_tokenizer")


('distilbert_fakenews_tokenizer/tokenizer_config.json',
 'distilbert_fakenews_tokenizer/special_tokens_map.json',
 'distilbert_fakenews_tokenizer/vocab.txt',
 'distilbert_fakenews_tokenizer/added_tokens.json')