In [None]:
# My custom setup script
import os

os.environ['HF_HUB_DISABLE_IMPLICIT_TOKEN'] = '1'
os.environ['WANDB_DISABLED'] = 'true'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

print("Installing transformers version so no 404 error occurs...")
!pip uninstall -y peft -q
!pip install -q transformers==4.40.1 accelerate==0.27.0
print("[SUCCESS]: Environment variables have been set")

In [None]:
import pandas as pd
import numpy as np
import torch
import time
from torch.utils.data import Dataset 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns

# Reached only if every import above succeeded; an ImportError would stop the cell earlier.
print("[SUCCESS]: Libraries were imported")
    

In [None]:
import torch

# Report whether torch can see a CUDA device; this cell only prints, it never raises.
print("[CHECKING]: GPU")

if not torch.cuda.is_available():
    print("[ERROR]: GPU was not detected. Please enable it to --> GPU T4 x 2")
else:
    # Describe device 0: its name first, then its total memory in GB (bytes / 1e9).
    gpuName = torch.cuda.get_device_name(0)
    print(f"[SUCCESS]: Available GPU --> {gpuName}")
    gpuMemoryGb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"[SUCCESS]: GPU Memory --> {gpuMemoryGb:.2f} GB")

In [None]:
# PyTorch's training loop consumes map-style datasets, not pandas objects, so this
# adapter exposes the pandas text/label columns through the Dataset protocol.
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one article per lookup.

    Args:
        texts: pandas Series of article texts (each value is coerced with ``str``).
        labels: pandas Series of class labels aligned with ``texts``.
        tokenizer: callable accepting ``(text, truncation, padding, max_length,
            return_tensors)`` and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        maxLength: length every encoded sample is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, maxLength=512):
        self.tokenizer = tokenizer
        self.maxLength = maxLength
        # Drop the original index so positional lookups run from 0 to len - 1.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)

    def __len__(self):
        """Number of samples, taken from the text series."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` / ``labels`` tensors for row ``idx``."""
        rawText = self.texts.iloc[idx]
        rawLabel = self.labels.iloc[idx]

        tokenizerOptions = {
            "truncation": True,
            "padding": "max_length",
            "max_length": self.maxLength,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(str(rawText), **tokenizerOptions)

        # The tokenizer output is flattened so each sample is a 1-D tensor.
        sample = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        sample["labels"] = torch.tensor(rawLabel, dtype=torch.long)
        return sample


print("[SUCCESS]: Dataset class successfully defined!")

In [None]:
# Load processed data
def loadProcessedData(searchDirs=(".", "/kaggle/input/processed-data")):
    """Load the train / validation / test splits from the first directory that has all three.

    Each directory in ``searchDirs`` is tried in order. A directory is skipped
    only when one of its CSV files is missing; any other failure (malformed
    CSV, permission error, keyboard interrupt) propagates immediately instead
    of being reported as "data not found".

    Args:
        searchDirs: directories to probe, in priority order. The default keeps
            the original behaviour: the working directory first, then the
            Kaggle input folder.

    Returns:
        Tuple ``(trainDf, valDf, testDf)`` of pandas DataFrames.

    Raises:
        FileNotFoundError: if no directory contains all three files.
    """
    import os  # local import so this cell does not depend on the setup cell

    fileNames = ("train.csv", "validation.csv", "test.csv")
    lastError = None

    for directory in searchDirs:
        try:
            trainDf, valDf, testDf = [
                pd.read_csv(os.path.join(directory, fileName)) for fileName in fileNames
            ]
        except FileNotFoundError as missing:
            # Only a missing file moves us on to the next candidate directory.
            lastError = missing
            continue
        return trainDf, valDf, testDf

    print("[ERROR]: Data not found. Please upload the required train.csv, test.csv, and validation.csv files")
    raise FileNotFoundError(
        f"train.csv, validation.csv and test.csv were not all found in any of: {list(searchDirs)}"
    ) from lastError

# Read the three splits once; later cells use trainDf / valDf / testDf.
trainDf, valDf, testDf = loadProcessedData()

# Row counts per split, computed once and reused for the total.
trainCount = len(trainDf)
valCount = len(valDf)
testCount = len(testDf)

print("[SUCCESS]: Data was successfully added")
print(f"Training samples: {trainCount:,}")
print(f"Validation samples: {valCount:,}")
print(f"Test samples: {testCount:,}")
print(f"Total: {trainCount + valCount + testCount:,}")


In [None]:
def computeMetrics(pred):
    """Turn a prediction object into the metric dictionary passed to the Trainer.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``; the last
        three use the weighted average.
    """
    trueLabels = pred.label_ids
    predictedLabels = pred.predictions.argmax(-1)

    # Result tuple is (precision, recall, f1, support); support is not used.
    weighted = precision_recall_fscore_support(trueLabels, predictedLabels, average='weighted')
    accuracy = accuracy_score(trueLabels, predictedLabels)

    return {
        "accuracy": accuracy,
        "f1": weighted[2],
        "precision": weighted[0],
        "recall": weighted[1],
    }

def plotConfusionMatrix(yTrue, yPred, outputPath="./confusionMatrix.png"):
    """Draw the binary Fake/Real confusion matrix, save it as a PNG and return the counts.

    Args:
        yTrue: true labels, encoded as 0 (fake) and 1 (real).
        yPred: predicted labels, same encoding and length as ``yTrue``.
        outputPath: where the PNG is written.

    Returns:
        The 2x2 confusion matrix. Rows are actual classes and columns are
        predicted classes, both in the order [0, 1].
    """
    classNames = ["Fake (0)", "Real (1)"]

    # Pin the label order so the matrix is always 2x2, even when one class is
    # absent from both inputs. Without this the matrix shrinks to 1x1 and the
    # annotation loop (and later cm[1, 1] lookups) would index out of range.
    cm = confusion_matrix(yTrue, yPred, labels=[0, 1])

    # Explicit figure/axes handles avoid relying on pyplot's implicit "current figure".
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classNames,
        yticklabels=classNames,
        cbar_kws={"label": "Count"},
        ax=ax,
    )

    ax.set_title("Fake News Detection Confusion Matrix", fontsize=16, fontweight="bold")
    ax.set_ylabel("Actual", fontsize=12)
    ax.set_xlabel("Predicted", fontsize=12)

    # Add each cell's share of all samples just below the count annotation.
    total = cm.sum()
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            percentage = (cm[i, j] / total) * 100 if total else 0.0
            ax.text(j + 0.5, i + 0.7, f'\n{percentage:.1f}%', ha="center", va="center", fontsize=10, color="gray")

    fig.tight_layout()
    fig.savefig(outputPath, dpi=300, bbox_inches="tight")
    print(f"[SUCCESS]: Confusion Matrix was successfully generated and saved to {outputPath}")
    # Close the figure after saving so it is not kept open in memory.
    plt.close(fig)

    return cm

print("[SUCCESS]: Added metric and confusion matrix functions")

In [None]:
# Loading Model and Tokenizer
# Produces the notebook-level names `tokenizer`, `model` and `device`.

modelName = "roberta-base"
print(f"[LOADING]: roberta-base model {modelName}")

try:
    # First attempt: spell out the tokenizer/hub options explicitly.
    tokenizer = AutoTokenizer.from_pretrained(modelName, use_fast=True, local_files_only=False)

    model = AutoModelForSequenceClassification.from_pretrained(
        modelName,
        num_labels=2,
        local_files_only=False,
    )

    # Use the GPU when torch can see one, otherwise stay on the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    print("[SUCCESS]: Model was successfully loaded")
    parameterCount = sum(weights.numel() for weights in model.parameters())
    print(f"[PARAMETERS]: {parameterCount:,}")

except Exception as e:
    # Second attempt: same checkpoint, loaded with library defaults only.
    print(f"[ERROR]: {e}")
    print(f"[FALLBACK]: Trying another method...")

    tokenizer = AutoTokenizer.from_pretrained(modelName)

    model = AutoModelForSequenceClassification.from_pretrained(modelName, num_labels=2)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    print("[SUCCESS-FALLBACK]: Model was successfully loaded")


In [None]:
print("Creating PyTorch datasets...")


def _buildSplitDataset(frame):
    """Wrap one split's "text" and "label" columns in a FakeNewsDataset (512-token limit)."""
    return FakeNewsDataset(
        texts=frame["text"],
        labels=frame["label"],
        tokenizer=tokenizer,
        maxLength=512,
    )


# All three splits share the same tokenizer and maximum length.
trainDataset = _buildSplitDataset(trainDf)
valDataset = _buildSplitDataset(valDf)
testDataset = _buildSplitDataset(testDf)

print("[SUCCESS]: Datasets were successfully created")
for splitLabel, splitDataset in (
    ("Training", trainDataset),
    ("Validation", valDataset),
    ("Test", testDataset),
):
    print(f"{splitLabel} dataset: {len(splitDataset):,} samples")

In [None]:
print("[CONFIGURATION]: Configuring Training Arguments...")

# Mixed-precision (fp16) training needs a CUDA device. The GPU-check and
# model-loading cells both allow a CPU-only run, so enable fp16 only when a GPU
# is present instead of failing while the arguments are being built.
useFp16 = torch.cuda.is_available()

trainingArgs = TrainingArguments(

    # NOTE: Trainer checkpoints land in the same folder the final model is saved to later.
    output_dir="./fakeNewsModel",

    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # 8 samples x 2 accumulation steps = 16 samples per device per optimizer step.
    gradient_accumulation_steps=2,

    # Only warmup_steps is set. The original also passed warmup_ratio=0.1, but
    # warmup_steps takes precedence whenever it is > 0, so the ratio had no
    # effect. Dropping it keeps the schedule identical and removes the ambiguity.
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,

    # Evaluate and checkpoint on the same schedule, which load_best_model_at_end requires.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,

    save_total_limit=2,
    load_best_model_at_end=True,
    # "f1" is the key returned by computeMetrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,

    fp16=useFp16,
    dataloader_num_workers=2,

    report_to="none",
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=trainDataset,
    eval_dataset=valDataset,
    compute_metrics=computeMetrics,
    # Stop once the tracked metric has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [None]:
print("[STARTING]: Starting model training...")

# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during a long run (time.time() can be).
startTime = time.perf_counter()
trainer.train()
# Elapsed seconds; reused by the final recap cell.
trainingTime = time.perf_counter() - startTime

print("[SUCCESS]: Training has been complete!")
print(f"Training Time: {trainingTime/60:.2f} minutes (or {trainingTime/3600:.2f} hours)")

In [None]:
print("[EVALUATING]: Evaluating on Test set...")
print("[PREDICTIONS]: Predictions loading...")

# Run the trained model on the held-out split; the arg-max over the last axis
# of the returned scores is the predicted class.
predictions = trainer.predict(testDataset)
yPred = predictions.predictions.argmax(-1)
yTrue = testDf["label"].values

accuracy = accuracy_score(yTrue, yPred)
# Keep only (precision, recall, f1) by slicing rather than unpacking the unused
# fourth value into `_`: a notebook-level assignment to `_` shadows IPython's
# last-output variable for the rest of the session.
precision, recall, f1 = precision_recall_fscore_support(yTrue, yPred, average="weighted")[:3]

print("[SUCCESS]: Predictions were successfully calculated!")
print(f"Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-Score: {f1:.4f}")

In [None]:
# Plot and save the test-set confusion matrix; `cm` is reused when the evaluation report is written.
print("[GENERATING]: Generating confusion matrix...")
cm = plotConfusionMatrix(yTrue, yPred, outputPath="./confusionMatrix.png")
# plotConfusionMatrix already prints a success line that includes the output path;
# this is a second, shorter confirmation.
print("[SUCCESS]: Confusion Matrix was successfully generated and saved!")

In [None]:
print("[LOADING]: Classification Report...")

print(classification_report(yTrue, yPred, target_names=["Fake (0)", "Real (1)"], digits=4))

# Positions in the test split where the prediction disagrees with the label.
misclassifiedIndices = np.where(yTrue != yPred)[0]
# Computed once and reused by both summary lines below.
errorRate = (len(misclassifiedIndices) / len(yTrue)) * 100
print(f"Number of misclassified samples: {len(misclassifiedIndices)} out of {len(yTrue)} ({errorRate:.2f}%)")
print(f"Error Rate: {errorRate:.2f}%")

if len(misclassifiedIndices) > 0:
    print("Some misclassified samples:")
    # Show at most the first three mistakes, numbered from 1.
    for i, idx in enumerate(misclassifiedIndices[:3], 1):
        # Coerce with str() exactly as FakeNewsDataset does, so a missing (NaN)
        # or non-string text cell cannot break the slice.
        textPreview = str(testDf.iloc[idx]["text"])[:150] + "..."
        trueLabel = "Real" if yTrue[idx] == 1 else "Fake"
        predLabel = "Real" if yPred[idx] == 1 else "Fake"
        print(f"\n{i}. True Label: {trueLabel}, Predicted Label: {predLabel}\n")
        print(f"   Text Preview: {textPreview}\n")

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side in one folder.
saveDir = "./fakeNewsModel"
model.save_pretrained(saveDir)
tokenizer.save_pretrained(saveDir)
print("[SUCCESS] Model and tokenizer saved to './fakeNewsModel'")

# Write the plain-text report: headline metrics, the 2x2 confusion matrix
# (rows = actual, columns = predicted) and the per-class classification report.
with open("./evaluation_results.txt", "w") as f:
    reportSections = [
        "VeriNews Detector - Evaluation Results\n",
        f"Test Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)\n",
        f"Precision:      {precision:.4f}\n",
        f"Recall:         {recall:.4f}\n",
        f"F1 Score:       {f1:.4f}\n\n",
        "Confusion Matrix:\n",
        f"                Predicted Fake    Predicted Real\n",
        f"Actual Fake          {cm[0,0]:6d}           {cm[0,1]:6d}\n",
        f"Actual Real          {cm[1,0]:6d}           {cm[1,1]:6d}\n\n",
        classification_report(yTrue, yPred, target_names=["Fake (0)", "Real (1)"], digits=4),
    ]
    f.write("".join(reportSections))

print("Evaluation results saved to ./evaluation_results.txt")
print("[SUCCESS]: Model training and evaluation are complete!")

In [None]:
print("[PACKAGING]: Creating download for model")
!zip -r fakeNewsModel.zip fakeNewsModel/
print("Packaging model as fakeNewsModel")

from IPython.display import FileLink, display
print("\nDownload trained model: ")
display(FileLink("fakeNewsModel.zip"))

print("[RECAP]: Recapping everything that was generated: ")
print(f"fake_news_model/         - This is the trained model")
print(f"fake_news_model.zip      - This is the model packaged as a zip")
print(f"confusion_matrix.png     - This is the generated Confusion Matrix PNG")
print(f"evaluation_results.txt   - This the model's evaluation results Complete evaluation report")

print("[RECAP]: Recapping the model's final performance: ")
print(f"Accuracy:  {accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall:    {recall*100:.2f}%")
print(f"F1 Score:  {f1*100:.2f}%")

print(f"Model training took roughly {trainingTime/60:.1f} minutes (soooo long but it works!)")

print("[FINAL-SUCCESS]: Model is now fully functional and ready to be used! Let's gooooooo!")