In [1]:
# Step 1: Install all the necessary libraries
#
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel. The training cell passes `eval_strategy=` to TrainingArguments,
# which requires transformers >= 4.41, so that lower bound is made explicit.
# `-q` keeps the long dependency-resolution log out of the saved notebook.

%pip install -q "transformers[torch]>=4.41" datasets pandas scikit-learn kaggle

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.1->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1->transformers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.1->transformers[torch])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.1->transformers[torch])
  Downloading nvidia_cufft_cu12

In [2]:
# Step 2: Upload the Kaggle API key, then download and unpack the dataset
from google.colab import files
import os

print("Upload the 'kaggle.json' file you downloaded from your Kaggle account!")
uploaded = files.upload()

# Stop the cell early when nothing was selected in the upload dialog.
if not uploaded:
    print("No file was uploaded. Please run the cell again and select your file.")
    raise SystemExit

original_filename = next(iter(uploaded))
new_filename = "kaggle.json"

# Rename the file to the standard 'kaggle.json' (only when it arrived under
# a different name, so the message below is never a no-op report).
if original_filename != new_filename:
    os.rename(original_filename, new_filename)
    print(f"Successfully renamed '{original_filename}' to '{new_filename}'")

# Create the .kaggle directory and MOVE the API key into it, so no copy of
# the credential is left behind in the working directory.
!mkdir -p ~/.kaggle
!mv -f kaggle.json ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

print("\nDownloading the fake and real news dataset...")
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset
# -o: overwrite without prompting, otherwise re-running this cell stalls on
# unzip's interactive "replace Fake.csv?" question.
!unzip -o fake-and-real-news-dataset.zip

# Shell commands do not raise on failure, so verify the expected files exist
# before reporting success.
missing_files = [name for name in ("True.csv", "Fake.csv") if not os.path.exists(name)]
if missing_files:
    raise FileNotFoundError(
        f"Dataset download/unzip failed; missing file(s): {', '.join(missing_files)}"
    )

print("\nDataset successfully downloaded and unzipped!")

Upload the 'kaggle.json' file you downloaded from your Kaggle account!


Saving kaggle.json to kaggle.json
Successfully renamed 'kaggle.json' to 'kaggle.json'

Downloading the fake and real news dataset...
Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
Downloading fake-and-real-news-dataset.zip to /content
  0% 0.00/41.0M [00:00<?, ?B/s]
100% 41.0M/41.0M [00:00<00:00, 517MB/s]
Archive:  fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                

Dataset successfully downloaded and unzipped!


In [3]:
# Step 3: Load the data into pandas, create labels, and prepare it for training

import pandas as pd

N_PER_CLASS = 5000   # articles drawn from each CSV
RANDOM_STATE = 42    # shared seed for sampling and shuffling

# Label convention: 0 = article from True.csv, 1 = article from Fake.csv.
true_df = pd.read_csv('True.csv').assign(label=0)
fake_df = pd.read_csv('Fake.csv').assign(label=1)

# Draw an equally sized random subset from each class.
true_sample = true_df.sample(n=N_PER_CLASS, random_state=RANDOM_STATE)
fake_sample = fake_df.sample(n=N_PER_CLASS, random_state=RANDOM_STATE)

# Stack both subsets, prepend the title to the body text, shuffle the rows,
# and keep only the two columns the model needs.
df = (
    pd.concat([true_sample, fake_sample], ignore_index=True)
    .assign(text=lambda frame: frame['title'] + ". " + frame['text'])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
    .loc[:, ['text', 'label']]
)

print("Dataset prepared successfully!")
print(f"Total articles for training: {len(df)}")
print("\nHere's a preview of the final data:")
print(df.head())

Dataset prepared successfully!
Total articles for training: 10000

Here's a preview of the final data:
                                                text  label
0  Secret Service Laptop Reportedly ‘Stolen’ had ...      1
1  Trump shuffles transition team, eyes loyalists...      0
2  Trump plans to keep Comey as FBI director: sou...      0
3  South Korea, China to hold summit next month t...      0
4  Trump U.S. tax plan will not manage to pay for...      0


# *Detector*

In [4]:
# Step 4: Tokenize the text data

from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
import torch

# Hold out 20% of the articles for validation; the fixed seed keeps the
# split repeatable across runs.
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# Load the tokenizer for DistilBERT
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Same tokenizer settings for both splits: truncate to at most 512 tokens
# and pad the sequences of each split to a common length.
encode_options = dict(truncation=True, padding=True, max_length=512)
train_encodings, val_encodings = (
    tokenizer(split_texts, **encode_options)
    for split_texts in (train_texts, val_texts)
)

# Create a custom PyTorch Dataset class
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (it must provide ``.items()``).
        labels: Indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with a ``'labels'`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)

print("Tokenization complete. Datasets are ready for training.")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokenization complete. Datasets are ready for training.


In [None]:
# Step 5: Configure and start the training process
import math

import numpy as np
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

NUM_EPOCHS = 1
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Size the warmup relative to the actual run length. With 8000 training
# examples, batch size 16 and one epoch the whole run is only 500 optimizer
# steps, so the previous fixed `warmup_steps=500` meant the learning rate was
# still ramping up when training ended. Warm up over the first 10% instead.
# (Step count assumes a single device and no gradient accumulation.)
steps_per_epoch = math.ceil(len(train_dataset) / TRAIN_BATCH_SIZE)
total_train_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = max(1, total_train_steps // 10)


def compute_metrics(eval_pred):
    """Report classification accuracy for an evaluation pass.

    Args:
        eval_pred: The (logits, label_ids) pair the Trainer passes to
            ``compute_metrics`` after running the evaluation set.

    Returns:
        dict with a single ``"accuracy"`` entry in the range [0, 1].
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this the per-epoch evaluation only reports the loss.
    compute_metrics=compute_metrics,
)

# Start the training
trainer.train()

print("\n--- Training Finished! ---")

In [None]:
# Step 6: Save the fine-tuned model and its tokenizer
model_save_path = './my_fake_news_detector'

# The model (via the trainer) and the tokenizer are written to the same folder.
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(model_save_path)

print(f"Model and tokenizer have been saved to the folder: '{model_save_path}'")

In [None]:
# Step 7: Zip the model folder and download it to your local computer
import shutil
from pathlib import Path

from google.colab import files

model_dir = Path('my_fake_news_detector')

# Fail with a clear message instead of an obscure download error when the
# save step has not been run yet.
if not model_dir.is_dir():
    raise FileNotFoundError(
        f"Model folder '{model_dir}' not found. Run the save step before downloading."
    )

# make_archive rewrites the .zip from scratch on every run, so files removed
# from the folder since a previous run do not linger in the download
# (`zip -r` would update the existing archive in place).
shutil.make_archive(model_dir.name, 'zip', root_dir='.', base_dir=model_dir.name)
files.download(f'{model_dir.name}.zip')