# Fake News Detector - Data Preprocessing
This notebook covers the data preprocessing workflow for the ISOT Fake News detection dataset, available on Kaggle.

Kaggle link: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

In [1]:
# Step 1: Install required packages
!pip install kagglehub[hf-datasets] pandas --quiet
!python -m nltk.downloader punkt_tab wordnet stopwords > /dev/null

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from time import time
from pathlib import Path
import kagglehub
from kagglehub import KaggleDatasetAdapter

## Loading data from Kaggle

In [3]:
def load_dataset():
    """Load the fake and real news CSV files from Kaggle into one DataFrame.

    Each file is fetched through kagglehub's Hugging Face adapter, converted
    to pandas and tagged with a ``label`` column (1 for ``Fake.csv``, 0 for
    ``True.csv``). The two frames are then concatenated with a fresh index.

    Returns:
        pandas.DataFrame: the fake rows followed by the real rows.
    """
    print("⏳ Loading datasets from Kaggle...")

    # (file name inside the Kaggle dataset, label assigned to its rows)
    sources = (
        ("Fake.csv", 1),  # Fake news
        ("True.csv", 0),  # Real news
    )

    # Fetch both files first, in the listed order, then convert them.
    fetched = [
        kagglehub.load_dataset(
            KaggleDatasetAdapter.HUGGING_FACE,
            "clmentbisaillon/fake-and-real-news-dataset",
            csv_name,
            hf_kwargs={"split": "all"}
        )
        for csv_name, _ in sources
    ]
    frames = [dataset.to_pandas() for dataset in fetched]

    for frame, (_, label_value) in zip(frames, sources):
        frame['label'] = label_value

    fake_df, true_df = frames
    df = pd.concat(frames, ignore_index=True)

    print(f"✅ Dataset loaded: {len(df)} records")
    print(f"   - Fake news: {len(fake_df)} samples")
    print(f"   - Real news: {len(true_df)} samples")

    return df

In [4]:
# Project directory layout, anchored at the current working directory
PROJECT_ROOT = Path.cwd()
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

# Ensure both data folders exist: missing parents are created and
# already-existing folders do not raise.
DATA_RAW.mkdir(exist_ok=True, parents=True)
DATA_PROCESSED.mkdir(exist_ok=True, parents=True)

## Process individual text entries

In [5]:
# Cache for the NLTK helpers so they are built once, not once per document.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES["lemmatizer"], _TEXT_RESOURCES["stop_words"]


def preprocess_text(text):
    """Normalise one text entry into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, drop every character that is not
    a letter, whitespace or one of ``. , ! ?``, tokenize with
    ``word_tokenize``, remove English stop words, lemmatize, and discard
    tokens shorter than two characters.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or NaN) is
            treated as missing.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Remove URLs. This must run BEFORE the character filter below: that
    # filter deletes ':' and '/', after which 'https?://' can no longer match
    # and the URL would survive as a mangled token.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove special characters/numbers except basic punctuation
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    lemmatizer, stop_words = _get_text_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Remove short words (length < 2); this also drops single-character tokens
    # such as a lone '.' or ','.
    tokens = [word for word in tokens if len(word) > 1]

    return ' '.join(tokens)

In [7]:
def preprocess_data(df):
    print("\n🧹 Preprocessing data...")
    start = time()

    # Save raw data
    raw_path = DATA_RAW / "raw_combined.csv"
    df.to_csv(raw_path, index=False)
    print(f"✅ Raw data saved to: {raw_path}")

    # Clean data
    df = df.drop_duplicates(subset=['title', 'text'])
    df['text'] = df['text'].fillna('')
    df['title'] = df['title'].fillna('')

    # Combine title and text
    df['full_text'] = df['title'] + ' ' + df['text']