# Fake News Detector - Data Preprocessing
The notebook covers the Data Preprocessing workflow run on ISOT Fake News detection dataset, provided by Kaggle.

The Kaggle Link : https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

In [13]:
# Step 1: Install required packages
!pip install kagglehub[hf-datasets] pandas --quiet
!python -m nltk.downloader punkt_tab wordnet stopwords > /dev/null

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from time import time
from pathlib import Path
import kagglehub
from kagglehub import KaggleDatasetAdapter

## Loading data from Kaggle

In [15]:
def load_dataset():
    print("⏳ Loading datasets from Kaggle...")

    fake_ds = kagglehub.load_dataset(
        KaggleDatasetAdapter.HUGGING_FACE,
        "clmentbisaillon/fake-and-real-news-dataset",
        "Fake.csv",
        hf_kwargs={"split": "all"}
    )
    true_ds = kagglehub.load_dataset(
        KaggleDatasetAdapter.HUGGING_FACE,
        "clmentbisaillon/fake-and-real-news-dataset",
        "True.csv",
        hf_kwargs={"split": "all"}
    )

    fake_df = fake_ds.to_pandas()
    true_df = true_ds.to_pandas()

    fake_df['label'] = 1  # Fake news
    true_df['label'] = 0  # Real news

    df = pd.concat([fake_df, true_df], ignore_index=True)

    print(f"✅ Dataset loaded: {len(df)} records")
    print(f"   - Fake news: {len(fake_df)} samples")
    print(f"   - Real news: {len(true_df)} samples")

    return df

# Load the data
full_df = load_dataset()

⏳ Loading datasets from Kaggle...


  fake_ds = kagglehub.load_dataset(
  true_ds = kagglehub.load_dataset(


✅ Dataset loaded: 44898 records
   - Fake news: 23481 samples
   - Real news: 21417 samples


In [16]:
# Define project structure paths
PROJECT_ROOT = Path.cwd()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

# Create directories if they don't exist
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

## Process individual text entries

In [17]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Remove special characters/numbers except basic punctuation
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Remove short words (length < 2)
    tokens = [word for word in tokens if len(word) > 1]

    return ' '.join(tokens)

## Full preprocess data workflow

In [18]:
def preprocess_data(df):
    print("\n🧹 Preprocessing data...")
    start = time()

    # Save raw data
    raw_path = DATA_RAW / "raw_combined.csv"
    df.to_csv(raw_path, index=False)
    print(f"✅ Raw data saved to: {raw_path}")

    # Clean data
    df = df.drop_duplicates(subset=['title', 'text'])
    df['text'] = df['text'].fillna('')
    df['title'] = df['title'].fillna('')

    # Combine title and text
    df['full_text'] = df['title'] + ' ' + df['text']

    # Preprocess text in batches
    batch_size = 2000
    batches = [df.iloc[i:i+batch_size] for i in range(0, len(df), batch_size)]

    processed_texts = []
    for i, batch in enumerate(batches):
        print(f"  Processing batch {i+1}/{len(batches)}")
        processed_batch = batch['full_text'].apply(preprocess_text)
        processed_texts.extend(processed_batch)

    df['processed_text'] = processed_texts

    # Create train-test split
    train_df = df.sample(frac=0.8, random_state=42)
    test_df = df.drop(train_df.index)

    # Save processed data
    train_path = DATA_PROCESSED / "train_processed.csv"
    test_path = DATA_PROCESSED / "test_processed.csv"

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print(f"✅ Preprocessing completed in {time()-start:.2f} seconds")
    print(f"   - Train set ({len(train_df)} samples): {train_path}")
    print(f"   - Test set ({len(test_df)} samples): {test_path}")

    # Save additional artifacts for future reference
    sample_path = DATA_PROCESSED / "sample_processed_texts.csv"
    train_df[['processed_text', 'label']].head(100).to_csv(sample_path, index=False)
    print(f"   - Sample processed texts: {sample_path}")

    return train_df, test_df

# Preprocess the data
train_df, test_df = preprocess_data(full_df)


🧹 Preprocessing data...
✅ Raw data saved to: /content/data/raw/raw_combined.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['full_text'] = df['title'] + ' ' + df['text']


  Processing batch 1/20
  Processing batch 2/20
  Processing batch 3/20
  Processing batch 4/20
  Processing batch 5/20
  Processing batch 6/20
  Processing batch 7/20
  Processing batch 8/20
  Processing batch 9/20
  Processing batch 10/20
  Processing batch 11/20
  Processing batch 12/20
  Processing batch 13/20
  Processing batch 14/20
  Processing batch 15/20
  Processing batch 16/20
  Processing batch 17/20
  Processing batch 18/20
  Processing batch 19/20
  Processing batch 20/20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_text'] = processed_texts


✅ Preprocessing completed in 209.12 seconds
   - Train set (31284 samples): /content/data/processed/train_processed.csv
   - Test set (7821 samples): /content/data/processed/test_processed.csv
   - Sample processed texts: /content/data/processed/sample_processed_texts.csv
