# Fake News Detection - Data Preprocessing

This notebook loads the fake and real news datasets, labels and combines them, cleans the article text, and saves the preprocessed data for our fake news detection project.

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load datasets
fake = pd.read_csv("../data/Fake.csv")
real = pd.read_csv("../data/True.csv")

# Add target labels: 0 = fake, 1 = real
fake["label"] = 0
real["label"] = 1

# Combine and shuffle; the fixed random_state keeps the shuffled order reproducible
df = pd.concat([fake, real], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

# Combine title and text into a single field.
# Missing values are replaced with "" first: concatenating a string with NaN
# yields NaN, which would discard whichever half of the article is present
# and leave a non-string value in "content" for the text-cleaning step.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")

print("Dataset shape:", df.shape)
df.head()

In [None]:
# Cache for the stopword set so the NLTK corpus is only loaded once,
# not once per word of every document.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def clean_text(text, stop_words=None):
    """Normalize raw article text into lowercase words separated by single spaces.

    The text has URLs removed, every character other than an ASCII letter
    replaced by a space, and is then lowercased, split on whitespace, and
    filtered against a stopword collection.

    Args:
        text: Raw text to clean. Non-string values (for example the NaN that
            pandas uses for a missing cell, or None) are treated as empty.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stopword list is used.

    Returns:
        The remaining words joined by single spaces, or "" if nothing remains.
    """
    # re.sub raises TypeError on non-strings, so treat missing values as empty
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = _english_stopwords()
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Replace non-alphabetic characters with spaces so words stay separated
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # Lowercase and split into words
    words = text.lower().split()
    # Remove stopwords
    return " ".join(w for w in words if w not in stop_words)

# Run clean_text over every combined title+text entry
print("Starting text preprocessing...")
cleaned_series = df["content"].apply(clean_text)
df["clean_text"] = cleaned_series
print("Preprocessing complete!")

# Drop everything except the raw content, its cleaned form, and the label
output_columns = ["content", "clean_text", "label"]
df = df[output_columns]

# Write the preprocessed frame to disk without the index column
output_path = "../data/preprocessed_data.csv"
df.to_csv(output_path, index=False)
print("Data saved to preprocessed_data.csv")

# Show the first rows of the result
df.head()