# Fake News Detection - Data Preprocessing

This notebook handles the data loading and preprocessing steps for our fake news detection project.

In [None]:
import re
from pathlib import Path

import kagglehub
import nltk
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')

# Fetch the Kaggle dataset through kagglehub; the returned location is kept in
# `path` because the loading cell below reads the CSV files from it.
DATASET_HANDLE = "clmentbisaillon/fake-and-real-news-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)
print(f"Path to dataset files: {path}")

In [None]:
# Load the two source CSVs from the directory returned by the download step.
dataset_dir = Path(path)
fake = pd.read_csv(dataset_dir / "Fake.csv")
real = pd.read_csv(dataset_dir / "True.csv")

# Add target labels: rows from Fake.csv get 0, rows from True.csv get 1.
fake["label"] = 0
real["label"] = 1

# Combine and shuffle. The fixed random_state keeps the row order reproducible
# between runs; reset_index drops the per-file row numbers.
df = pd.concat([fake, real], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

# Combine title and text into one field. Missing values are replaced with ""
# first: string concatenation with NaN yields NaN, which would make the whole
# "content" cell a float instead of text.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")

# Keep only the columns used downstream. The explicit copy makes `df` an
# independent frame, so adding columns to it later does not trigger pandas'
# chained-assignment warning.
df = df[["content", "label"]].copy()

print("Dataset shape:", df.shape)
df.head()

In [None]:
# Patterns are compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(r"http\S+")
_NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z]")

# Filled on first use by _english_stopwords(); kept as a frozenset so that
# membership tests are O(1).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalize a raw article string for downstream text processing.

    Steps, in order: remove ``http...`` URLs, replace every character that is
    not an ASCII letter with a space, lowercase, split on whitespace, drop
    stop words, and re-join the remaining tokens with single spaces.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) is treated as empty.
        stop_words: Optional container of lowercase words to remove. When
            ``None`` (the default) NLTK's English stop-word list is used.
            Pass a ``set``/``frozenset`` for fast lookups.

    Returns:
        The cleaned, space-separated string; ``""`` if nothing remains.
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise TypeError inside re.sub.
        return ""

    if stop_words is None:
        # `is None` rather than truthiness, so an explicitly empty container
        # means "remove nothing".
        stop_words = _english_stopwords()

    text = _URL_PATTERN.sub("", text)
    text = _NON_LETTER_PATTERN.sub(" ", text)
    tokens = text.lower().split()
    return " ".join(word for word in tokens if word not in stop_words)

# Apply text cleaning to every row; the cleaned string is stored next to the
# raw "content" column.
df["clean_text"] = df["content"].apply(clean_text)

# Save preprocessed data. The target directory is created first because
# DataFrame.to_csv does not create missing parent directories.
output_path = Path("../data/preprocessed_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("Preprocessing complete! Data saved to preprocessed_data.csv")
df.head()