# Fake News Detection with FNN + TF-IDF + TruncatedSVD
This notebook trains a Feedforward Neural Network (FNN) on TF-IDF + TruncatedSVD features to classify news as fake or real.

In [None]:
# Imports
import os
import re
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Record the library versions this run used, for reproducibility.
print(f"pandas version: {pd.__version__}")
print(f"tensorflow version: {tf.__version__}")

In [None]:
# ---------- USER SETTINGS ----------
MAX_FEATURES = 5000     # vocabulary cap passed to TfidfVectorizer
SVD_COMPONENTS = 500    # requested number of TruncatedSVD components
BATCH_SIZE = 64         # mini-batch size for model.fit
EPOCHS = 30             # upper bound on epochs; early stopping may end training sooner
RANDOM_STATE = 42       # shared seed for the split, SVD and the neural network

# Seed the Python / NumPy / TensorFlow generators so weight initialisation,
# dropout and batch shuffling are repeatable across "Restart & Run All".
# NOTE(review): some GPU kernels can still be nondeterministic.
_set_random_seed = getattr(tf.keras.utils, "set_random_seed", None)
if _set_random_seed is not None:
    _set_random_seed(RANDOM_STATE)
else:
    # Older TensorFlow releases without keras.utils.set_random_seed.
    np.random.seed(RANDOM_STATE)
    tf.random.set_seed(RANDOM_STATE)

In [None]:
# ---------- Load Dataset ----------
# Location of the news CSV. Edit the default below or set the NEWS_CSV_PATH
# environment variable. Later cells need a 'label' column plus either a
# 'content' column or 'title' / 'text' columns.
DATA_PATH = os.environ.get("NEWS_CSV_PATH", "path/to/news.csv")

# A DataFrame named `df` that already exists in the kernel is reused (the
# previous behaviour). On a fresh kernel it is read from DATA_PATH, so that
# "Restart & Run All" no longer fails with a NameError.
if "df" not in globals():
    if not os.path.isfile(DATA_PATH):
        raise FileNotFoundError(
            f"Dataset not found at {DATA_PATH!r}. Set DATA_PATH (or the "
            "NEWS_CSV_PATH environment variable) to your news CSV file."
        )
    df = pd.read_csv(DATA_PATH)
display(df.head(3))

In [None]:
# ---------- Create content column ----------
# Build 'content' as "<title> <text>" unless the dataset already provides it.
# Either source column may be absent (it then contributes an empty string);
# missing cells become '' and non-string cells are converted with str().
if 'content' not in df.columns:
    if 'title' not in df.columns and 'text' not in df.columns:
        raise KeyError("DataFrame must have a 'content' column, or a 'title' and/or 'text' column.")
    blank_column = pd.Series('', index=df.index)
    title_part, text_part = (
        df[col].map(lambda value: '' if pd.isna(value) else str(value))
        if col in df.columns else blank_column
        for col in ('title', 'text')
    )
    # .str.strip() removes the joining space when one side is empty.
    df['content'] = (title_part + ' ' + text_part).str.strip()

# Fast text cleaning
# Small built-in stopword list, so no external corpus download is required.
fallback_stopwords = set(
    "the and is in to of a for on with as that this "
    "it by an be are from or at was were has have had "
    "not but they their i we you he she them his her".split()
)

def clean_text_fast(text):
    """Normalise one document into a space-separated string of tokens.

    Missing values (anything ``pd.isna`` reports as NA) yield ``""``.
    Otherwise the value is converted with ``str``, lower-cased, stripped of
    ``http...`` runs and of every character outside ``a-z``, ``0-9`` and
    whitespace, and split on whitespace. Tokens of one or two characters and
    tokens listed in ``fallback_stopwords`` are dropped.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # URLs go first, while their punctuation is still present to match on.
    for pattern in (r'http\S+', r'[^a-z0-9\s]'):
        lowered = re.sub(pattern, ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', lowered).strip()
    kept = (
        word for word in collapsed.split()
        if len(word) > 2 and word not in fallback_stopwords
    )
    return " ".join(kept)

# Clean every document, then preview raw vs. cleaned text side by side.
df['clean'] = df['content'].map(clean_text_fast)
display(df[['content', 'clean']].head(3))

In [None]:
# ---------- Ensure label is numeric ----------
if 'label' not in df.columns:
    raise KeyError("DataFrame must have a 'label' column with 0 (fake) and 1 (real).")

# Text labels recognised in addition to numeric 0 / 1.
LABEL_MAP = {'fake': 0, 'false': 0, 'satire': 0, 'real': 1, 'true': 1}

label_values = df['label']
# Any non-numeric column goes through the text mapping. This covers classic
# object columns as well as pandas string dtypes, which are not `object`.
if not pd.api.types.is_numeric_dtype(label_values):
    normalized = label_values.astype(object).map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    mapped = normalized.map(LABEL_MAP)
    # Keep unmapped entries (e.g. "1" or 0) so to_numeric can still parse them.
    label_values = mapped.where(mapped.notna(), normalized)

numeric_labels = pd.to_numeric(label_values, errors='coerce')
# Validate before touching df, so a failed run leaves the original labels intact.
if numeric_labels.isna().any():
    raise ValueError("Some labels are not numeric after conversion.")

# The model is a binary classifier, so anything other than 0 / 1 would be
# silently mis-trained (or truncated by a later int cast).
unexpected_labels = set(numeric_labels.unique()) - {0, 1}
if unexpected_labels:
    raise ValueError(f"Labels must be 0 (fake) or 1 (real); found: {sorted(unexpected_labels)}")

df['label'] = numeric_labels.astype(int)

print("Label distribution:\n", df['label'].value_counts())

In [None]:
# ---------- Train/Test split ----------
# Features are the cleaned documents; targets are the integer labels.
X = df['clean'].values
y = df['label'].astype(int).values

# Hold out 20% for testing, stratified on y and seeded with RANDOM_STATE.
split_options = dict(test_size=0.2, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train/Test sizes: {len(X_train)} {len(X_test)}")

In [None]:
# ---------- TF-IDF vectorization ----------
# Unigrams and bigrams, with the vocabulary capped at MAX_FEATURES terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

# The vectorizer is fitted on the training documents only; the test documents
# are only transformed.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print(f"TF-IDF shapes (sparse): {X_train_tfidf.shape} {X_test_tfidf.shape}")

In [None]:
# ---------- TruncatedSVD ----------
# TruncatedSVD needs fewer components than input features. On a small corpus
# the fitted vocabulary can be smaller than SVD_COMPONENTS, so clamp the
# request instead of failing inside fit_transform.
n_tfidf_features = X_train_tfidf.shape[1]
n_svd_components = SVD_COMPONENTS
if n_svd_components >= n_tfidf_features:
    n_svd_components = max(1, n_tfidf_features - 1)
    print(f"SVD_COMPONENTS={SVD_COMPONENTS} is not below the TF-IDF feature count "
          f"({n_tfidf_features}); using {n_svd_components} components instead.")

svd = TruncatedSVD(n_components=n_svd_components, random_state=RANDOM_STATE)
X_train_reduced = svd.fit_transform(X_train_tfidf)
X_test_reduced  = svd.transform(X_test_tfidf)
print("After SVD shapes:", X_train_reduced.shape, X_test_reduced.shape)

# Optional scaling: standardise each SVD component using statistics fitted on
# the training split only, then apply the same transform to the test split.
scaler = StandardScaler()
X_train_reduced = scaler.fit_transform(X_train_reduced)
X_test_reduced  = scaler.transform(X_test_reduced)

In [None]:
# ---------- Build FNN ----------
# Width of the network input: one value per column of the reduced training matrix.
input_dim = X_train_reduced.shape[-1]
def make_fnn(input_dim, hidden_units=(256, 64), dropout_rates=(0.4, 0.3), learning_rate=1e-4):
    """Build and compile a feed-forward binary classifier.

    The network is ``Input -> [Dense(relu) -> Dropout] * k -> Dense(1, sigmoid)``,
    compiled with Adam, binary cross-entropy loss and the accuracy metric.
    With the default arguments it is the original 256/64 architecture.

    Parameters
    ----------
    input_dim : int
        Number of features per sample.
    hidden_units : sequence of int
        Width of each hidden Dense layer, in order.
    dropout_rates : sequence of float
        Dropout rate applied after each hidden layer; must have the same
        length as ``hidden_units``.
    learning_rate : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    The compiled Sequential model.

    Raises
    ------
    ValueError
        If ``hidden_units`` and ``dropout_rates`` differ in length.
    """
    if len(hidden_units) != len(dropout_rates):
        raise ValueError("hidden_units and dropout_rates must have the same length.")

    stack = [layers.Input(shape=(input_dim,))]
    for units, rate in zip(hidden_units, dropout_rates):
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.Dropout(rate))
    # Single sigmoid unit: the output is read as the probability of class 1.
    stack.append(layers.Dense(1, activation='sigmoid'))

    model = models.Sequential(stack)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Instantiate the network for the reduced feature width and print its layer table.
model = make_fnn(input_dim=input_dim)
model.summary()

In [None]:
# ---------- Class weights ----------
# Key each 'balanced' weight by its actual class label. Positional keys from
# enumerate() are only correct when the labels happen to be exactly 0..k-1.
classes_present = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight('balanced', classes=classes_present, y=y_train)
# Plain int/float values keep the dict readable when printed.
cw = {int(label): float(weight) for label, weight in zip(classes_present, balanced_weights)}
print("Class weights:", cw)

In [None]:
# ---------- Train FNN ----------
# Stop once val_loss has gone 4 epochs without improving, and restore the
# best weights seen so far.
es = callbacks.EarlyStopping(restore_best_weights=True, patience=4, monitor='val_loss')

# A validation_split of 0.1 holds out a tenth of the training data for validation.
fit_options = {
    'validation_split': 0.1,
    'epochs': EPOCHS,
    'batch_size': BATCH_SIZE,
    'callbacks': [es],
    'class_weight': cw,
    'verbose': 2,
}
history = model.fit(X_train_reduced, y_train, **fit_options)

In [None]:
# ---------- Evaluate ----------
# Model outputs for the test rows, thresholded at 0.5 into 0/1 predictions.
y_prob = model.predict(X_test_reduced).ravel()
y_pred = (y_prob >= 0.5).astype(int)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred, digits=4)
print("Accuracy:", test_accuracy)
print("Classification report:\n", report_text)

# Confusion matrix heatmap: rows are true classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
class_names = ['fake', 'real']
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Training curves
# Left panel: loss per epoch. Right panel: accuracy per epoch.
# Each panel shows the training curve and the validation curve.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 4))
panels = (
    (loss_ax, 'Loss', (('loss', 'train loss'), ('val_loss', 'val loss'))),
    (acc_ax, 'Accuracy', (('accuracy', 'train acc'), ('val_accuracy', 'val acc'))),
)
for panel_ax, panel_title, curves in panels:
    for history_key, curve_label in curves:
        panel_ax.plot(history.history[history_key], label=curve_label)
    panel_ax.legend()
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
# ---------- Save artifacts ----------
# Write the trained network and the fitted preprocessing steps to ./models.
os.makedirs("models", exist_ok=True)
model.save("models/fnn_svd_model.h5")

# The three scikit-learn objects are serialised with joblib, in pipeline order.
sklearn_artifacts = (
    (tfidf, "models/tfidf_vectorizer.joblib"),
    (svd, "models/svd_truncated.joblib"),
    (scaler, "models/scaler.joblib"),
)
for fitted_step, target_path in sklearn_artifacts:
    joblib.dump(fitted_step, target_path)
print("Saved all model artifacts.")

In [None]:
# ---------- Quick inference helper ----------
def predict_text(text, threshold=0.5):
    """Classify a single raw news string with the fitted pipeline.

    The text is cleaned with ``clean_text_fast``, passed through the fitted
    ``tfidf``, ``svd`` and ``scaler`` objects in that order, and scored by
    ``model``.

    Returns a tuple ``(p, label)``: ``p`` is the model output for the text,
    and ``label`` is ``"real"`` when ``p >= threshold``, otherwise ``"fake"``.
    """
    features = [clean_text_fast(text)]
    # Same preprocessing order as used for the training features.
    for step in (tfidf, svd, scaler):
        features = step.transform(features)
    p = model.predict(features).ravel()[0]
    if p >= threshold:
        return p, "real"
    return p, "fake"

# Example
# Run one made-up headline through the helper to check the whole pipeline.
ex = "Breaking: government announces free electricity for all citizens"
example_result = predict_text(ex)
print("Example prediction:", example_result)