<a href="https://colab.research.google.com/github/madhura2024/fake_news_using_cnn/blob/main/fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load dataset
df = pd.read_csv("https://raw.githubusercontent.com/GeorgeMcIntire/fake_real_news_dataset/master/fake_and_real_news_dataset.csv")

def clean_text(text):
    text = re.sub(r"http\S+|www\S+|[^a-z ]", " ", text)  # remove links + non-letters
    text = re.sub(r"\s+", " ", text)                 # remove extra spaces
    return text

df = df.dropna().drop_duplicates()
df['text'] = df['text'].str.lower().str.strip()
df['clean_text'] = df['text'].apply(clean_text)

x = df['clean_text']
y = df['label'].map({'FAKE': 1, 'REAL': 0})  # convert labels to binary: FAKE=1, REAL=0
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)

# Computers don’t understand words, only numbers.
# Tokenizer turns each word into a number. But sentences have different lengths. Neural networks need same length.
# So we use padding to make them equal
# oov_token keeps track id of each new word
# all strings have to be same len once in numeric form so add 0 at the end

tok = Tokenizer(num_words=5000, oov_token="<OOV>")
tok.fit_on_texts(x_train)

x_train_seq = tok.texts_to_sequences(x_train)
x_test_seq = tok.texts_to_sequences(x_test)

maxlen = 20
x_train_pad = pad_sequences(x_train_seq, maxlen=maxlen, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=maxlen, padding='post')

# Sentences are sequences (word order matters).
# Normal dense layers can’t remember order.
# LSTM = a special RNN that solves the memory problem.
# throw away? remember data? conclude?

# Embedding Layer = turns words (numbers) into word vectors
# input_dim=10000 → we keep only 10,000 most common words
# output_dim=64 → each word becomes a 64-length vector
# squashes output to 0–1 probability. Close to 0 → “Real”, Close to 1 → “Fake”

# input_length=200 → each sentence will be 200 words long (after padding)
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=maxlen),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_train_pad, y_train, epochs=5, batch_size=32, validation_data=(x_test_pad, y_test))


Epoch 1/5




[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.6521 - loss: 0.6301 - val_accuracy: 0.8259 - val_loss: 0.4367
Epoch 2/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8841 - loss: 0.3112 - val_accuracy: 0.8009 - val_loss: 0.4203
Epoch 3/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9440 - loss: 0.1667 - val_accuracy: 0.8161 - val_loss: 0.4718
Epoch 4/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9779 - loss: 0.0742 - val_accuracy: 0.8085 - val_loss: 0.5980
Epoch 5/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9836 - loss: 0.0498 - val_accuracy: 0.7965 - val_loss: 0.6752


<keras.src.callbacks.history.History at 0x7dd429b6be00>

In [31]:
def predict_fake_news(text):
    cleaned = clean_text(text.lower().strip())
    seq = tok.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=maxlen, padding='post')
    prob = model.predict(pad)[0][0]
    return "FAKE" if prob > 0.5 else "REAL", prob

# Example usage
print(predict_fake_news("NASA has confirmed aliens landed in Nevada."))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step
('FAKE', np.float32(0.98850244))
