In [1]:
pip install pandas numpy tensorflow scikit-learn matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


2025-06-25 23:04:44.592888: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Read each source file and tag every row with its class in one step.
fake = pd.read_csv("Fake.csv").assign(label='FAKE')
true = pd.read_csv("True.csv").assign(label='REAL')

# Stack both sets under a fresh 0..n-1 index and keep only the columns used below.
df = pd.concat([fake, true], ignore_index=True)[['title', 'text', 'label']]


In [4]:
def clean_text(text):
    """Normalize raw article text before tokenization.

    Steps, in order: lowercase the text; remove URLs (any run of
    non-whitespace characters starting with "http" or "www"); remove
    @mentions and '#' signs; finally drop every remaining character that is
    neither a word character nor whitespace.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned, lowercased text. Whitespace is not collapsed, so a
        removed token can leave consecutive spaces behind.
    """
    text = text.lower()
    # `http\S+` already matches https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    # `\w+` (not a literal `w+`) so the whole handle following '@' is removed.
    text = re.sub(r'@\w+|#', '', text)
    # Strip whatever punctuation/symbols are left.
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Coerce every entry to str first (missing values would otherwise be floats), then clean element-wise.
df['text'] = df['text'].astype(str).map(clean_text)


In [5]:
# Targets: encode the string labels as integers.
# LabelEncoder orders classes alphabetically, so 'FAKE' -> 0 and 'REAL' -> 1.
le = LabelEncoder()
y = le.fit_transform(df['label'].values)

# Features: map each article to a sequence of word indices, keeping only the
# 5000 most frequent words, then pad/truncate every sequence to `maxlen` tokens.
maxlen = 500
X = df['text'].values

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=maxlen)


In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences for testing. Stratifying on the labels
# keeps the FAKE/REAL ratio the same in both parts; the fixed seed makes the
# split repeatable.
X_train_padded, X_test_padded, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [10]:
# Stacked bidirectional-LSTM binary classifier over padded word-index sequences.
model = Sequential([
    # Declare the input shape explicitly instead of passing the deprecated
    # `input_length` argument to Embedding; this also builds the model so
    # that `summary()` can report layer shapes and parameter counts.
    Input(shape=(maxlen,)),
    # The embedding table must cover every index the tokenizer can emit, so
    # its size is taken from the tokenizer rather than repeated as a literal.
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    # First recurrent layer returns the full sequence so a second one can be stacked on it.
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    # Single sigmoid unit: the output is the score for the class encoded as 1.
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [25]:
from sklearn.utils import class_weight

# Build a {class_index: weight} dict of 'balanced' weights from the training
# labels. The dict keys are positions in the sorted unique-label array.
class_weights = dict(
    enumerate(
        class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(y_train),
            y=y_train,
        )
    )
)


In [29]:
# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train model and keep the returned History object so the per-epoch metrics
# can be inspected afterwards (e.g. `history.history['val_loss']`).
# NOTE: validation metrics are computed on the test split
# (X_test_padded, y_test), so they are not an independent estimate.
history = model.fit(
    X_train_padded, y_train,
    validation_data=(X_test_padded, y_test),
    epochs=5,
    batch_size=64,
    class_weight=class_weights  # {class_index: weight} dict computed from y_train
)


Epoch 1/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 559ms/step - accuracy: 0.9957 - loss: 0.0164 - val_accuracy: 0.9973 - val_loss: 0.0092
Epoch 2/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 618ms/step - accuracy: 0.9981 - loss: 0.0072 - val_accuracy: 0.9980 - val_loss: 0.0088
Epoch 3/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m366s[0m 651ms/step - accuracy: 0.9996 - loss: 0.0019 - val_accuracy: 0.9981 - val_loss: 0.0076
Epoch 4/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 656ms/step - accuracy: 0.9994 - loss: 0.0028 - val_accuracy: 0.9984 - val_loss: 0.0072
Epoch 5/5
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 661ms/step - accuracy: 0.9995 - loss: 0.0021 - val_accuracy: 0.9981 - val_loss: 0.0087


<keras.src.callbacks.history.History at 0x159cf91f0>

In [30]:
# Score the model on the held-out split. The split cell names the test
# features `X_test_padded`; no variable called `X_test` is defined in this
# notebook, so using it fails on a fresh kernel.
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 88ms/step - accuracy: 0.9980 - loss: 0.0076
Test Accuracy: 99.81%


In [31]:
def predict_fake_news(text):
    """Classify one piece of text with the trained model.

    The text goes through the same pipeline as the training data
    (`clean_text`, the fitted `tokenizer`, padding to `maxlen`).

    Args:
        text: Raw text to classify.

    Returns:
        "REAL" when the model's sigmoid output is above 0.5, otherwise "FAKE".
    """
    batch = pad_sequences(
        tokenizer.texts_to_sequences([clean_text(text)]),
        maxlen=maxlen,
    )
    # predict() returns one row per input and one column per output unit.
    score = model.predict(batch)[0][0]
    if score > 0.5:
        return "REAL"
    return "FAKE"

# Examples:
for headline in (
    "NASA discovers a habitable planet in the Andromeda galaxy.",
    "The world will end next week, scientists claim.",
):
    print(predict_fake_news(headline))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 617ms/step
FAKE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
FAKE


In [32]:
# Persist the trained model to a single file in the working directory.
model.save(filepath="lstm_fake_news_model.keras")


In [33]:
from tensorflow.keras.models import load_model
# Reload the saved model for inference only; compile=False skips restoring
# the loss/optimizer configuration.
model = load_model(
    filepath="lstm_fake_news_model.keras",
    compile=False,
)


In [34]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Reuse the sequence length chosen when the training data was padded, rather
# than repeating the literal, so inference padding cannot drift from it.
max_len = maxlen

# Define the prediction function
def predict_news(news_text):
    """Classify one piece of text and print the label with the raw score.

    The text is cleaned with `clean_text`, converted to word indices with the
    fitted `tokenizer`, and padded to `max_len` before being scored.

    Args:
        news_text: Raw text to classify.

    Returns:
        None. The result is printed as "REAL" (score above 0.5) or "FAKE",
        followed by the model output rounded to two decimals.
    """
    cleaned = clean_text(news_text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)
    # predict() returns one row per input and one column per output unit.
    prediction = model.predict(padded)[0][0]
    label = "REAL" if prediction > 0.5 else "FAKE"
    print(f"🧠 Prediction: {label} ({prediction:.2f})")


In [35]:
# Try the reloaded model on a single headline.
predict_news(news_text="NASA discovers new exoplanet that may support life.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 551ms/step
🧠 Prediction: FAKE (0.00)


In [36]:
import pickle

# Serialize the fitted tokenizer so the same word index can be reused at
# inference time. Requires `tokenizer` from the tokenization cell.
# Only unpickle this file from a trusted source: loading a pickle can run arbitrary code.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer))

print("✅ tokenizer.pkl saved")


✅ tokenizer.pkl saved
