In [None]:
import csv
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Fetch the NLTK data this notebook relies on: 'punkt_tab' backs word_tokenize,
# 'stopwords' backs the stop-word list built below.
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)

# Materialise the English stop words as a set for fast membership tests
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load both CSVs with identical parser settings. The quoting constant comes from
# the stdlib `csv` module, which is imported in the notebook's import cell.
# NOTE(review): on_bad_lines='skip' drops malformed rows without reporting them;
# compare row counts against the raw files if full coverage matters.
read_options = dict(quoting=csv.QUOTE_NONNUMERIC, on_bad_lines='skip')
fake_df = pd.read_csv('/content/Fake.csv', **read_options)
true_df = pd.read_csv('/content/True.csv', **read_options)

# Fail fast if the columns consumed by the preprocessing cells are absent,
# instead of surfacing a KeyError several cells later.
for source_name, frame in (('Fake.csv', fake_df), ('True.csv', true_df)):
    missing_columns = {'title', 'text'} - set(frame.columns)
    if missing_columns:
        raise ValueError(
            f'{source_name} is missing expected columns: {sorted(missing_columns)}'
        )

In [None]:
# Attach the class id to each source frame: 0 for Fake.csv rows, 1 for True.csv rows
fake_df = fake_df.assign(label=0)
true_df = true_df.assign(label=1)

# Stack the two frames into one dataset with a fresh 0..n-1 index
df = pd.concat(
    [fake_df, true_df],
    ignore_index=True,
)

In [None]:
# Text preprocessing function
def preprocess_text(text):
    """Normalize one raw string into space-separated, stopword-free tokens.

    Args:
        text: Raw title or article body. Any value that is not a ``str``
            (for example the float NaN pandas produces for an empty CSV
            field, or ``None``) is treated as empty text.

    Returns:
        str: The lowercased input with every character other than ASCII
        letters and whitespace deleted, tokenized with ``word_tokenize``,
        filtered against the module-level ``stop_words`` set and re-joined
        with single spaces. ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN through Series.apply; calling .lower()
    # on them would raise AttributeError and abort the whole column.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    lowered = text.lower()
    # Delete (not space-replace) digits, punctuation and other special characters
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # Tokenize, drop stop words, and join the survivors back into one string
    tokens = word_tokenize(letters_only)
    return ' '.join(word for word in tokens if word not in stop_words)

In [None]:
# Clean the headline and the body element-wise with preprocess_text,
# overwriting the raw columns in place
df['title'] = df['title'].map(preprocess_text)
df['text'] = df['text'].map(preprocess_text)

In [None]:
# Model input: cleaned title followed by cleaned body, separated by one space
df['combined_text'] = df['title'].str.cat(df['text'], sep=' ')

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the embedding layer
embedding_dim = 100  # width of each embedding vector
max_len = 200        # sequence length after padding/truncation
max_words = 10000    # vocabulary cap handed to Tokenizer(num_words=...)

# Fit the vocabulary on the combined text, then encode every row as a list of word indices.
# NOTE(review): the tokenizer is fitted on the full dataset, before the train/test split
# performed in a later cell.
corpus = df['combined_text']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

In [None]:
# Bring every sequence to exactly max_len entries. 'pre' is the Keras default for both
# options: short sequences are zero-padded at the front, long ones lose their leading tokens.
# NOTE(review): because combined_text starts with the title, front truncation discards the
# title of any article longer than max_len tokens.
X = pad_sequences(sequences, maxlen=max_len, padding='pre', truncating='pre')

# Target vector as a NumPy array
y = df['label'].to_numpy()

In [None]:
# Hold out 20% of the samples for the final evaluation; the fixed random_state
# keeps the partition identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build LSTM model
# Seed the Python, NumPy and TensorFlow random generators before any weights are
# created so initialisation, dropout and shuffling start from the same state on
# every Restart & Run All. (Bit-exact results on GPU may additionally require
# TensorFlow's op-determinism setting.)
set_random_seed(42)

model = Sequential([
    # Map each word index (< max_words) to a trainable embedding_dim-sized vector
    Embedding(max_words, embedding_dim, input_length=max_len),
    # First recurrent layer returns the full sequence so the next LSTM can consume it
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    # Second recurrent layer returns only its final output
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    # Single sigmoid unit: output in (0, 1), matching the 0/1 labels
    Dense(1, activation='sigmoid')
])



In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss for the single
# sigmoid output, and accuracy reported alongside the loss
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Stop once validation loss has failed to improve for 3 consecutive epochs and
# roll the model back to the weights of its best epoch
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Fit for at most 10 epochs in batches of 64, reserving 20% of the training
# data for validation so the early-stopping callback has a val_loss to monitor
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - accuracy: 0.9090 - loss: 0.2102 - val_accuracy: 0.9876 - val_loss: 0.0408
Epoch 2/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.9802 - loss: 0.0576 - val_accuracy: 0.9852 - val_loss: 0.0517
Epoch 3/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 24ms/step - accuracy: 0.9931 - loss: 0.0240 - val_accuracy: 0.9942 - val_loss: 0.0191
Epoch 4/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 24ms/step - accuracy: 0.9938 - loss: 0.0196 - val_accuracy: 0.9918 - val_loss: 0.0287
Epoch 5/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.9970 - loss: 0.0106 - val_accuracy: 0.9936 - val_loss: 0.0209
Epoch 6/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - accuracy: 0.9953 - loss: 0.0137 - val_accuracy: 0.9947 - val_loss: 0.0216


In [None]:
# Score the trained model on the held-out test split and report its accuracy
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.9917 - loss: 0.0233
Test Accuracy: 0.9935
