# LSTM/RNN Model for Fake News Classification

This notebook implements Long Short-Term Memory (LSTM) and other RNN architectures for fake news classification.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, GRU, SimpleRNN, Bidirectional
from tensorflow.keras.layers import Dense, Dropout, Input, GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import SpatialDropout1D, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import pickle
import warnings
warnings.filterwarnings('ignore')

# Seed TensorFlow and NumPy with the same fixed value so reruns are repeatable
tf.random.set_seed(42)
np.random.seed(42)

## Load Data and Tokenizer

In [None]:
# Read the combined news dataset
df = pd.read_csv('../data/combined_news_dataset.csv')
print(f"Dataset shape: {df.shape}")

# Reuse the tokenizer and settings pickled by the CNN notebook when both files
# exist; if either file is missing, fit a fresh tokenizer on this dataset.
# NOTE(review): pickle.load can execute code embedded in the file, so these
# artifacts must come from a trusted source.
try:
    with open('../models/cnn_tokenizer.pickle', 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)

    with open('../models/cnn_config.pickle', 'rb') as config_file:
        config = pickle.load(config_file)

except FileNotFoundError:
    print("Creating new tokenizer...")

    # Fallback configuration
    MAX_VOCAB_SIZE, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM = 20000, 500, 100

    import re

    def light_preprocess(text):
        """Lower-case text and strip URLs, e-mail-like tokens and non-letters."""
        if pd.isna(text):
            return ''
        lowered = text.lower()
        without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
        without_emails = re.sub(r'\S+@\S+', '', without_urls)
        letters_only = re.sub(r'[^a-zA-Z\s]', '', without_emails)
        return ' '.join(letters_only.split())

    # Clean "title text" per row, then drop rows whose cleaned text is empty
    merged_text = df['title'] + ' ' + df['text']
    df['cnn_text'] = merged_text.apply(light_preprocess)
    non_empty = df['cnn_text'] != ''
    df = df[non_empty].reset_index(drop=True)

    # Fit the tokenizer on the cleaned text
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
    tokenizer.fit_on_texts(df['cnn_text'].values)

else:
    # Both pickles were read: adopt the stored settings
    MAX_VOCAB_SIZE = config['max_vocab_size']
    MAX_SEQUENCE_LENGTH = config['max_sequence_length']
    EMBEDDING_DIM = config['embedding_dim']

    print("Loaded existing tokenizer and configuration")

print(f"Vocabulary size: {MAX_VOCAB_SIZE}")
print(f"Max sequence length: {MAX_SEQUENCE_LENGTH}")
print(f"Embedding dimension: {EMBEDDING_DIM}")

## Prepare Data

In [None]:
# Light preprocessing function (redefined here so it exists even when the tokenizer was loaded from disk)
import re
def light_preprocess(text):
    """Normalise raw text before tokenisation.

    Missing values (anything ``pd.isna`` reports as missing) become ``''``.
    Otherwise the text is lower-cased, then URL-like tokens, e-mail-like
    tokens and every character that is neither an ASCII letter nor
    whitespace are removed, and runs of whitespace collapse to one space.

    Args:
        text: A string, or a missing value such as ``None`` / ``NaN``.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isna(text):
        return ''

    # Applied in order: URLs, e-mail-like tokens, then any non-letter characters
    removal_steps = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),
        (r'\S+@\S+', 0),
        (r'[^a-zA-Z\s]', 0),
    )

    cleaned = text.lower()
    for pattern, pattern_flags in removal_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=pattern_flags)

    return ' '.join(cleaned.split())

# Build the cleaned text column unless an earlier cell already added it
if 'cnn_text' not in df.columns:
    raw_text = df['title'] + ' ' + df['text']
    df['cnn_text'] = raw_text.apply(light_preprocess)
    # Rows whose cleaned text is empty are dropped and the index renumbered
    df = df[df['cnn_text'] != ''].reset_index(drop=True)

# Pull the model inputs and targets out as arrays
texts, labels = df['cnn_text'].values, df['label'].values

# Map each text to token ids, then pad/truncate at the end to a fixed length
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)
y = labels

print(f"Data shape: {X.shape}")
print(f"Labels shape: {y.shape}")
# np.bincount needs non-negative integer labels
print(f"Label distribution: {np.bincount(y)}")

## Train-Test Split

In [None]:
# Two stratified splits with the same settings: first hold out 20% as the
# test set, then hold out 20% of the remainder as the validation set.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

## Model Architectures

### 1. Simple LSTM Model

In [None]:
def create_simple_lstm_model(vocab_size, embedding_dim, max_length):
    """Build an uncompiled single-layer LSTM binary classifier.

    Layer order: embedding, spatial dropout (0.2), one 100-unit LSTM,
    a 50-unit ReLU dense layer, dropout (0.5) and a single sigmoid unit.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_length: Passed to the embedding layer as ``input_length``.

    Returns:
        The assembled ``Sequential`` model.
    """
    # NOTE(review): a non-zero recurrent_dropout is believed to rule out the
    # fused cuDNN LSTM kernel, which would slow GPU training - confirm against
    # the Keras docs for the TF version in use.
    network = Sequential()
    network.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    network.add(SpatialDropout1D(0.2))
    network.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    network.add(Dense(50, activation='relu'))
    network.add(Dropout(0.5))
    network.add(Dense(1, activation='sigmoid'))
    return network

# Instantiate the simple LSTM and attach optimizer, loss and metric
simple_lstm = create_simple_lstm_model(
    vocab_size=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    max_length=MAX_SEQUENCE_LENGTH,
)
simple_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("Simple LSTM Model:")
simple_lstm.summary()

### 2. Bidirectional LSTM Model

In [None]:
def create_bidirectional_lstm_model(vocab_size, embedding_dim, max_length):
    """Build an uncompiled bidirectional-LSTM binary classifier.

    Same layout as the simple LSTM model except that the 100-unit LSTM is
    wrapped in ``Bidirectional``.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_length: Passed to the embedding layer as ``input_length``.

    Returns:
        The assembled ``Sequential`` model.
    """
    recurrent_core = Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

    network = Sequential()
    network.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    network.add(SpatialDropout1D(0.2))
    network.add(recurrent_core)
    network.add(Dense(50, activation='relu'))
    network.add(Dropout(0.5))
    network.add(Dense(1, activation='sigmoid'))
    return network

# Instantiate the bidirectional LSTM and attach optimizer, loss and metric
bidirectional_lstm = create_bidirectional_lstm_model(
    vocab_size=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    max_length=MAX_SEQUENCE_LENGTH,
)
bidirectional_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\nBidirectional LSTM Model:")
bidirectional_lstm.summary()

### 3. Stacked LSTM Model

In [None]:
def create_stacked_lstm_model(vocab_size, embedding_dim, max_length):
    """Build an uncompiled two-layer (stacked) LSTM binary classifier.

    The first LSTM returns its full output sequence so the second LSTM can
    consume it; batch normalisation sits between the two.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_length: Passed to the embedding layer as ``input_length``.

    Returns:
        The assembled ``Sequential`` model.
    """
    input_layers = [
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        SpatialDropout1D(0.2),
    ]
    recurrent_layers = [
        # return_sequences=True keeps per-timestep outputs for the next LSTM
        LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
        BatchNormalization(),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    ]
    classifier_layers = [
        Dense(50, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(input_layers + recurrent_layers + classifier_layers)

# Instantiate the stacked LSTM and attach optimizer, loss and metric
stacked_lstm = create_stacked_lstm_model(
    vocab_size=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    max_length=MAX_SEQUENCE_LENGTH,
)
stacked_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\nStacked LSTM Model:")
stacked_lstm.summary()

### 4. GRU Model

In [None]:
def create_gru_model(vocab_size, embedding_dim, max_length):
    """Build an uncompiled bidirectional-GRU binary classifier.

    Mirrors the bidirectional LSTM model, with a 100-unit GRU as the
    recurrent cell inside the ``Bidirectional`` wrapper.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_length: Passed to the embedding layer as ``input_length``.

    Returns:
        The assembled ``Sequential`` model.
    """
    recurrent_core = Bidirectional(GRU(100, dropout=0.2, recurrent_dropout=0.2))

    network = Sequential()
    network.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    network.add(SpatialDropout1D(0.2))
    network.add(recurrent_core)
    network.add(Dense(50, activation='relu'))
    network.add(Dropout(0.5))
    network.add(Dense(1, activation='sigmoid'))
    return network

# Instantiate the GRU model and attach optimizer, loss and metric
gru_model = create_gru_model(
    vocab_size=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    max_length=MAX_SEQUENCE_LENGTH,
)
gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\nGRU Model:")
gru_model.summary()

## Training Setup

In [None]:
# Mini-batch size and the upper bound on epochs shared by every model
BATCH_SIZE, EPOCHS = 32, 10

# Callbacks
def get_callbacks(model_name):
    """Return the list of Keras callbacks used when fitting one model.

    Args:
        model_name: File-name-safe identifier; it is interpolated into the
            checkpoint path ``../models/best_<model_name>.h5``.

    Returns:
        ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]`` in that order.
    """
    # Stop after 3 epochs without val_loss improvement, restoring best weights
    stop_early = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1,
    )
    # Multiply the learning rate by 0.2 after 2 stagnant epochs, floor 1e-4
    shrink_learning_rate = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=2,
        min_lr=0.0001,
        verbose=1,
    )
    # Keep only the checkpoint with the best validation accuracy
    keep_best = ModelCheckpoint(
        f'../models/best_{model_name}.h5',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1,
    )
    return [stop_early, shrink_learning_rate, keep_best]

## Train Models

In [None]:
# Compiled models keyed by display name
models = {
    'Simple LSTM': simple_lstm,
    'Bidirectional LSTM': bidirectional_lstm,
    'Stacked LSTM': stacked_lstm,
    'GRU': gru_model
}

# Per-model Keras History objects and test-set scores
histories, results = {}, {}

# Fit every model in turn, then score it on the held-out test set
for name, model in models.items():
    print(f"\nTraining {name}...")
    print("=" * 50)

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        # The display name is lower-cased and spaces become underscores so it
        # can be used in the checkpoint file name
        callbacks=get_callbacks(name.lower().replace(' ', '_')),
        verbose=1,
    )
    histories[name] = history

    # evaluate() returns [loss, accuracy] because 'accuracy' is the only metric
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    results[name] = dict(loss=test_loss, accuracy=test_accuracy)

    print(f"Test Accuracy for {name}: {test_accuracy:.4f}")

print("\nAll models trained successfully!")

## Training History Visualization

In [None]:
# Plot training histories for all models: accuracy on the top row, loss on the
# bottom row, one column per trained model. The grid is sized from the number
# of histories instead of assuming exactly four models.
n_columns = max(len(histories), 1)
fig, axes = plt.subplots(
    2, n_columns,
    figsize=(5 * n_columns, 8),
    squeeze=False,  # always a 2-D axes array, even with a single column
)

# (history key, axis/legend wording) for each row of the grid
panel_rows = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

for idx, (name, history) in enumerate(histories.items()):
    for row, (metric, wording) in enumerate(panel_rows):
        ax = axes[row, idx]
        ax.plot(history.history[metric], label=f'Training {wording}')
        ax.plot(history.history[f'val_{metric}'], label=f'Validation {wording}')
        ax.set_title(f'{name} - {wording}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(wording)
        ax.legend()
        ax.grid(True)

plt.tight_layout()
plt.show()

## Model Comparison

In [None]:
# Create comparison dataframe (one row per model, best test accuracy first)
comparison_df = pd.DataFrame({
    'Model': list(results),
    'Test_Accuracy': [scores['accuracy'] for scores in results.values()],
    'Test_Loss': [scores['loss'] for scores in results.values()],
})

comparison_df = comparison_df.sort_values('Test_Accuracy', ascending=False)

print("Model Performance Comparison:")
print(comparison_df.to_string(index=False))

# Plot comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Accuracy comparison
sns.barplot(data=comparison_df, x='Model', y='Test_Accuracy', ax=ax1)
ax1.set_title('Test Accuracy Comparison')
ax1.set_ylabel('Accuracy')
ax1.tick_params(axis='x', rotation=45)

# Zoom in on 0.8-1.0 when every model clears 0.8; otherwise lower the floor so
# a weaker model's bar is still visible instead of falling below the axis.
lowest_accuracy = comparison_df['Test_Accuracy'].min()
accuracy_floor = 0.8 if lowest_accuracy >= 0.8 else max(0.0, lowest_accuracy - 0.05)
ax1.set_ylim(accuracy_floor, 1.0)

# Bars are drawn in dataframe order, so position i matches the i-th row
for i, v in enumerate(comparison_df['Test_Accuracy']):
    ax1.text(i, v + 0.005, f'{v:.3f}', ha='center')

# Loss comparison
sns.barplot(data=comparison_df, x='Model', y='Test_Loss', ax=ax2)
ax2.set_title('Test Loss Comparison')
ax2.set_ylabel('Loss')
ax2.tick_params(axis='x', rotation=45)

for i, v in enumerate(comparison_df['Test_Loss']):
    ax2.text(i, v + 0.01, f'{v:.3f}', ha='center')

plt.tight_layout()
plt.show()

## Detailed Evaluation of Best Model

In [None]:
# Find the best model (first row of comparison_df)
# NOTE(review): the ranking column read here is Test_Accuracy, so the model is
# selected on the same test set it is then reported on - the numbers below are
# optimistic. Consider ranking on validation accuracy instead.
best_model_name = comparison_df.iloc[0]['Model']
best_model = models[best_model_name]

print(f"Best performing model: {best_model_name}")
print(f"Test Accuracy: {comparison_df.iloc[0]['Test_Accuracy']:.4f}")

# Generate predictions: sigmoid outputs above 0.5 count as class 1 ("True")
y_pred_proba = best_model.predict(X_test, verbose=0)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

# Classification report
print(f"\nClassification Report for {best_model_name}:")
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['Fake', 'True']))

# Confusion matrix; labels=[0, 1] guarantees a 2x2 matrix even if one class
# is absent from both y_test and y_pred, so the 4-way unpack below is safe
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=['Fake', 'True'], yticklabels=['Fake', 'True'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

# Additional metrics for the positive ("True") class. Zero denominators (for
# example, a model that never predicts class 1) are reported as 0.0 rather
# than silently turning into NaN.
tn, fp, fn, tp = cm.ravel()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0

print(f"\nDetailed Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")
print(f"True Positives: {tp}, False Positives: {fp}")
print(f"True Negatives: {tn}, False Negatives: {fn}")

## Prediction Confidence Analysis

In [None]:
# Analyze prediction confidence
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Histogram of prediction probabilities, split by the true class
ax1.hist(y_pred_proba[y_test == 0], bins=50, alpha=0.7, label='Fake News', density=True)
ax1.hist(y_pred_proba[y_test == 1], bins=50, alpha=0.7, label='True News', density=True)
ax1.axvline(x=0.5, color='red', linestyle='--', label='Decision Threshold')
ax1.set_xlabel('Predicted Probability')
ax1.set_ylabel('Density')
ax1.set_title('Distribution of Prediction Probabilities')
ax1.legend()

# Confidence vs accuracy: confidence is the probability of the predicted
# class, so it always lies in [0.5, 1.0]
confidence = np.maximum(y_pred_proba.flatten(), 1 - y_pred_proba.flatten())
correct = (y_pred == y_test).astype(int)

# Bin by confidence and calculate accuracy
n_bins = 10
bin_boundaries = np.linspace(0.5, 1.0, n_bins + 1)
bin_lowers = bin_boundaries[:-1]
bin_uppers = bin_boundaries[1:]
bin_centers = (bin_lowers + bin_uppers) / 2
bin_width = bin_uppers[0] - bin_lowers[0]

# Bins are (lower, upper]; digitizing against the inner edges with right=True
# also places a confidence of exactly 0.5 in the first bin instead of in none
bin_ids = np.digitize(confidence, bin_boundaries[1:-1], right=True)

accuracies = []
confidences = []
counts = []
bin_numbers = []  # 1-based number of each non-empty bin

for bin_id in range(n_bins):
    in_bin = bin_ids == bin_id
    count_in_bin = in_bin.sum()
    if count_in_bin > 0:
        accuracies.append(correct[in_bin].mean())
        confidences.append(confidence[in_bin].mean())
        counts.append(count_in_bin)
        bin_numbers.append(bin_id + 1)

# Draw everything on a real confidence axis so that the accuracy == confidence
# diagonal is a meaningful "perfect calibration" reference
occupied_centers = bin_centers[np.array(bin_numbers, dtype=int) - 1]
ax2.bar(occupied_centers, accuracies, width=0.9 * bin_width, alpha=0.7, label='Accuracy')
ax2.plot(occupied_centers, confidences, 'ro-', label='Confidence')
ax2.plot([0.5, 1.0], [0.5, 1.0], 'k--', alpha=0.5, label='Perfect Calibration')
ax2.set_xlabel('Confidence Bin')
ax2.set_ylabel('Accuracy / Confidence')
ax2.set_title('Model Calibration')
ax2.legend()

plt.tight_layout()
plt.show()

print(f"\nCalibration Analysis:")
# Report the true bin number, so skipped (empty) bins do not shift the labels
for bin_number, acc, conf, count in zip(bin_numbers, accuracies, confidences, counts):
    print(f"Bin {bin_number}: Confidence={conf:.3f}, Accuracy={acc:.3f}, Count={count}")

## Sample Predictions

In [None]:
# Function to make predictions on new text
def predict_with_lstm(model, tokenizer, text, max_length=MAX_SEQUENCE_LENGTH):
    """Classify one raw text as true or fake news.

    Args:
        model: Object whose ``predict`` returns a 2-D array of scores.
        tokenizer: Object providing ``texts_to_sequences``.
        text: Raw text; it is cleaned with ``light_preprocess`` first.
        max_length: Length the token sequence is padded/truncated to.

    Returns:
        ``(label, score)`` where ``score`` is the model output for this text
        and ``label`` is ``"True News"`` when ``score > 0.5``, otherwise
        ``"Fake News"``.
    """
    cleaned = light_preprocess(text)

    # Single-item batch: tokenize, then pad/truncate at the end
    token_ids = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(token_ids, maxlen=max_length, padding='post', truncating='post')

    # First (only) row, first (only) output unit
    prediction_prob = model.predict(model_input, verbose=0)[0][0]

    if prediction_prob > 0.5:
        return "True News", prediction_prob
    return "Fake News", prediction_prob

# Test on some examples
# X_test is a shuffled, stratified subset of X, so a position in X_test is NOT
# a row position in df. Re-running the first split on row numbers (same
# test_size, seed and stratify labels) yields the df row behind each X_test row.
_, test_row_positions = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=y
)
# Guard against the split cell having been changed or run on different data
if not np.array_equal(X[test_row_positions], X_test):
    raise RuntimeError("Could not realign X_test with df rows; re-run the split cell.")

# Draw up to 5 distinct test positions
test_indices = np.random.choice(len(X_test), min(5, len(X_test)), replace=False)

print(f"Sample Predictions using {best_model_name}:")
print("=" * 80)

for i, idx in enumerate(test_indices):
    # Look up the article that actually produced X_test[idx] / y_test[idx]
    source_row = df.iloc[test_row_positions[idx]]
    original_text = source_row['title'] + ' ' + source_row['text'][:300]
    true_label = "True News" if y_test[idx] == 1 else "Fake News"

    # Make prediction (on the title plus the first 300 characters only)
    prediction, prob = predict_with_lstm(best_model, tokenizer, original_text)
    # prob is the score for "True News"; show the probability of the class
    # that was actually predicted
    shown_confidence = prob if prediction == "True News" else 1 - prob

    print(f"\nExample {i+1}:")
    print(f"Text: {original_text[:250]}...")
    print(f"True Label: {true_label}")
    print(f"Predicted: {prediction} (Confidence: {shown_confidence:.3f})")
    status = "✓" if prediction == true_label else "✗"
    print(f"Correct: {status}")
    print("-" * 80)

## Save Best Model

In [None]:
import os

# Make sure the output folders exist; writing the results CSV (and saving the
# models) fails if the target directory is missing
for output_dir in ('../models', '../results'):
    os.makedirs(output_dir, exist_ok=True)

# Save the best model
best_model.save('../models/lstm_best_model.h5')

# Save all models under a lower-case, underscore-separated file name
for name, model in models.items():
    safe_name = name.lower().replace(' ', '_')
    model.save(f'../models/lstm_{safe_name}.h5')

# Save the results (one row per model: name, test accuracy, test loss)
results_df = pd.DataFrame([
    {'model': name, 'accuracy': scores['accuracy'], 'loss': scores['loss']}
    for name, scores in results.items()
])
results_df.to_csv('../results/lstm_model_comparison.csv', index=False)

print(f"\nBest model ({best_model_name}) saved as: ../models/lstm_best_model.h5")
print("All models and results saved successfully!")

## Model Complexity Analysis

In [None]:
# Parameter count of every model, keyed by display name
model_params = {name: model.count_params() for name, model in models.items()}

# One row per model: size plus the test scores recorded during training
summary_df = pd.DataFrame({
    'Model': list(model_params),
    'Parameters': list(model_params.values()),
    'Test_Accuracy': [results[name]['accuracy'] for name in model_params],
    'Test_Loss': [results[name]['loss'] for name in model_params],
})

summary_df = summary_df.sort_values('Test_Accuracy', ascending=False)
# Express the parameter count in millions for readability
summary_df['Parameters_M'] = summary_df['Parameters'] / 1e6

print("Model Complexity vs Performance:")
report_columns = ['Model', 'Parameters_M', 'Test_Accuracy', 'Test_Loss']
print(summary_df[report_columns].to_string(index=False))

# Scatter of model size against test accuracy, each point labelled by model
plt.figure(figsize=(10, 6))
sns.scatterplot(data=summary_df, x='Parameters_M', y='Test_Accuracy', s=100)

for _, row in summary_df.iterrows():
    point = (row['Parameters_M'], row['Test_Accuracy'])
    plt.annotate(row['Model'], point, xytext=(5, 5), textcoords='offset points')

plt.xlabel('Model Parameters (Millions)')
plt.ylabel('Test Accuracy')
plt.title('Model Complexity vs Performance')
plt.grid(True, alpha=0.3)
plt.show()

## Summary

### RNN/LSTM Model Results:
- **Simple LSTM**: Basic LSTM architecture
- **Bidirectional LSTM**: Processes sequences in both directions
- **Stacked LSTM**: Multiple LSTM layers for deeper feature extraction
- **GRU**: Bidirectional GRU; a faster alternative to LSTM with similar performance

### Key Findings:
1. LSTM models can capture sequential patterns and long-range dependencies in text
2. Bidirectional processing often improves performance by considering future context
3. Stacked architectures may provide deeper understanding but risk overfitting
4. GRU models are faster to train while maintaining competitive performance
5. Proper regularization (dropout, early stopping) is crucial for RNNs

### Advantages of LSTM approach:
- Excellent at capturing sequential dependencies
- Can handle variable-length inputs naturally
- Bidirectional processing captures full context
- Good performance on text classification tasks

### Disadvantages:
- Slower training compared to CNNs
- Sequential processing prevents parallelization across time steps
- Prone to overfitting on smaller datasets
- Memory intensive for long sequences

### Next Steps:
1. Experiment with attention mechanisms
2. Try pre-trained embeddings (Word2Vec, GloVe, FastText)
3. Implement transformer-based models (BERT, RoBERTa)
4. Ensemble multiple models for improved performance
5. Fine-tune hyperparameters and architecture details