# CNN Text Classification Model

This notebook trains and compares two Convolutional Neural Network (CNN) architectures for binary fake news classification: a standard stacked Conv1D model and a multi-filter model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
# Suppress library warnings so cell output stays readable
warnings.filterwarnings('ignore')

# Seed NumPy and TensorFlow from one shared value for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Load and Prepare Data

In [None]:
# Read the combined news dataset from disk
df = pd.read_csv('../data/combined_news_dataset.csv')

# Restore the previously saved text preprocessor.
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
preprocessor = joblib.load('../models/text_preprocessor.pkl')

# Report the dataset size and the class balance
label_counts = df['label'].value_counts()
print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{label_counts}")

In [None]:
# Build 'processed_text' only when the loaded CSV does not already carry it
needs_preprocessing = 'processed_text' not in df.columns
if needs_preprocessing:
    print("Preprocessing text...")
    # Title and body are joined with a single space before preprocessing
    df['combined_text'] = df['title'] + ' ' + df['text']
    df['processed_text'] = df['combined_text'].apply(preprocessor.preprocess)
    # Drop rows whose preprocessed text came out empty and renumber the index
    has_content = df['processed_text'] != ''
    df = df[has_content].reset_index(drop=True)

# For CNN, we'll use less aggressive preprocessing (keep more structure)
def light_preprocess(text):
    """Apply minimal cleanup to a raw text string for the CNN pipeline.

    Missing values (anything `pd.isna` reports as NA) become an empty string.
    Otherwise the text is lower-cased, URLs and e-mail-like tokens are removed,
    every character that is not an ASCII letter or whitespace is deleted, and
    runs of whitespace are collapsed to single spaces.
    """
    if pd.isna(text):
        return ''

    cleaned = text.lower()

    # Removal passes, applied in order: URLs, e-mail-like tokens, then any
    # character that is neither an ASCII letter nor whitespace.
    removal_passes = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),
        (r'\S+@\S+', 0),
        (r'[^a-zA-Z\s]', 0),
    )
    for pattern, flags in removal_passes:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    # split()/join collapses whitespace runs and trims both ends
    return ' '.join(cleaned.split())

import re
# A missing title or body is NaN, and NaN + str stays NaN, so without this
# fill a row with a valid body but no title would be cleaned to '' and
# silently dropped below. Filling in place also keeps later title/text string
# concatenation on these columns safe.
df[['title', 'text']] = df[['title', 'text']].fillna('')

# Lightly cleaned title + body is the text the CNN tokenizer will see
df['cnn_text'] = (df['title'] + ' ' + df['text']).apply(light_preprocess)

# Only rows that are empty after cleaning are removed; the index is renumbered
df = df[df['cnn_text'] != ''].reset_index(drop=True)

print(f"Final dataset shape: {df.shape}")

## Text Tokenization and Sequence Preparation

In [None]:
# Tokenization hyperparameters
MAX_VOCAB_SIZE = 20000        # passed to Tokenizer(num_words=...)
MAX_SEQUENCE_LENGTH = 500     # every sequence is padded/truncated to this length
EMBEDDING_DIM = 100           # embedding vector size used by the models

# Raw inputs and targets as NumPy arrays
texts, labels = df['cnn_text'].values, df['label'].values

# Fit the tokenizer; unknown words map to the '<OOV>' token.
# NOTE(review): the tokenizer is fitted on the full corpus before the
# train/test split, so the vocabulary also reflects test-set text.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Map each text to a list of integer token ids
sequences = tokenizer.texts_to_sequences(texts)

print(f"Found {len(word_index)} unique tokens")
print(f"Vocabulary size (limited): {MAX_VOCAB_SIZE}")

# Pad short sequences and cut long ones, both at the end
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)
y = labels

print(f"Shape of data tensor: {X.shape}")
print(f"Shape of label tensor: {y.shape}")

In [None]:
# Token count of every document before padding/truncation
sequence_lengths = [len(seq) for seq in sequences]
cutoff_label = f'Max Length ({MAX_SEQUENCE_LENGTH})'

# Left panel: raw histogram; right panel: normalized cumulative histogram
panels = (
    ({}, 'Frequency', 'Distribution of Sequence Lengths'),
    ({'cumulative': True, 'density': True}, 'Cumulative Proportion',
     'Cumulative Distribution of Sequence Lengths'),
)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (hist_options, y_label, panel_title) in zip(axes, panels):
    ax.hist(sequence_lengths, bins=50, alpha=0.7, **hist_options)
    # Mark the truncation length chosen for the model input
    ax.axvline(x=MAX_SEQUENCE_LENGTH, color='r', linestyle='--', label=cutoff_label)
    ax.set_xlabel('Sequence Length')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

# Summary statistics
within_limit = np.array(sequence_lengths) <= MAX_SEQUENCE_LENGTH
print("\nSequence length statistics:")
print(f"Mean: {np.mean(sequence_lengths):.1f}")
print(f"Median: {np.median(sequence_lengths):.1f}")
print(f"95th percentile: {np.percentile(sequence_lengths, 95):.1f}")
print(f"Percentage of sequences <= {MAX_SEQUENCE_LENGTH}: {within_limit.mean():.2%}")

## Train-Test Split

In [None]:
# Hold out 20% of the data as the test set, stratified on the label
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Carve 20% of the remainder off as the validation set, again stratified
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42, stratify=y_trainval
)

# Report the size of each split
for split_name, split_features in (
    ('Training', X_train), ('Validation', X_val), ('Test', X_test)
):
    print(f"{split_name} set shape: {split_features.shape}")

# Report per-class counts in each split
print()
for split_name, split_labels in (
    ('training', y_train), ('validation', y_val), ('test', y_test)
):
    print(f"Label distribution in {split_name} set: {np.bincount(split_labels)}")

## CNN Model Architecture

In [None]:
def create_cnn_model(vocab_size, embedding_dim, max_length):
    """Build the stacked (sequential) CNN text classifier.

    Architecture: embedding -> three Conv1D blocks (128 filters each, kernel
    sizes 3, 4 and 5) with max pooling and dropout -> dense head ending in a
    single sigmoid unit for binary classification.

    Args:
        vocab_size: Number of rows in the embedding table (embedding input_dim).
        embedding_dim: Size of each embedding vector.
        max_length: Length of the padded input sequences.

    Returns:
        An uncompiled Keras `Sequential` model.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input rather than the
        # deprecated Embedding(input_length=...) argument, so the model is
        # built at construction time and summary() can report shapes.
        Input(shape=(max_length,)),

        # Embedding layer
        Embedding(vocab_size, embedding_dim),

        # First CNN block
        Conv1D(filters=128, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),

        # Second CNN block
        Conv1D(filters=128, kernel_size=4, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),

        # Third CNN block: global pooling collapses the sequence axis
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dropout(0.5),

        # Dense head
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    return model

# Instantiate the stacked CNN with the notebook-level hyperparameters
cnn_model = create_cnn_model(
    vocab_size=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    max_length=MAX_SEQUENCE_LENGTH,
)

# Binary cross-entropy matches the single sigmoid output unit
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the layer-by-layer summary
print("CNN Model Architecture:")
cnn_model.summary()

## Multi-filter CNN Model (Alternative Architecture)

In [None]:
def create_multi_cnn_model(vocab_size, embedding_dim, max_length,
                           filter_sizes=(3, 4, 5), num_filters=100):
    """Build a multi-filter CNN text classifier inspired by Kim (2014).

    One shared embedding feeds several parallel Conv1D branches, one per
    kernel size. Each branch is global-max-pooled, the pooled vectors are
    concatenated, and a dropout/dense head ends in a single sigmoid unit.

    Args:
        vocab_size: Number of rows in the embedding table (embedding input_dim).
        embedding_dim: Size of each embedding vector.
        max_length: Length of the padded input sequences.
        filter_sizes: Kernel sizes of the parallel convolution branches.
        num_filters: Number of filters in each convolution branch.

    Returns:
        An uncompiled Keras functional `Model`.

    Raises:
        ValueError: If `filter_sizes` is empty.
    """
    filter_sizes = tuple(filter_sizes)
    if not filter_sizes:
        raise ValueError("filter_sizes must contain at least one kernel size")

    # Input layer fixes the sequence length; the deprecated
    # Embedding(input_length=...) argument is therefore not needed.
    input_layer = Input(shape=(max_length,))

    # Embedding layer shared by every convolution branch
    embedding = Embedding(vocab_size, embedding_dim)(input_layer)

    # One Conv1D + global max pooling branch per kernel size
    conv_blocks = []
    for filter_size in filter_sizes:
        conv = Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu')(embedding)
        conv_blocks.append(GlobalMaxPooling1D()(conv))

    # Concatenate the pooled features; a single branch needs no merge layer
    if len(conv_blocks) > 1:
        concatenated = Concatenate()(conv_blocks)
    else:
        concatenated = conv_blocks[0]

    # Dropout and dense layers
    dropout1 = Dropout(0.5)(concatenated)
    dense1 = Dense(128, activation='relu')(dropout1)
    dropout2 = Dropout(0.5)(dense1)
    output = Dense(1, activation='sigmoid')(dropout2)

    return Model(inputs=input_layer, outputs=output)

# Instantiate the multi-filter CNN with the notebook-level hyperparameters
multi_cnn_model = create_multi_cnn_model(
    vocab_size=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    max_length=MAX_SEQUENCE_LENGTH,
)

# Same optimizer, loss and metric as the standard CNN
multi_cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the layer-by-layer summary
print("\nMulti-filter CNN Model Architecture:")
multi_cnn_model.summary()

## Training Setup

In [None]:
# Training hyperparameters shared by both models
BATCH_SIZE = 32
EPOCHS = 10

# Stop after 3 epochs without val_loss improvement and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# Multiply the learning rate by 0.2 after 2 stagnant epochs, never below 1e-4
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001, verbose=1
)

# The same callback list is passed to both fit() calls
callbacks = [early_stopping, reduce_lr]

## Train Standard CNN Model

In [None]:
print("Training Standard CNN Model...")

# Fit the stacked CNN, validating on the held-out validation split each epoch
history_cnn = cnn_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

print("\nTraining completed!")

## Train Multi-filter CNN Model

In [None]:
print("Training Multi-filter CNN Model...")

# Fit the multi-filter CNN with the same data, schedule and callbacks
history_multi_cnn = multi_cnn_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

print("\nTraining completed!")

## Training History Visualization

In [None]:
def plot_training_history(history, title):
    """Draw accuracy and loss curves for one training run, side by side.

    Args:
        history: Object whose `.history` dict holds the per-epoch series
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        title: Prefix used in both panel titles.
    """
    curves = history.history
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # (history key, display name) for the left and right panels
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    for ax, (metric_key, metric_name) in zip(axes, panels):
        ax.plot(curves[metric_key], label=f'Training {metric_name}')
        ax.plot(curves[f'val_{metric_key}'], label=f'Validation {metric_name}')
        ax.set_title(f'{title} - Model {metric_name}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_name)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

# Plot the learning curves of each trained model in turn
for run_history, run_title in (
    (history_cnn, "Standard CNN"),
    (history_multi_cnn, "Multi-filter CNN"),
):
    plot_training_history(run_history, run_title)

## Model Evaluation

In [None]:
# Score the stacked CNN on the held-out test split
print("Evaluating Standard CNN Model:")
cnn_loss, cnn_accuracy = cnn_model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"Test Loss: {cnn_loss:.4f}\nTest Accuracy: {cnn_accuracy:.4f}")

# Score the multi-filter CNN on the same split
print("\nEvaluating Multi-filter CNN Model:")
multi_cnn_loss, multi_cnn_accuracy = multi_cnn_model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"Test Loss: {multi_cnn_loss:.4f}\nTest Accuracy: {multi_cnn_accuracy:.4f}")

In [None]:
# Predicted probabilities, thresholded at 0.5 into hard 0/1 labels
cnn_pred_proba = cnn_model.predict(X_test, verbose=0)
multi_cnn_pred_proba = multi_cnn_model.predict(X_test, verbose=0)

cnn_pred = (cnn_pred_proba > 0.5).astype(int).flatten()
multi_cnn_pred = (multi_cnn_pred_proba > 0.5).astype(int).flatten()

# Per-class precision/recall/F1; label 0 is reported as 'Fake', label 1 as 'True'
report_class_names = ['Fake', 'True']

print("Standard CNN Classification Report:")
print(classification_report(y_test, cnn_pred, target_names=report_class_names))

print("\nMulti-filter CNN Classification Report:")
print(classification_report(y_test, multi_cnn_pred, target_names=report_class_names))

## Confusion Matrices

In [None]:
# Confusion matrices for both models on the test split
cm_cnn = confusion_matrix(y_test, cnn_pred)
cm_multi_cnn = confusion_matrix(y_test, multi_cnn_pred)

tick_names = ['Fake', 'True']
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# One annotated heatmap per model, titled with its test accuracy
heatmap_specs = (
    (ax1, cm_cnn, f'Standard CNN\nAccuracy: {cnn_accuracy:.3f}'),
    (ax2, cm_multi_cnn, f'Multi-filter CNN\nAccuracy: {multi_cnn_accuracy:.3f}'),
)
for panel_ax, matrix, heading in heatmap_specs:
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel_ax,
                xticklabels=tick_names, yticklabels=tick_names)
    panel_ax.set_title(heading)
    panel_ax.set_ylabel('Actual Label')
    panel_ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

## Model Comparison

In [None]:
# Collect the test metrics of both models in one table
comparison_df = pd.DataFrame({
    'Model': ['Standard CNN', 'Multi-filter CNN'],
    'Accuracy': [cnn_accuracy, multi_cnn_accuracy],
    'Loss': [cnn_loss, multi_cnn_loss]
})

print("Model Performance Comparison:")
print(comparison_df)

# Plot comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Accuracy comparison
sns.barplot(data=comparison_df, x='Model', y='Accuracy', ax=ax1)
ax1.set_title('Model Accuracy Comparison')
# Zoom in on 0.8-1.0 when every model clears 0.8; otherwise lower the floor
# so a weaker model's bar is not cut off below the visible range.
lowest_accuracy = comparison_df['Accuracy'].min()
accuracy_floor = 0.8 if lowest_accuracy > 0.8 else max(0.0, lowest_accuracy - 0.05)
ax1.set_ylim(accuracy_floor, 1.0)
for i, v in enumerate(comparison_df['Accuracy']):
    ax1.text(i, v + 0.005, f'{v:.3f}', ha='center')

# Loss comparison
sns.barplot(data=comparison_df, x='Model', y='Loss', ax=ax2)
ax2.set_title('Model Loss Comparison')
for i, v in enumerate(comparison_df['Loss']):
    ax2.text(i, v + 0.01, f'{v:.3f}', ha='center')

plt.tight_layout()
plt.show()

## Save Models and Tokenizer

In [None]:
import pickle

# Pick the model with the higher test accuracy (ties go to the standard CNN)
if multi_cnn_accuracy > cnn_accuracy:
    best_model, best_model_name = multi_cnn_model, "multi_cnn"
else:
    best_model, best_model_name = cnn_model, "standard_cnn"

best_model_path = f'../models/cnn_{best_model_name}.h5'
best_model.save(best_model_path)
print(f"Best CNN model saved: {best_model_path}")

# Persist both models under fixed file names as well
cnn_model.save('../models/cnn_standard.h5')
multi_cnn_model.save('../models/cnn_multi_filter.h5')

# Hyperparameters needed to rebuild the input pipeline at inference time
config = {
    'max_vocab_size': MAX_VOCAB_SIZE,
    'max_sequence_length': MAX_SEQUENCE_LENGTH,
    'embedding_dim': EMBEDDING_DIM,
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS
}

# Pickle the fitted tokenizer and the configuration side by side
pickled_artifacts = (
    ('../models/cnn_tokenizer.pickle', tokenizer),
    ('../models/cnn_config.pickle', config),
)
for artifact_path, artifact in pickled_artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("\nAll CNN models and configurations saved successfully!")

## Sample Predictions

In [None]:
# Function to make predictions on new text
def predict_fake_news(model, tokenizer, text, max_length=MAX_SEQUENCE_LENGTH):
    """Classify one raw text with a trained model.

    The text goes through `light_preprocess`, is converted to token ids by
    `tokenizer`, and is post-padded/post-truncated to `max_length` before
    being passed to `model.predict`.

    Args:
        model: Trained model whose `predict` returns one value per input row.
        tokenizer: Fitted tokenizer providing `texts_to_sequences`.
        text: Raw input text.
        max_length: Padded sequence length fed to the model.

    Returns:
        Tuple `(label, probability)`: the label is "True News" when the model
        output exceeds 0.5 and "Fake News" otherwise; the probability is the
        raw model output.
    """
    cleaned = light_preprocess(text)

    # Tokenize and pad exactly as the training data was prepared
    token_ids = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(token_ids, maxlen=max_length, padding='post', truncating='post')

    # Single input row, single output unit -> scalar at [0][0]
    prediction_prob = model.predict(model_input, verbose=0)[0][0]

    if prediction_prob > 0.5:
        return "True News", prediction_prob
    return "Fake News", prediction_prob

# Recover which rows of `df` ended up in the test split. X_test was produced
# by train_test_split(X, y, test_size=0.2, random_state=42, stratify=y);
# repeating that call on row positions reproduces the same partition.
_, test_row_positions = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)
# Fail loudly if the two split calls ever drift apart
assert np.array_equal(X[test_row_positions], X_test), \
    "Recomputed test split does not match X_test"

# Draw a few random examples from the test set
test_indices = np.random.choice(len(X_test), min(5, len(X_test)), replace=False)

print("Sample Predictions:")
print("=" * 80)

for i, idx in enumerate(test_indices):
    # Row i of X was built from row i of df, so the recovered position
    # addresses the original article behind X_test[idx].
    row = df.iloc[test_row_positions[idx]]
    original_text = row['title'] + ' ' + row['text']
    true_label = "True News" if y_test[idx] == 1 else "Fake News"

    # Predict on the full text (the same input the model was evaluated on);
    # only the printed preview is shortened.
    prediction, prob = predict_fake_news(best_model, tokenizer, original_text)
    # `prob` is the sigmoid output for the "True News" class; report the
    # probability of whichever class was actually predicted.
    confidence = prob if prob > 0.5 else 1 - prob

    print(f"\nExample {i+1}:")
    print(f"Text: {original_text[:200]}...")
    print(f"True Label: {true_label}")
    print(f"Predicted: {prediction} (Confidence: {confidence:.3f})")
    print("-" * 80)

## Summary

### CNN Model Results:
- **Standard CNN**: Sequential architecture with multiple Conv1D layers
- **Multi-filter CNN**: Parallel filters with different kernel sizes (inspired by Kim 2014)

### Key Findings:
1. CNNs can effectively capture local patterns in text for fake news detection
2. Multi-filter architecture allows capturing different n-gram patterns simultaneously
3. Global max pooling helps extract the most important features
4. Dropout and early stopping help prevent overfitting

### Advantages of CNN approach:
- Faster training compared to RNNs
- Good at capturing local patterns and n-grams
- Parallel processing of different filter sizes
- Less prone to vanishing gradient problems

### Next Steps:
1. Implement LSTM/RNN models for comparison
2. Experiment with pre-trained embeddings (Word2Vec, GloVe)
3. Try attention mechanisms
4. Implement transformer-based models (BERT)
5. Ensemble different models for better performance