# Text Preprocessing and Baseline Model

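This notebook implements a text preprocessing pipeline and trains three baseline classifiers: bag-of-words features with logistic regression, TF-IDF features with logistic regression, and TF-IDF features with multinomial naive Bayes.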

In [None]:
import re
import string
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

warnings.filterwarnings('ignore')

# Download the NLTK resources this notebook relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (it superseded the pickled 'punkt' models). On older releases the id
# is simply unknown: nltk.download reports that and returns False without
# raising, so requesting both ids is safe on either side of the change.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

## Load Data

In [None]:
# Read the combined news dataset from disk
df = pd.read_csv('../data/combined_news_dataset.csv')

# Report the frame dimensions and how many rows fall under each label
label_counts = df['label'].value_counts()
print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{label_counts}")

## Text Preprocessing Pipeline

In [None]:
class TextPreprocessor:
    """Turn raw article text into a space-joined string of normalised tokens.

    The pipeline lowercases the text, strips URLs, e-mail addresses and every
    character that is not an ASCII letter or whitespace, tokenises with NLTK's
    ``word_tokenize``, optionally drops English stopwords, and finally stems
    (Porter) or lemmatises (WordNet) each token.

    Parameters
    ----------
    use_stemming : bool, default True
        Stem tokens with ``PorterStemmer`` when True; otherwise lemmatise them
        with ``WordNetLemmatizer``.
    remove_stopwords : bool, default True
        Drop tokens found in NLTK's English stopword list before
        stemming/lemmatising.
    """

    # Compiled once when the class is created; the expressions themselves are
    # the same ones previously passed to re.sub on every call.
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, use_stemming=True, remove_stopwords=True):
        self.use_stemming = use_stemming
        self.remove_stopwords = remove_stopwords
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        # A set gives O(1) membership checks in the per-token stopword filter.
        self.stop_words = set(stopwords.words('english'))

    @staticmethod
    def _to_text(value):
        """Return ``value`` as a ``str``; None / NaN / ``pd.NA`` become ''."""
        if isinstance(value, str):
            return value
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        # Anything else (numbers, etc.) is stringified instead of crashing on
        # ``.lower()`` further down.
        return str(value)

    def clean_text(self, text):
        """Lowercase ``text`` and strip URLs, e-mails, non-letters and extra whitespace.

        Missing values (None / NaN) yield an empty string; other non-string
        inputs are converted with ``str`` first.
        """
        text = self._to_text(text).lower()

        # URLs and e-mail addresses go first, while their punctuation is still
        # present for the patterns to match on.
        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)

        # Drop digits, punctuation and any other non-letter character.
        text = self._NON_ALPHA_RE.sub('', text)

        # Collapse runs of whitespace (including newlines) to single spaces.
        return ' '.join(text.split())

    def tokenize_and_process(self, text):
        """Tokenise ``text``, filter stopwords if enabled, then stem or lemmatise.

        Returns the processed tokens joined by single spaces.
        """
        tokens = word_tokenize(text)

        if self.remove_stopwords:
            tokens = [token for token in tokens if token not in self.stop_words]

        # Exactly one normaliser runs, selected by ``use_stemming``.
        normalise = self.stemmer.stem if self.use_stemming else self.lemmatizer.lemmatize
        return ' '.join(normalise(token) for token in tokens)

    def preprocess(self, text):
        """Run the full pipeline on one document.

        Returns '' for missing or empty input without invoking the tokenizer.
        """
        text = self._to_text(text)
        if not text:
            return ''

        return self.tokenize_and_process(self.clean_text(text))

In [None]:
# Build the preprocessor instance used for the rest of the notebook
preprocessor = TextPreprocessor(remove_stopwords=True, use_stemming=True)

# Sanity check: run it on the opening 500 characters of the first article
sample_text = df['text'].iloc[0][:500]

print("Original text:")
print(sample_text)
print("\n" + "="*50 + "\n")  # visual divider between raw and processed text
print("Preprocessed text:")
print(preprocessor.preprocess(sample_text))

In [None]:
# Combine title and text for richer features.
# Missing values are replaced with '' before concatenating: in pandas
# NaN + str is NaN, so a row lacking only its title (or only its body) would
# otherwise become NaN, preprocess to '' and be dropped by the filter below.
df['combined_text'] = (
    df['title'].fillna('').astype(str) + ' ' + df['text'].fillna('').astype(str)
)

# Apply preprocessing (this may take a while for the full dataset)
print("Preprocessing text... (this may take several minutes)")
df['processed_text'] = df['combined_text'].apply(preprocessor.preprocess)

# Remove rows whose text is empty after preprocessing
df = df[df['processed_text'] != ''].reset_index(drop=True)

print(f"Dataset shape after preprocessing: {df.shape}")
print("\nSample processed texts:")
# Cap the preview at the number of rows available so tiny frames don't raise.
for i in range(min(3, len(df))):
    print(f"\n{i+1}. Label: {df.iloc[i]['label']}")
    print(df.iloc[i]['processed_text'][:200] + '...')

## Train-Test Split

In [None]:
# Features are the preprocessed documents; targets are the labels
X, y = df['processed_text'], df['label']

# 80/20 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Show the per-label counts of each partition
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set label distribution:")
    print(split_labels.value_counts())

## Baseline Models

### 1. Bag of Words + Logistic Regression

In [None]:
# Bag-of-words baseline: unigram+bigram counts (vocabulary capped at 10,000
# terms) feeding a logistic regression classifier
bow_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

print("Training Bag of Words + Logistic Regression...")
# fit() returns the fitted pipeline, so the test-set prediction can be chained
bow_pred = bow_pipeline.fit(X_train, y_train).predict(X_test)
bow_accuracy = accuracy_score(y_test, bow_pred)

print(f"\nBag of Words + Logistic Regression Accuracy: {bow_accuracy:.4f}")
print("\nClassification Report:")
# NOTE(review): target_names assumes the sorted labels map to Fake, then True — confirm the encoding
print(classification_report(y_test, bow_pred, target_names=['Fake', 'True']))

### 2. TF-IDF + Logistic Regression

In [None]:
# TF-IDF baseline: unigram+bigram TF-IDF weights (vocabulary capped at 10,000
# terms) feeding a logistic regression classifier
tfidf_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

print("Training TF-IDF + Logistic Regression...")
# fit() returns the fitted pipeline, so the test-set prediction can be chained
tfidf_pred = tfidf_pipeline.fit(X_train, y_train).predict(X_test)
tfidf_accuracy = accuracy_score(y_test, tfidf_pred)

print(f"\nTF-IDF + Logistic Regression Accuracy: {tfidf_accuracy:.4f}")
print("\nClassification Report:")
# NOTE(review): target_names assumes the sorted labels map to Fake, then True — confirm the encoding
print(classification_report(y_test, tfidf_pred, target_names=['Fake', 'True']))

### 3. TF-IDF + Naive Bayes

In [None]:
# Naive Bayes baseline: the same unigram+bigram TF-IDF features (vocabulary
# capped at 10,000 terms), classified with multinomial naive Bayes
nb_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('classifier', MultinomialNB()),
])

print("Training TF-IDF + Naive Bayes...")
# fit() returns the fitted pipeline, so the test-set prediction can be chained
nb_pred = nb_pipeline.fit(X_train, y_train).predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)

print(f"\nTF-IDF + Naive Bayes Accuracy: {nb_accuracy:.4f}")
print("\nClassification Report:")
# NOTE(review): target_names assumes the sorted labels map to Fake, then True — confirm the encoding
print(classification_report(y_test, nb_pred, target_names=['Fake', 'True']))

## Cross-Validation

In [None]:
# 5-fold cross-validation of the TF-IDF + logistic regression pipeline,
# run on the training partition only
print("Performing 5-fold cross-validation on TF-IDF + Logistic Regression...")

cv_scores = cross_val_score(
    tfidf_pipeline, X_train, y_train, scoring='accuracy', cv=5
)

cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # two standard deviations across the folds
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")

## Results Comparison and Confusion Matrix

In [None]:
# Collect the test accuracies of the three baselines, best first
results_df = pd.DataFrame(
    [
        ('Bag of Words + LR', bow_accuracy),
        ('TF-IDF + LR', tfidf_accuracy),
        ('TF-IDF + NB', nb_accuracy),
    ],
    columns=['Model', 'Accuracy'],
).sort_values(by='Accuracy', ascending=False)

print("Model Performance Comparison:")
print(results_df)

# Bar chart of the same table; bars appear in the sorted row order
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=results_df, x='Model', y='Accuracy')
ax.set_title('Baseline Model Performance Comparison')
ax.set_ylabel('Accuracy')
plt.xticks(rotation=45)

# Write each accuracy just above its bar
for bar_pos, acc in enumerate(results_df['Accuracy']):
    ax.text(bar_pos, acc + 0.01, f'{acc:.3f}', ha='center')

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrix for the TF-IDF + LR predictions
# (hard-coded on the assumption that this is the best baseline)
best_pred = tfidf_pred

cm = confusion_matrix(y_test, best_pred)
class_names = ['Fake', 'True']

plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
)
ax.set_title('Confusion Matrix - TF-IDF + Logistic Regression')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Unpack the 2x2 matrix row by row: rows are actual labels, columns predicted.
# The first class (labelled 'Fake' above) plays the "negative" role.
(tn, fp), (fn, tp) = cm

# Per-class precision and recall derived from the four cells
precision_true = tp / (tp + fp)
recall_true = tp / (tp + fn)
precision_fake = tn / (tn + fn)
recall_fake = tn / (tn + fp)

print(f"\nDetailed Metrics:")
print(f"True Negatives: {tn}, False Positives: {fp}")
print(f"False Negatives: {fn}, True Positives: {tp}")
print(f"\nFake News - Precision: {precision_fake:.3f}, Recall: {recall_fake:.3f}")
print(f"True News - Precision: {precision_true:.3f}, Recall: {recall_true:.3f}")

## Feature Analysis

In [None]:
# Pull the fitted steps out of the TF-IDF + LR pipeline
vectorizer = tfidf_pipeline.named_steps['vectorizer']
classifier = tfidf_pipeline.named_steps['classifier']

# Vocabulary terms and the matching logistic-regression weights
feature_names = vectorizer.get_feature_names_out()
coefficients = classifier.coef_[0]

# Sort once, ascending by weight: the head holds the most negative
# coefficients and the tail the most positive ones.
# NOTE(review): reading negative as FAKE and positive as TRUE assumes TRUE is the second class — confirm
ranked = np.argsort(coefficients)
fake_indices = ranked[:20]
true_indices = ranked[-20:]

fake_features = list(zip(feature_names[fake_indices], coefficients[fake_indices]))
true_features = list(zip(feature_names[true_indices], coefficients[true_indices]))

print("Top 20 features associated with FAKE news:")
for feature, coef in fake_features:
    print(f"{feature}: {coef:.4f}")

print("\nTop 20 features associated with TRUE news:")
# true_features is in ascending order, so walk it backwards (strongest first)
for feature, coef in true_features[::-1]:
    print(f"{feature}: {coef:.4f}")

## Save Models

In [None]:
# Save the TF-IDF + LR pipeline and the preprocessor
model_path = '../models/baseline_tfidf_lr.pkl'
preprocessor_path = '../models/text_preprocessor.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists; on a fresh checkout the dump would otherwise fail.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(tfidf_pipeline, model_path)
# NOTE(review): TextPreprocessor is defined inside this notebook, so loading this
# pickle elsewhere presumably needs the same class importable — confirm with consumers.
joblib.dump(preprocessor, preprocessor_path)

print("Models saved successfully!")
print(f"- Baseline TF-IDF + Logistic Regression: {model_path}")
print(f"- Text Preprocessor: {preprocessor_path}")

## Summary

### Baseline Results:
- **TF-IDF + Logistic Regression**: Likely the best performing baseline
- **TF-IDF + Naive Bayes**: Good performance, faster training
- **Bag of Words + Logistic Regression**: Simple but effective

### Key Insights:
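1. TF-IDF generally outperforms simple bag-of-words; check the accuracies in the results table above to confirm this holds for the current dataset
2. Bigrams (2-word combinations) help capture important patterns; all three baselines use `ngram_range=(1, 2)`, so a unigram-only run is needed to quantify the gain
3. Text preprocessing (stemming, stopword removal) improves performance; this notebook does not include a run on unprocessed text, so treat this as an expectation rather than a measured result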
4. The dataset appears to have learnable patterns distinguishing fake from true news

### Next Steps:
1. Implement deep learning models (CNN, LSTM)
2. Experiment with pre-trained embeddings (Word2Vec, GloVe)
3. Try transformer models (BERT, DistilBERT)
4. Fine-tune hyperparameters
5. Ensemble methods