# 06 - Text Classification: Media vs Public
## INSY 669 Text Analytics | GLP-1 Weight Loss Drugs

This notebook builds classifiers to distinguish **media** from **public** text, applying:
1. **Naive Bayes** (Multinomial) - probabilistic classifier using word frequencies
2. **K-Nearest Neighbors (K-NN)** - similarity-based classification with cosine distance
3. **Evaluation** - confusion matrices, precision, recall, F1, and hyperparameter tuning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, f1_score
)
from sklearn.pipeline import Pipeline
import warnings
# Suppress every Python warning for the rest of the session. This keeps cell
# output tidy, but it also hides deprecation and convergence notices.
warnings.filterwarnings('ignore')
# Apply the seaborn-style white-grid theme to all matplotlib figures below.
plt.style.use('seaborn-v0_8-whitegrid')

## 6.1 Load and Prepare Data

In [None]:
# Read the two pre-processed corpora (one row per document, cleaned text in `clean`).
df_public = pd.read_csv('../data/public_processed.csv')
df_media = pd.read_csv('../data/media_processed.csv')

# Tag every row with its source so the two frames can be stacked into one
# labelled dataset.
for frame, source in ((df_public, 'public'), (df_media, 'media')):
    frame['label'] = source

# Keep only the text and label columns, stack, then discard rows with missing text.
stacked = pd.concat(
    [frame[['clean', 'label']] for frame in (df_public, df_media)],
    ignore_index=True,
)
df = stacked.dropna(subset=['clean'])

# Features (text) and target (source label).
X, y = df['clean'], df['label']

class_counts = y.value_counts()
class_shares = y.value_counts(normalize=True).to_dict()
print(f"Total documents: {len(df)}")
print(f"Class distribution:\n{class_counts}")
print(f"\nClass balance: {class_shares}")

In [None]:
# Hold out 20% of the documents for testing. Stratifying on the label keeps the
# media/public ratio the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

train_counts = y_train.value_counts().to_dict()
test_counts = y_test.value_counts().to_dict()
print(f"Train: {len(X_train)} | Test: {len(X_test)}")
print(f"Train distribution: {train_counts}")
print(f"Test distribution:  {test_counts}")

## 6.2 Naive Bayes Classifier

Multinomial Naive Bayes works with word counts/frequencies and applies Bayes' theorem under the "naive" assumption that words are conditionally independent given the class:

$$P(class \mid doc) \propto P(class) \times \prod_{i} P(word_i \mid class)$$

We test with both Bag-of-Words and TF-IDF features.

In [None]:
# --- Naive Bayes with Bag-of-Words ---
# Raw term counts (unigrams + bigrams, capped at 5000 features) feed a
# multinomial Naive Bayes classifier.
bow_steps = [
    ('vectorizer', CountVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('classifier', MultinomialNB()),
]
bow_nb = Pipeline(bow_steps).fit(X_train, y_train)
y_pred_bow = bow_nb.predict(X_test)

# Headline metrics; F1 is reported for the 'media' class.
bow_accuracy = accuracy_score(y_test, y_pred_bow)
bow_f1_media = f1_score(y_test, y_pred_bow, pos_label='media')

print("NAIVE BAYES + BAG-OF-WORDS")
print(f"Accuracy: {bow_accuracy:.4f}")
print(f"F1 Score: {bow_f1_media:.4f}")
print()
print(classification_report(y_test, y_pred_bow))

In [None]:
# --- Naive Bayes with TF-IDF ---
# Same model as the Bag-of-Words variant, but with TF-IDF weighted features
# (unigrams + bigrams, capped at 5000 features).
tfidf_steps = [
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('classifier', MultinomialNB()),
]
tfidf_nb = Pipeline(tfidf_steps).fit(X_train, y_train)
y_pred_tfidf = tfidf_nb.predict(X_test)

# Headline metrics; F1 is reported for the 'media' class.
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
tfidf_f1_media = f1_score(y_test, y_pred_tfidf, pos_label='media')

print("NAIVE BAYES + TF-IDF")
print(f"Accuracy: {tfidf_accuracy:.4f}")
print(f"F1 Score: {tfidf_f1_media:.4f}")
print()
print(classification_report(y_test, y_pred_tfidf))

## 6.3 Naive Bayes Hyperparameter Tuning

In [None]:
# Grid search over alpha (Laplace smoothing) and vectorizer params.
# Keys use the `<step>__<param>` convention to address the pipeline steps.
param_grid = dict(
    vectorizer__max_features=[3000, 5000, 8000],
    vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    classifier__alpha=[0.01, 0.1, 0.5, 1.0, 2.0],
)

# 5-fold CV over the TF-IDF + NB pipeline, selecting on macro-averaged F1 and
# using all available cores.
grid_nb = GridSearchCV(
    estimator=tfidf_nb,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
grid_nb.fit(X_train, y_train)

print(f"Best parameters: {grid_nb.best_params_}")
print(f"Best CV F1 score: {grid_nb.best_score_:.4f}")

# Evaluate on the held-out test set with the grid search's predict().
# Note: the CV score above is macro-F1, while the test F1 below is computed for
# the 'media' class only, so the two numbers are not directly comparable.
y_pred_nb_best = grid_nb.predict(X_test)
nb_best_accuracy = accuracy_score(y_test, y_pred_nb_best)
nb_best_f1_media = f1_score(y_test, y_pred_nb_best, pos_label='media')

print(f"\nTest Accuracy: {nb_best_accuracy:.4f}")
print(f"Test F1 Score: {nb_best_f1_media:.4f}")
print()
print(classification_report(y_test, y_pred_nb_best))

## 6.4 K-Nearest Neighbors (K-NN) Classifier

K-NN classifies based on the majority label of the k closest training documents,
using cosine distance (1 − cosine similarity) between TF-IDF vectors as the distance metric.

In [None]:
# Vectorize with TF-IDF for KNN: learn vocabulary and IDF weights on the training
# text only, then project the test text into the same feature space.
tfidf_vec = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

# Baseline K-NN: 5 neighbours, cosine distance between TF-IDF vectors.
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine').fit(X_train_tfidf, y_train)
y_pred_knn = knn.predict(X_test_tfidf)

# Headline metrics; F1 is reported for the 'media' class.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_f1_media = f1_score(y_test, y_pred_knn, pos_label='media')

print("K-NN (k=5, cosine distance)")
print(f"Accuracy: {knn_accuracy:.4f}")
print(f"F1 Score: {knn_f1_media:.4f}")
print()
print(classification_report(y_test, y_pred_knn))

## 6.5 K-NN Hyperparameter Tuning

In [None]:
# Find optimal k using cross-validation.
#
# The TF-IDF vectorizer is wrapped in a Pipeline together with K-NN and the CV is
# run on the raw training text. Cross-validating on `X_train_tfidf` would leak
# information: its vocabulary and IDF weights were fit on the whole training set,
# including the documents that serve as the validation fold in each split.
# With the Pipeline, the vectorizer is re-fit on the training portion of every
# fold. The vectorizer settings are copied from `tfidf_vec` so the features match
# those used for the final model. Re-fitting per fold makes this cell slower.
k_range = range(1, 21)
cv_scores = []

for k in k_range:
    knn_cv = Pipeline([
        ('vectorizer', TfidfVectorizer(**tfidf_vec.get_params())),
        ('classifier', KNeighborsClassifier(n_neighbors=k, metric='cosine')),
    ])
    scores = cross_val_score(knn_cv, X_train, y_train, cv=5, scoring='f1_macro')
    cv_scores.append(scores.mean())

# argmax returns the first maximum, so ties resolve to the smallest k.
best_k = k_range[np.argmax(cv_scores)]
print(f"Best k: {best_k} (CV F1: {max(cv_scores):.4f})")

# Plot k vs CV F1 (macro-averaged), marking the selected k.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_range, cv_scores, 'o-', color='#2196F3', linewidth=2)
ax.axvline(x=best_k, color='#E94560', linestyle='--', label=f'Best k={best_k}')
ax.set_xlabel('k (number of neighbors)')
ax.set_ylabel('Cross-Validated F1 Score')
ax.set_title('K-NN: Choosing Optimal k', fontweight='bold')
ax.legend()
plt.tight_layout()
plt.savefig('../figures/knn_k_selection.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Final KNN with best k: refit on the full TF-IDF training matrix using the k
# chosen by cross-validation, then score on the held-out test set.
knn_best = KNeighborsClassifier(n_neighbors=best_k, metric='cosine')
knn_best.fit(X_train_tfidf, y_train)
y_pred_knn_best = knn_best.predict(X_test_tfidf)

print(f"K-NN (k={best_k}, cosine distance) - TUNED")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn_best):.4f}")
# Report the media-class F1 here too, as every other evaluation cell does.
print(f"F1 Score: {f1_score(y_test, y_pred_knn_best, pos_label='media'):.4f}")
print()
print(classification_report(y_test, y_pred_knn_best))

## 6.6 Model Comparison & Confusion Matrices

In [None]:
# Side-by-side confusion matrices for the two tuned models.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fix the label order so rows and columns mean the same thing in both panels.
cm_nb = confusion_matrix(y_test, y_pred_nb_best, labels=['media', 'public'])
cm_knn = confusion_matrix(y_test, y_pred_knn_best, labels=['media', 'public'])

nb_heading = f'Naive Bayes (Tuned)\nAccuracy: {accuracy_score(y_test, y_pred_nb_best):.3f}'
knn_heading = f'K-NN (k={best_k}, Tuned)\nAccuracy: {accuracy_score(y_test, y_pred_knn_best):.3f}'

# (axis, matrix, colour map, title) for each panel, left to right.
panels = (
    (axes[0], cm_nb, 'Blues', nb_heading),
    (axes[1], cm_knn, 'Oranges', knn_heading),
)
for panel_ax, matrix, palette, heading in panels:
    ConfusionMatrixDisplay(matrix, display_labels=['Media', 'Public']).plot(ax=panel_ax, cmap=palette)
    panel_ax.set_title(heading, fontweight='bold')

plt.tight_layout()
plt.savefig('../figures/classification_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Summary comparison table: one row per model, scored on the same test set.
model_predictions = [
    ('Naive Bayes (BoW)', y_pred_bow),
    ('Naive Bayes (TF-IDF)', y_pred_tfidf),
    ('Naive Bayes (Tuned)', y_pred_nb_best),
    ('K-NN (k=5)', y_pred_knn),
    (f'K-NN (k={best_k}, Tuned)', y_pred_knn_best),
]

results = pd.DataFrame({
    'Model': [model_name for model_name, preds in model_predictions],
    'Accuracy': [
        accuracy_score(y_test, preds) for model_name, preds in model_predictions
    ],
    # F1 is computed with 'media' as the positive class.
    'F1 (media)': [
        f1_score(y_test, preds, pos_label='media')
        for model_name, preds in model_predictions
    ],
})
print(results.to_string(index=False))

## 6.7 Most Discriminative Features (Naive Bayes)

In [None]:
# Extract most informative features from best NB model: pull the fitted
# vectorizer and classifier out of the tuned pipeline.
best_nb_model = grid_nb.best_estimator_
vectorizer = best_nb_model.named_steps['vectorizer']
classifier = best_nb_model.named_steps['classifier']

feature_names = vectorizer.get_feature_names_out()
log_probs = classifier.feature_log_prob_

# Look up each class's row by name rather than assuming a fixed position.
class_labels = classifier.classes_
media_idx = np.flatnonzero(class_labels == 'media')[0]
public_idx = np.flatnonzero(class_labels == 'public')[0]

# Log-ratio per feature: positive values favour media, negative favour public.
log_ratio = log_probs[media_idx] - log_probs[public_idx]

# Sort once (ascending); the tail holds the strongest media features and the
# head holds the strongest public features.
ranked = np.argsort(log_ratio)
top_media = ranked[-15:][::-1]
top_public = ranked[:15]

for heading, indices in (
    ("Top words indicating MEDIA:", top_media),
    ("\nTop words indicating PUBLIC:", top_public),
):
    print(heading)
    for i in indices:
        print(f"  {feature_names[i]:25s} log-ratio: {log_ratio[i]:.3f}")