# Comparison of naive Bayes and logistic regression for text categorization

Adapted from https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

## Load subset of "20 Newsgroups" dataset

In [None]:
# The four newsgroups used throughout this notebook.
categories = [
    "misc.forsale",
    "sci.space",
    "sci.electronics",
    "comp.graphics",
]

# Fetch the "train" and "test" subsets for those categories; the two calls
# differ only in the subset name.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True)
    for split in ("train", "test")
)

In [None]:
# Number of documents in the training subset.
len(twenty_train.data)

In [None]:
# Number of documents in the test subset.
len(twenty_test.data)

In [None]:
# Print the category name behind each of the first five training labels.
# `target` holds integer indices into `target_names`.
first_labels = twenty_train.target[:5]
for label in first_labels:
    print(twenty_train.target_names[label])

In [None]:
# Integer class labels for the training and test subsets.
y_train, y_test = twenty_train.target, twenty_test.target

## Normalize and vectorize documents

In [None]:
# TF-IDF features: English stop words are removed and terms that appear in
# fewer than 3 training documents are dropped (min_df=3).
vectorizer = TfidfVectorizer(min_df=3, stop_words="english")

# fit_transform learns the vocabulary/IDF weights and vectorizes the training
# documents in a single pass, instead of analysing the corpus once in fit()
# and again in transform().
X_train = vectorizer.fit_transform(twenty_train.data)

# The test documents are only transformed, so they are encoded with the
# vocabulary and IDF weights learned from the training subset.
X_test = vectorizer.transform(twenty_test.data)

In [None]:
# Dimensions of the training matrix produced by the vectorizer
# (documents x vocabulary terms).
X_train.shape

## Naive Bayes model

In [None]:
%%time
# Multinomial naive Bayes with additive smoothing parameter alpha=1.0,
# fitted on the training matrix.
nb_model = MultinomialNB(alpha=1.0)
nb_model.fit(X_train, y_train)

# Predicted labels for the test documents.
y_hat_nb_test = nb_model.predict(X_test)

In [None]:
# Per-class metrics for the naive Bayes predictions on the test subset,
# labelled with the newsgroup names.
nb_report = classification_report(
    y_test,
    y_hat_nb_test,
    target_names=twenty_train.target_names,
)
print(nb_report)

## Logistic regression model

In [None]:
%%time
# Unregularized logistic regression fitted with L-BFGS.
#
# `penalty=None` replaces the string "none", which scikit-learn deprecated in
# 1.2 and rejects from 1.4 onward (requires scikit-learn >= 1.2).
# `multi_class` is left at its default: with the lbfgs solver and more than
# two classes the default already fits a multinomial model, and the explicit
# argument is deprecated in recent releases.
lr_model = LogisticRegression(penalty=None, solver="lbfgs")
lr_model.fit(X_train, y_train)

# Predicted labels for the test documents.
y_hat_lr_test = lr_model.predict(X_test)

In [None]:
# Per-class metrics for the unregularized logistic regression predictions on
# the test subset, labelled with the newsgroup names.
lr_report = classification_report(
    y_test,
    y_hat_lr_test,
    target_names=twenty_train.target_names,
)
print(lr_report)

## Logistic regression with L2 penalty

In [None]:
%%time
# L2-penalized multinomial logistic regression fitted with L-BFGS.
# C is the inverse regularization strength; the solver is allowed up to
# 1000 iterations.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm the minimum supported version before removing it.
lr2_model = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    multi_class="multinomial",
    max_iter=1000,
    C=10,
)
lr2_model.fit(X_train, y_train)

# Predicted labels for the test documents.
y_hat_lr2_test = lr2_model.predict(X_test)

In [None]:
# Per-class metrics for the L2-regularized logistic regression predictions on
# the test subset, labelled with the newsgroup names.
lr2_report = classification_report(
    y_test,
    y_hat_lr2_test,
    target_names=twenty_train.target_names,
)
print(lr2_report)

## Comparison of train/test performance across models

In [None]:
# Fitted classifiers keyed by the label used for them in the comparison plot.
model_info = {
    "Naive Bayes": nb_model,
    "Logistic Regression": lr_model,
    "L2 Regularized LR": lr2_model,
}

# One [model, dataset, accuracy] row per model and split; each model's
# "Train" row comes directly before its "Test" row.
splits = (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
)
plot_data = [
    [name, split, accuracy_score(y_true, model.predict(X))]
    for name, model in model_info.items()
    for split, X, y_true in splits
]

In [None]:
# Long-format table with one row per (model, dataset) accuracy.
plot_df = pd.DataFrame(plot_data, columns=["model", "dataset", "accuracy"])

plt.figure(figsize=(6, 6))
# Restrict the y-axis to the 0.9-1.0 accuracy range.
plt.ylim((0.9, 1))
# One line per model; sort=False connects the points in row order, so
# "Train" is drawn before "Test" on the x-axis.
sns.lineplot(
    data=plot_df,
    x="dataset",
    y="accuracy",
    hue="model",
    sort=False,
)
plt.show()

## Feature importances

In [None]:
# Show floats with four decimal places in DataFrame output.
pd.set_option("display.float_format", "{:.4f}".format)

# Invert the vectorizer's term -> column-index mapping so that a column index
# can be mapped back to its term.
vocab = {column: term for term, column in vectorizer.vocabulary_.items()}

### Naive Bayes

In [None]:
# For every class, collect the ten terms with the highest naive Bayes
# log-probability (largest first) together with their probabilities.
word_data = {}
for class_idx, class_name in enumerate(twenty_train.target_names):
    class_logprobs = nb_model.feature_log_prob_[class_idx]
    # argsort is ascending, so reverse it and keep the first ten indices.
    top_features = np.argsort(class_logprobs)[::-1][:10]
    # Stored values are log-probabilities; exponentiate them for display.
    word_data[f"{class_name}_P(w|c)"] = [
        np.exp(logprob) for logprob in class_logprobs[top_features]
    ]
    word_data[f"{class_name}_words"] = [vocab[idx] for idx in top_features]

In [None]:
# Transpose so that each class contributes a probability row and a words row,
# with the ranked positions as columns.
pd.DataFrame(word_data).T

### Logistic regression

In [None]:
# For every class, collect the ten terms with the largest coefficients in the
# L2-regularized logistic regression model (largest first).
word_data = {}
for class_idx, class_name in enumerate(twenty_train.target_names):
    class_coefs = lr2_model.coef_[class_idx]
    # argsort is ascending, so reverse it and keep the first ten indices.
    top_features = np.argsort(class_coefs)[::-1][:10]
    word_data[f"{class_name}_beta"] = class_coefs[top_features]
    word_data[f"{class_name}_words"] = [vocab[idx] for idx in top_features]

In [None]:
# Transpose so that each class contributes a coefficient row and a words row,
# with the ranked positions as columns.
pd.DataFrame(word_data).T