Based on the [sklearn newsgroups example](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html)

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# The four "rec.*" newsgroup categories used for this dataset.
cats = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]

In [None]:
# Fetch the train and test subsets with identical settings: restricted to
# `cats`, with headers, footers and quoted replies removed from each post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        categories=cats,
        remove=('headers', 'footers', 'quotes'),
    )
    for split in ('train', 'test')
)

In [None]:
# One row per test post: the post text and its integer target label.
df = pd.DataFrame(dict(
    text=newsgroups_test.data,
    label=newsgroups_test.target,
))

In [None]:
# TF-IDF text vectorizer constructed with all default settings.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the training posts only, then reuse the fitted
# vectorizer to transform the test posts.
vectors_train = vectorizer.fit_transform(raw_documents=newsgroups_train['data'])
vectors_test = vectorizer.transform(raw_documents=newsgroups_test['data'])

In [None]:
# Multinomial naive Bayes classifier with smoothing parameter alpha = 0.01.
clf = MultinomialNB(alpha=0.01)

In [None]:
# Train the classifier on the training vectors and their target labels.
clf.fit(X=vectors_train, y=newsgroups_train['target'])

In [None]:
# Per-class probability estimates for every test post.
pred_probs = clf.predict_proba(X=vectors_test)

In [None]:
# For each row of pred_probs, store the column index of the largest
# probability as the prediction and that probability as the confidence.
df['prediction'] = pred_probs.argmax(axis=1)
df['confidence'] = pred_probs.max(axis=1)

In [None]:
# Lookup from integer position to the target name at that position.
num_to_label = dict(enumerate(newsgroups_train['target_names']))

In [None]:
# Swap the integer ids in both columns for their readable names.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `df[col]`: that form is chained assignment
# through an inplace method, which warns in recent pandas and leaves `df`
# unmodified once Copy-on-Write is enabled.
df['label'] = df['label'].replace(num_to_label)
df['prediction'] = df['prediction'].replace(num_to_label)

In [None]:
# Accuracy: fraction of rows whose prediction equals the label.
is_correct = df['label'] == df['prediction']
len(df[is_correct]) / len(df)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Drop posts whose text is empty once surrounding whitespace is stripped.
has_content = df['text'].str.strip().ne('')
df = df.loc[has_content]

In [None]:
# Write the dataset to CSV without the index column.
df.to_csv(path_or_buf='../public/datasets/newsgroups-rec.csv', index=False)