In [13]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import numpy as np

In [2]:
# Restrict the corpus to four newsgroups and request that headers, footers and
# quoted replies be removed from every post, for both the train and test subsets.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
fetch_kwargs = dict(remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_kwargs)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_kwargs)

In [3]:
# Documents (X) and their targets (y), pulled out of each fetched subset.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [5]:
def tokenizer(text):
    """Split a document into lowercase, lemmatized word tokens.

    The text is lowercased and cut into runs of word characters and
    apostrophes. Tokens of length one are discarded, and every remaining token
    is passed through ``WordNetLemmatizer.lemmatize`` with its default
    arguments.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The lemmatized tokens, in the order they appear in ``text``.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning / SyntaxWarning on current Python versions.
    tokens = RegexpTokenizer(r"[\w']+").tokenize(text.lower())
    # Build the lemmatizer once per document instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in tokens if len(token) > 1]

In [6]:
# TF-IDF vectorizer built around the custom lemmatizing `tokenizer`.
# token_pattern=None: the default regex is never used once a tokenizer callable
# is supplied, and leaving it set makes scikit-learn warn about the unused
# parameter at fit time.
# min_df / max_df bound the document frequency of the terms that are kept.
tf = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    min_df=3,
    max_df=0.5,
    stop_words='english',
)

In [7]:
# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer to transform the test documents so both matrices share the same
# columns and the test set does not influence the fit.
X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)



In [26]:
# Multinomial Naive Bayes baseline trained directly on the TF-IDF features.
nb_clf = MultinomialNB(alpha=0.05)
nb_clf.fit(X_train_tf, y_train)

# Displayed as (train score, test score).
(
    nb_clf.score(X_train_tf, y_train),
    nb_clf.score(X_test_tf, y_test),
)

(0.9601769911504425, 0.7849223946784922)

In [18]:
# Project the TF-IDF features onto 2000 principal components (seeded for
# repeatability). The TF-IDF matrices are converted to dense arrays with
# .toarray() before being handed to PCA.
pca = PCA(n_components=2000, random_state=7)
# Fit the projection on the training data only ...
X_train_pca = pca.fit_transform(X_train_tf.toarray())
# ... and apply the same fitted projection to the test data.
X_test_pca = pca.transform(X_test_tf.toarray())

In [19]:
# Shapes before and after PCA, plus the total share of variance retained by the
# kept components.
(
    X_train_tf.shape,
    X_train_pca.shape,
    pca.explained_variance_ratio_.sum(),
)

((2034, 7581), (2034, 2000), 1.0000000000000002)

In [35]:
# Logistic regression trained on the PCA-projected features.
logreg = LogisticRegression(C=10, max_iter=1000)
logreg.fit(X_train_pca, y_train)

# Displayed as (train score, test score).
(
    logreg.score(X_train_pca, y_train),
    logreg.score(X_test_pca, y_test),
)

(0.976401179941003, 0.7597930524759793)

In [36]:
# Same reduction to 2000 components, this time with TruncatedSVD, which is given
# the TF-IDF matrices as-is (no .toarray() conversion).
svd = TruncatedSVD(n_components=2000, random_state=7)
# Fit on the training data only ...
X_train_svd = svd.fit_transform(X_train_tf)
# ... and apply the same fitted projection to the test data.
X_test_svd = svd.transform(X_test_tf)

In [37]:
# Shapes before and after TruncatedSVD, plus the total share of variance
# retained by the kept components.
(
    X_train_tf.shape,
    X_train_svd.shape,
    svd.explained_variance_ratio_.sum(),
)

((2034, 7581), (2034, 2000), 1.0000000000000002)

In [38]:
# Logistic regression trained on the SVD-projected features.
# NOTE: this rebinds `logreg`, replacing the model fitted on the PCA features.
logreg = LogisticRegression(C=10, max_iter=1000)
logreg.fit(X_train_svd, y_train)

# Displayed as (train score, test score).
(
    logreg.score(X_train_svd, y_train),
    logreg.score(X_test_svd, y_test),
)

(0.976401179941003, 0.7597930524759793)