In [17]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [2]:
# Restrict the corpus to four newsgroups.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Fetch the train split first, then the test split, with identical options:
# headers, footers and quoted replies are stripped from every post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=subset_name,
        remove=('headers', 'footers', 'quotes'),
        categories=categories,
    )
    for subset_name in ('train', 'test')
)

In [4]:
# Raw documents and their labels, taken separately from each fetched split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [7]:
def tokenizer(text):
    """Turn a raw document into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, split into runs of two or more word characters
    or apostrophes, filtered against NLTK's English stop-word list, and each
    surviving token is passed through ``WordNetLemmatizer.lemmatize`` with its
    default arguments.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        Lemmatized tokens, in the order they appear in ``text``.
    """
    # Build the stop-word set and the lemmatizer once per document. The
    # previous version rebuilt the set (re-reading the stop-word corpus) and a
    # new lemmatizer for every single token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # The pattern only matches runs of length >= 2, so no separate length
    # check is needed on the resulting tokens.
    reg_tokens = RegexpTokenizer(r"[\w']{2,}").tokenize(text.lower())

    # Stop words are removed before lemmatization, so the filter applies to
    # the surface form of each token.
    return [lemmatizer.lemmatize(word) for word in reg_tokens if word not in stop_words]

In [10]:
# TF-IDF over unigrams and bigrams produced by the custom `tokenizer`.
# token_pattern=None states explicitly that the built-in regex tokenization is
# not used; leaving the default in place alongside `tokenizer=` makes
# scikit-learn warn that token_pattern is ignored.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    max_features=10000,
    min_df=3,
    max_df=0.5,
    ngram_range=(1, 2),
)

# The vocabulary and IDF weights are learned from the training documents only;
# the test documents are mapped with the already-fitted vectorizer.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape)



(2034, 10000) (1353, 10000)


In [16]:
# Baseline: logistic regression trained directly on the TF-IDF features.
logreg_clf = LogisticRegression(C=3, max_iter=1000).fit(X_train_tf, y_train)

# Mean accuracy on the training split, then on the test split.
print(*(
    logreg_clf.score(features, labels)
    for features, labels in ((X_train_tf, y_train), (X_test_tf, y_test))
))

0.9739429695181907 0.7605321507760532


In [21]:
# Reduce the TF-IDF features to 2000 principal components.
# NOTE(review): X_train_tf is presumably a SciPy sparse matrix; PCA on sparse
# input needs a recent scikit-learn release - confirm the pinned version.
pca = PCA(
    n_components=2000,
    random_state=0,
)

# The projection is fitted on the training matrix and then reused, unchanged,
# for the test matrix.
X_train_pca, X_test_pca = pca.fit_transform(X_train_tf), pca.transform(X_test_tf)

# Total explained-variance ratio of the kept components, then the reduced shapes.
print(f'{pca.explained_variance_ratio_.sum():.3f}')
print(*(reduced.shape for reduced in (X_train_pca, X_test_pca)))

1.000
(2034, 2000) (1353, 2000)


In [22]:
# Same classifier settings as the TF-IDF baseline, trained on whatever
# PCA-reduced features are currently held in X_train_pca / X_test_pca.
logreg_clf = LogisticRegression(C=3, max_iter=1000).fit(X_train_pca, y_train)

# Mean accuracy on the training split, then on the test split.
print(*(
    logreg_clf.score(features, labels)
    for features, labels in ((X_train_pca, y_train), (X_test_pca, y_test))
))

0.9739429695181907 0.7605321507760532


In [23]:
# Reduce the TF-IDF features to 500 principal components. This rebinds `pca`,
# `X_train_pca` and `X_test_pca`, replacing any earlier projection.
# NOTE(review): X_train_tf is presumably a SciPy sparse matrix; PCA on sparse
# input needs a recent scikit-learn release - confirm the pinned version.
pca = PCA(
    n_components=500,
    random_state=0,
)

# The projection is fitted on the training matrix and then reused, unchanged,
# for the test matrix.
X_train_pca, X_test_pca = pca.fit_transform(X_train_tf), pca.transform(X_test_tf)

# Total explained-variance ratio of the kept components, then the reduced shapes.
print(f'{pca.explained_variance_ratio_.sum():.3f}')
print(*(reduced.shape for reduced in (X_train_pca, X_test_pca)))

0.577
(2034, 500) (1353, 500)


In [24]:
# Refit the classifier (same hyper-parameters) on the PCA-reduced features
# currently held in X_train_pca / X_test_pca.
logreg_clf = LogisticRegression(C=3, max_iter=1000).fit(X_train_pca, y_train)

# Mean accuracy on the training split, then on the test split.
print(*(
    logreg_clf.score(features, labels)
    for features, labels in ((X_train_pca, y_train), (X_test_pca, y_test))
))

0.9306784660766961 0.7634885439763488
