In [17]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, TruncatedSVD
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer

In [2]:
# Newsgroups kept for this experiment (a four-class subset of the corpus).
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

In [3]:
# Fetch the train and test splits with identical settings so they stay comparable.
# The same `remove` tuple and the same `categories` list are passed to both calls.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [7]:
# Documents (.data) and labels (.target) for each split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [8]:
# A token is a run of three or more word characters and/or apostrophes.
# Raw string: in a normal string literal "\w" is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning on Python 3.12+). The regex itself
# is unchanged.
reg_token = RegexpTokenizer(r"[\w']{3,}")

In [9]:
def tokenizer(text):
    """Lower-case ``text``, split it with ``reg_token`` and lemmatize each token.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        One lemmatized token per token produced by ``reg_token``, in the
        order the tokens appear in ``text``. Lemmatization uses
        ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in reg_token.tokenize(text.lower())]

In [10]:
# token_pattern=None: the custom `tokenizer` replaces the vectorizer's default
# regex tokenization. Leaving token_pattern at its default makes scikit-learn
# warn that the parameter will not be used; the extracted features are the same.
tfidf = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
# Fit on the training documents only, then apply the fitted vectorizer to the
# test documents so both matrices share the same columns.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)



In [14]:
# Baseline: logistic regression trained on the full TF-IDF matrix.
lr_clf = LogisticRegression(max_iter=1000, C=10)
lr_clf.fit(X_train_tf, y_train)

train_score = lr_clf.score(X_train_tf, y_train)
test_score = lr_clf.score(X_test_tf, y_test)
# Printed in order: train score, test score, shape of the training matrix.
print(train_score, test_score, X_train_tf.shape)

0.9783677482792527 0.7730968218773097 (2034, 24848)


In [16]:
# PCA on densified TF-IDF matrices (.toarray() converts each one first).
# NOTE(review): the training matrix printed earlier has 2034 rows, and the
# explained-variance sum printed by this cell is ~1.0, so 2000 components
# is barely a reduction -- confirm this n_components is intended.
pca = PCA(
    n_components=2000,
    random_state=7,
)
X_train_pca = pca.fit_transform(X_train_tf.toarray())  # fit on the training split only
X_test_pca = pca.transform(X_test_tf.toarray())  # reuse the fitted PCA for test
# Sum of the per-component explained-variance ratios.
print(pca.explained_variance_ratio_.sum())

0.9999999999999998


In [19]:
# LSA: TruncatedSVD applied to the TF-IDF matrices as they are (no .toarray()).
# NOTE(review): the training matrix printed earlier has 2034 rows, and the
# explained-variance sum printed by this cell is 1.0, so 2000 components
# is barely a reduction -- confirm this n_components is intended.
lsa = TruncatedSVD(
    n_components=2000,
    random_state=7,
)
X_train_lsa = lsa.fit_transform(X_train_tf)  # fit on the training split only
X_test_lsa = lsa.transform(X_test_tf)  # reuse the fitted SVD for test
# Sum of the per-component explained-variance ratios.
print(lsa.explained_variance_ratio_.sum())

1.0
