In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import numpy as np

In [2]:
# Work with a four-newsgroup subset; the loader is asked to strip headers,
# footers and quoted replies from every post.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Same fetch settings for both splits; only `subset` differs.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        remove=('headers', 'footers', 'quotes'),
        categories=categories,
    )
    for split in ('train', 'test')
)

In [3]:
# Documents for each split (fed to the vectorizer below).
X_train = newsgroups_train.data
X_test = newsgroups_test.data
# Class labels aligned with the documents above.
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [5]:
def my_tokenizer(doc):
    """Split a document into lower-cased, lemmatized word tokens.

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    list
        One lemmatized token per run of word characters / apostrophes found
        in the lower-cased text. Tokens that are a single character long
        (before lemmatization) are discarded.
    """
    tokens = RegexpTokenizer(r"[\w']+").tokenize(doc.lower())
    # Build the lemmatizer once per document instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in tokens if len(token) > 1]

In [7]:
# TF-IDF features using the custom lemmatizing tokenizer.
# token_pattern=None: the default regex pattern is ignored whenever a
# tokenizer is supplied, and leaving it set makes scikit-learn warn about an
# unused parameter; the resulting vocabulary is unaffected.
# NOTE(review): the built-in 'english' stop list is not run through
# my_tokenizer, so lemmatized forms may not match it exactly - confirm
# whether that matters for this analysis.
tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,
    min_df=3,       # keep terms that occur in at least 3 documents
    max_df=0.5,     # drop terms that occur in more than half of the documents
    stop_words='english',
)
# Learn the vocabulary / IDF weights on the training split only, then reuse
# them for the test split.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
X_train_tf.shape, X_test_tf.shape



((2034, 7581), (1353, 7581))

In [14]:
# Multinomial naive Bayes fitted on the TF-IDF matrix.
nb_clf = MultinomialNB(alpha=0.05)
nb_clf.fit(X_train_tf, y_train)
# (train score, test score)
(
    nb_clf.score(X_train_tf, y_train),
    nb_clf.score(X_test_tf, y_test),
)

(0.9601769911504425, 0.7849223946784922)

In [20]:
# Logistic regression fitted on the same TF-IDF features.
logreg = LogisticRegression(C=3, max_iter=1000)
logreg.fit(X_train_tf, y_train)
# (train score, test score)
(
    logreg.score(X_train_tf, y_train),
    logreg.score(X_test_tf, y_test),
)

(0.9744346116027532, 0.7612712490761271)

In [22]:
# Project the TF-IDF features onto 2000 principal components. The sparse
# matrices are converted to dense arrays before being handed to PCA.
pca = PCA(n_components=2000, random_state=7)
X_train_pca, X_test_pca = (
    pca.fit_transform(X_train_tf.toarray()),  # fit on the training split
    pca.transform(X_test_tf.toarray()),       # reuse the fitted projection
)
# Shapes of both projections plus the total explained-variance ratio.
(
    X_train_pca.shape,
    X_test_pca.shape,
    pca.explained_variance_ratio_.sum(),
)

((2034, 2000), (1353, 2000), 1.0)

In [24]:
# Same logistic-regression settings as before, now on the PCA projection.
# Note: this rebinds `logreg`, replacing the TF-IDF model.
logreg = LogisticRegression(C=3, max_iter=1000)
logreg.fit(X_train_pca, y_train)
# (train score, test score)
(
    logreg.score(X_train_pca, y_train),
    logreg.score(X_test_pca, y_test),
)

(0.9744346116027532, 0.7612712490761271)

In [26]:
# Truncated SVD with 2000 components, applied directly to the sparse
# TF-IDF matrices (no densifying step).
svd = TruncatedSVD(n_components=2000, random_state=7)
X_train_lsa, X_test_lsa = (
    svd.fit_transform(X_train_tf),  # fit on the training split
    svd.transform(X_test_tf),       # reuse the fitted decomposition
)
# Shapes of both projections plus the total explained-variance ratio.
(
    X_train_lsa.shape,
    X_test_lsa.shape,
    svd.explained_variance_ratio_.sum(),
)

((2034, 2000), (1353, 2000), 1.0000000000000002)

In [27]:
# Truncated SVD again, this time keeping only 100 components.
# Note: this rebinds `svd`, `X_train_lsa` and `X_test_lsa`.
svd = TruncatedSVD(n_components=100, random_state=7)
X_train_lsa, X_test_lsa = (
    svd.fit_transform(X_train_tf),  # fit on the training split
    svd.transform(X_test_tf),       # reuse the fitted decomposition
)
# Shapes of both projections plus the total explained-variance ratio.
(
    X_train_lsa.shape,
    X_test_lsa.shape,
    svd.explained_variance_ratio_.sum(),
)

((2034, 100), (1353, 100), 0.22024869908357228)

In [31]:
# Cosine similarity between training document 0 and every training document
# (document 0 itself included) in the reduced space.
sim_result = cosine_similarity([X_train_lsa[0]], X_train_lsa)
# A single descending argsort drives both the index list and the score list,
# so position k in one always describes position k in the other.
sim_index = sim_result[0].argsort()[::-1]
# .tolist() yields native Python floats, so the printed list stays plain
# numbers regardless of how the installed NumPy version reprs its scalars.
sim_result_sort = sim_result[0][sim_index].round(2).tolist()
print(sim_index[:20], '\n', sim_result_sort[:20])

[   0 1892 1995 1575 1728 1957  501 1674 1029 1209  790 1826 1902 1038
  998 1089 1290  892  651 1490] 
 [1.0, 0.76, 0.74, 0.74, 0.73, 0.72, 0.71, 0.69, 0.68, 0.67, 0.67, 0.67, 0.65, 0.62, 0.6, 0.58, 0.58, 0.58, 0.58, 0.57]
