# 1. PCA

In [1]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset = 'train',
                                     remove = ('header', 'footers', 'quotes'),
                                     categories = categories)
newsgroups_test = fetch_20newsgroups(subset = 'test',
                                    remove = ('header', 'footers', 'quotes'),
                                    categories = categories)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

cachedStopWords = stopwords.words('english')

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

X_train = newsgroups_train.data
y_train = newsgroups_train.target

X_test = newsgroups_test.data
y_test = newsgroups_test.target

RegTok = RegexpTokenizer("[\w']{3,}")
english_stops = set(stopwords.words('english'))

def tokenizer(text):
    tokens = RegTok.tokenize(text.lower())
    words = [word for word in tokens if (word not in english_stops) and len(word) > 2]
    features = (list(map(lambda token: PorterStemmer().stem(token), words)))
    return features

In [10]:
tfidf = TfidfVectorizer(tokenizer = tokenizer)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression

LR_clf = LogisticRegression()
LR_clf.fit(X_train_tfidf, y_train)
print('#Train set score: {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test)))

#Train set score: 0.992
#Test set score: 0.840


In [14]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2000, random_state = 7)
# PCA는 희소 벡터 형식에 대해 직접적인 연산 지원 X -> CountVectorizer, TfidfVectorizer를 toarray()로 변환.
X_train_pca = pca.fit_transform(X_train_tfidf.toarray())
X_test_pca = pca.transform(X_test_tfidf.toarray())

print('Original tfidf matrix shape: ', X_train_tfidf.shape)
print('PCA Converted matrix shape: ', X_train_pca.shape)
print('Sum of explained variance ratio: {:.3f}'.format(pca.explained_variance_ratio_.sum()))

Original tfidf matrix shape:  (2034, 22742)
PCA Converted matrix shape:  (2034, 2000)
Sum of explained variance ratio: 1.000


In [15]:
LR_clf.fit(X_train_pca, y_train)
print('#Train set score: {:.3f}'.format(LR_clf.score(X_train_pca, y_train)))
print('#Test set score: {:.3f}'.format(LR_clf.score(X_test_pca, y_test)))

#Train set score: 0.992
#Test set score: 0.840


## Lasso

In [16]:
lasso_clf = LogisticRegression(penalty = 'l1', solver = 'liblinear', C = 1)
lasso_clf.fit(X_train_tfidf, y_train)

print('#Train set score: {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)))

#Train set score: 0.894
#Test set score: 0.787


In [17]:
import numpy as np

print('#Used features count: {}'.format(np.sum(lasso_clf.coef_ != 0)),
     'out of', X_train_tfidf.shape[1])

#Used features count: 283 out of 22742


## Lasso와 동일하게 차원 축소

In [18]:
pca = PCA(n_components = 283, random_state = 7)

X_train_pca = pca.fit_transform(X_train_tfidf.toarray())
X_test_pca = pca.transform(X_test_tfidf.toarray())
print('PCA Converted X shape: ', X_train_pca.shape)
print('Sum of explained variance ratio: {:.3f}'.format(pca.explained_variance_ratio_.sum()))

PCA Converted X shape:  (2034, 283)
Sum of explained variance ratio: 0.438


In [20]:
LR_clf.fit(X_train_pca, y_train)
print('#Train set score: {:.3f}'.format(LR_clf.score(X_train_pca, y_train)))
print('#Test set score: {:.3f}'.format(LR_clf.score(X_test_pca, y_test)))

#Train set score: 0.948
#Test set score: 0.828


## n_components = 100

In [22]:
pca = PCA(n_components = 100, random_state = 7)

X_train_pca = pca.fit_transform(X_train_tfidf.toarray())
X_test_pca = pca.transform(X_test_tfidf.toarray())
print('PCA Converted X shape: ', X_train_pca.shape)
print('Sum of explained variance ratio: {:.3f}'.format(pca.explained_variance_ratio_.sum()))

PCA Converted X shape:  (2034, 100)
Sum of explained variance ratio: 0.251


In [23]:
LR_clf.fit(X_train_pca, y_train)
print('#Train set score: {:.3f}'.format(LR_clf.score(X_train_pca, y_train)))
print('#Test set score: {:.3f}'.format(LR_clf.score(X_test_pca, y_test)))

#Train set score: 0.908
#Test set score: 0.816


# 2. LSA