In [1]:
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
news_data = fetch_20newsgroups(subset = 'all', random_state = 156) # subset 디폴트는 'train', category도 일부만 반환하는 것도 가능

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
print(news_data.keys())
# target_names : 분류값 이름
# target : 분류값 숫자


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [4]:
print(len(news_data.values()))

5


In [5]:
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# 클래스 값과 분포도

print('<클래스 숫자와 분포>')
print(pd.Series(news_data.target).value_counts().sort_index())

print('<클래스 개수와 shape>')
print(len(news_data.target_names), news_data.target.shape)

<클래스 숫자와 분포>
0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
<클래스 개수와 shape>
20 (18846,)


In [7]:
print(news_data.data[0])
# Header에 target을 결정할 수 있는 주요 단어가 있음
# data[0] : Organization에 Microsystems 있음
# => 강한 텍스트 분류 모델을 만들기 위해 Header와 Footer는 데이터셋에서 지우고 사용

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

### 학습과 테스트용 데이터 생성

In [8]:
# subset = 'train'으로 학습용(Train) 데이터만 추출. remove = ('headers', 'footers', 'quotes')로 내용만 추출
train_news = fetch_20newsgroups(subset = 'train', remove = ('headers', 'footers', 'quotes'), random_state = 156)
X_train = train_news.data
y_train = train_news.target

# subset = 'test'으로 테스트용(Test) 데이터만 추출. remove = ('headers', 'footers', 'quotes')로 내용만 추출
test_news = fetch_20newsgroups(subset = 'test', remove = ('headers', 'footers', 'quotes'), random_state = 156)
X_test = test_news.data
y_test = test_news.target

print(type(X_train), type(X_test))
print('학습 데이터 크기 {0}, 테스트 데이터 크기 {1}'.format(len(X_train), len(X_test)))

<class 'list'> <class 'list'>
학습 데이터 크기 11314, 테스트 데이터 크기 7532


### Count 피처 벡터화 변환(CountVectorizer)과 머신러닝 모델 학습/예측/평가
* 주의 : 학습 데이터에 대해 fit()된 CountVectorizer를 이용해서 테스트 데이터를 피처 벡터화 해야함.   
테스트 데이터에서 fit_transform()이나 fit()을 수행하면 안됨.
* fit을 하면 차원이 만들어짐. train과 test는 피처수(컬럼수)가 같아야 함. 행의 수는 다를 수 있음.
* 즉, Test 데이터로는 transform()만 하기

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorization으로 feature extraction 변환 수행
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect= cnt_vect.transform(X_train)

# 학습 데이터로 fit()된 CountVectorizer를 이용하여 테스트 데이터를 feature extraction 변환 수행
X_test_cnt_vect = cnt_vect.transform(X_test)

print('학습 데이터와 테스트 데이터의 피처 벡터화된 단어사전의 shape :', X_train_cnt_vect.shape, X_test_cnt_vect.shape)

학습 데이터와 테스트 데이터의 피처 벡터화된 단어사전의 shape : (11314, 101631) (7532, 101631)


In [10]:
# 희소행렬과 텍스트 분류에서 로지스틱 회귀가 잘 동작함 (지도학습-Naive Bayes, SVM도 잘 됨)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# LogisticRegression을 이용하여 학습/예측/평가 수행
lr_reg = LogisticRegression()
lr_reg.fit(X_train_cnt_vect, y_train)
pred = lr_reg.predict(X_test_cnt_vect)
print('CountVectorized Logistic Regression의 예측 정확도 : {0: .3f}'.format(accuracy_score(y_test, pred)))
# Header 포함시 정확도 더 올라갈 수 있긴 함

CountVectorized Logistic Regression의 예측 정확도 :  0.608


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### TF-IDF 피처 변환과 머신러닝 모델 학습/예측/평가

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터셋 변환
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# LogisticRegression을 이용하여 학습/예측/평가 수행
lr_reg = LogisticRegression()
lr_reg.fit(X_train_tfidf_vect, y_train)
pred = lr_reg.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression의 예측 정확도 : {0: .3f}'.format(accuracy_score(y_test,pred)))

TF-IDF Logistic Regression의 예측 정확도 :  0.674


### stop words 필터링을 추가하고 ngram을 기본 (1,1)에서 (1,2)로 변경하여 피처 벡터화

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_reg = LogisticRegression()
lr_reg.fit(X_train_tfidf_vect, y_train)
pred = lr_reg.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression의 예측 정확도 : {0: .3f}'.format(accuracy_score(y_test,pred)))

TF-IDF Logistic Regression의 예측 정확도 :  0.692


### GridSearchCV로 LogisticRegression의 C 하이퍼 파라미터 튜닝

In [13]:
from sklearn.model_selection import GridSearchCV

# 최적 C값 도출하는 튜닝 수행. CV는 3 fold로 설정.
params = {'C':[0.01, 0.1, 1, 5, 10]} # C: 규제항 계수의 역수 - C값이 커질수록 규제 정도가 약해진다.
grid_cv_lr = GridSearchCV(lr_reg, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter :', grid_cv_lr.best_params_)

# 최적 C값으로 학습된 grid_cv로 예측 수행하고 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression의 예측 정확도 : {0: .3f}'.format(accuracy_score(y_test, pred)))


Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative sol

Logistic Regression best C parameter : {'C': 10}
TF-IDF Vectorized Logistic Regression의 예측 정확도 :  0.701


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 사이킷런 파이프라인(Pipeline) 사용 및 GridSearchCV와의 결합

In [None]:
from sklearn.pipeline import Pipeline

# TfidfVectorizer 객체를 tfidf_vect 객체명으로, LogisticRegression 객체를 lr_reg 객체명으로 생성하는 Pipeline 생성
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)),
    ('lr_reg', LogisticRegression(C=10))
])

# 별도 과정(TfidfVectorizer 객체의 fit_transform()과 LogisticRegression의 fit(), predict())가 필요없음
# pipeline의 fit()과 predict()만으로 한꺼번에 Feature Vectorization과 ML 학습/예측/평가가 가능
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
print('Pipeline을 통한 Logistic Regression의 예측 정확도는 {0: .3f}}'.format(accuracy_score(y_test, pred)))


In [16]:
print('Pipeline을 통한 Logistic Regression의 예측 정확도는 {0: .3f}'.format(accuracy_score(y_test, pred)))

Pipeline을 통한 Logistic Regression의 예측 정확도는  0.701


In [17]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)),
    ('lr_reg', LogisticRegression(C=10))
])

# Pipeline에 기술된 각각의 객체 변수에 언더바(__) 2개를 연달아 붙여 GridSearchCV에서 사용할
# 파아미터/하이퍼 파라미터 이름과 값을 설정
params = {'tfidf_vect__ngram_range':[(1,1),(1,2)],
          'tfidf_vect__max_df':[300,700],
          'lr_reg__C':[5,10]
         }

grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_pipe.fit(X_train, y_train)
print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)

pred = grid_cv_pipe.predict(X_test)
print('Pipeline을 통한 Logistic Regression의 예측 정확도는 {0: .3f}'.format(accuracy_score(y_test,pred)))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative sol

{'lr_reg__C': 10, 'tfidf_vect__max_df': 300, 'tfidf_vect__ngram_range': (1, 2)} 0.7536687914006531
Pipeline을 통한 Logistic Regression의 예측 정확도는  0.701
