# 뉴스그룹 분류
- 로지스틱회귀, 서포트벡터머신, 나이브베이지안 알고르즘들이 주로 사용된다.

In [41]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
trains_news = fetch_20newsgroups(subset='train', remove=('headers','footers','quotes'), random_state=156)
X_train = trains_news.data
y_train = trains_news.target

tests_news = fetch_20newsgroups(subset='test', remove=('headers','footers','quotes'), random_state=156)
X_test = tests_news.data
y_test = tests_news.target

print(len(X_train), len(X_test))

11314 7532


## Count 기반 피쳐벡터화
- `CountVectorizer` 
- CountVectorized Logistic Regression accuracy :  0.62


In [45]:
from sklearn.feature_extraction.text import CountVectorizer
cnt_vect = CountVectorizer()
# 테스트 데이터 피처 벡터화를 할때 fit_transform을 사용하면안된다.
# 만약 train.fit_transform / test.fit_transform을 사용하면 컬럼수가 달라짐.
cnt_vect.fit(X_train,y_train)
X_train_cnt_vect= cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

print('X_train_cnt_vect shape :', X_train_cnt_vect.shape)
print('X_test_cnt_vect shape :', X_test_cnt_vect.shape)

X_train_cnt_vect shape : (11314, 101631)
X_test_cnt_vect shape : (7532, 101631)


In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, r2_score, accuracy_score
lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train_cnt_vect, y_train)
pred = lr_model.predict(X_test_cnt_vect)
accuracy = accuracy_score(pred,y_test)
print('CountVectorized Logistic Regression accuracy : ', np.around(accuracy,2))


CountVectorized Logistic Regression accuracy :  0.62


## TFIDF 기반 피쳐벡터화
- `TfidVectorizer` 
- TFIDF Logistic Regression accuracy :  0.68

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

print('X_train_tfidf_vect.shape: ', X_train_tfidf_vect.shape)
print('X_test.shape: ',X_test_tfidf_vect.shape)

X_train_tfidf_vect.shape:  (11314, 101631)
X_test.shape:  (7532, 101631)


In [55]:
lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train_tfidf_vect,y_train)
pred =lr_model.predict(X_test_tfidf_vect)
accuracy = accuracy_score(pred,y_test)
print('TFIDF Logistic Regression accuracy : ', np.around(accuracy,2))

TFIDF Logistic Regression accuracy :  0.68


## hyper-parameter turning
TFIDF가 count 기반보다 높은 예측성능을 보인다. 
그렇다면 파라미터를 변경해서 성능을 높혀보자
- GridSearchCV best accuracy :  0.75
- TFIDF Logistic Regression accuracy :  0.69

In [58]:
tfidf_vect = TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_df=300)
tfidf_vect.fit(X_train,y_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)
print('X_train_tfidf_vect : ',X_train_tfidf_vect.shape)
print('X_test_tfidf_vect : ',X_test_tfidf_vect.shape)

lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train_tfidf_vect,y_train)
pred = lr_model.predict(X_test_tfidf_vect)
accuracy = accuracy_score(pred, y_test)
print('TFIDF Logistic Regression accuracy : ', np.around(accuracy,2))

X_train_tfidf_vect :  (11314, 943453)
X_test_tfidf_vect :  (7532, 943453)
TFIDF Logistic Regression accuracy :  0.69


GridSearchCV로 로지스틱회귀분석의 하이퍼파라미터 최적화를 해보자

In [60]:
from sklearn.model_selection import GridSearchCV
params={'C':[0.01,0.1,1,5,10]}
gridcv= GridSearchCV(lr_model, param_grid=params, cv=3, scoring='accuracy')
gridcv.fit(X_train_tfidf_vect,y_train)
print('TFIDF Logistic Regression best parameter : ', gridcv.best_estimator_)
print('TFIDF Logistic Regression best accuracy : ', np.around(gridcv.best_score_,2))

#최적 C값으로 예측
pred = lr_model.predict(X_test_tfidf_vect)
accuracy = accuracy_score(pred, y_test)
print('TFIDF Logistic Regression accuracy : ', np.around(accuracy,2))


TFIDF Logistic Regression best parameter :  LogisticRegression(C=10, solver='liblinear')
TFIDF Logistic Regression best accuracy :  0.75
TFIDF Logistic Regression accuracy :  0.69


# 결론 
- 본 데이터에서는 tfidf가 더 피처벡터화에 효과적이며 단어의 수가 많아질수록 분류 accuracy가 높아지는걸 확인함
- train 데이터셋을 교차검증해 나온 최고 점수가 0.75이지만 test데이터로 평가해본 결과 0.69이 나온 것으로 보아 파라미터를 설정한 모델이 train 데이터에 다소 과적합 되었다고 볼 수 있다.