# 20 뉴스 그룹 분류

In [1]:
import numpy as np
import pandas as pd
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this would also hide any convergence warnings raised by the
# model fits further down - confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the complete corpus (subset='all') for the exploration cells below.
news = fetch_20newsgroups(
    subset='all',
    random_state=2021,
)


- 데이터 탐색

In [3]:
# List the fields exposed by the loaded dataset object.
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Names of the newsgroup categories used as class labels.
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Number of documents per integer target label (class balance check).
np.unique(news.target,return_counts=True)


(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([799, 973, 985, 982, 963, 988, 975, 990, 996, 994, 999, 991, 984,
        990, 987, 997, 910, 940, 775, 628], dtype=int64))

In [6]:
# Print the first raw document; headers and quoted replies are still present here.
print(news.data[0])

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

- Train / Test data 추출

In [7]:
# Training split, with headers, footers and quoted replies removed from each post.
train_news = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    random_state=2021,
)
X_train, y_train = train_news.data, train_news.target

In [8]:
# Preview the first 1000 characters of training document 10.
print(train_news.data[10][:1000])

Usually when I start up an application, I first get the window outline
on my display. I then have to click on the mouse button to actually
place the window on the screen. Yet when I specify the -geometry 
option the window appears right away, the properties specified by
the -geometry argument. The question now is:

How can I override the intermediary step of the user having to specify
window position with a mouseclick? I've tried explicitly setting window
size and position, but that did alter the normal program behaviour.

Thanks for any hints
---> Robert

PS: I'm working in plain X.




In [9]:
# Integer label of training document 10 together with its category name.
train_news.target[10],train_news.target_names[train_news.target[10]]

(5, 'comp.windows.x')

In [10]:
# Test split, stripped the same way as the training split.
test_news = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    random_state=2021,
)
X_test, y_test = test_news.data, test_news.target

In [11]:
# Number of training and test documents.
len(X_train),len(X_test)

(11314, 7532)

- 피처 벡터화 변환과 머신러닝 모델 학습/평가

- Case 1) CountVectorizer + LogisticRegression

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts. The vocabulary is learned from the training documents
# only; fit_transform learns it and vectorizes the training set in one pass
# instead of tokenizing the training corpus twice (fit, then transform).
cvect = CountVectorizer()
X_train_cv = cvect.fit_transform(X_train)
# The test documents are only transformed, using the training vocabulary.
X_test_cv = cvect.transform(X_test)


In [39]:
# Shapes of the train and test count matrices; both share the same column count.
X_train_cv.shape,X_test_cv.shape

((11314, 101631), (7532, 101631))

In [40]:
from sklearn.linear_model import LogisticRegression 
# Baseline classifier with default hyperparameters on the raw count features.
lr = LogisticRegression()
# %time reports the wall-clock cost of the fit.
%time lr.fit(X_train_cv,y_train)
# Score the fitted model on the held-out test split.
lr.score(X_test_cv,y_test)

Wall time: 36.7 s


0.6043547530536378

- Case 2) TfidfVectorizer + LogisticRegression

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features. The vocabulary and IDF weights are learned from the training
# documents only; fit_transform learns them and vectorizes the training set in
# one pass instead of tokenizing the training corpus twice.
tvect = TfidfVectorizer()
X_train_tv = tvect.fit_transform(X_train)
# The test documents are only transformed, using the training vocabulary.
X_test_tv = tvect.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression 
# Same default classifier as the count-based case, now on the TF-IDF features.
# Note: this rebinds `lr`, replacing the model fitted on the count features.
lr = LogisticRegression()
# %time reports the wall-clock cost of the fit.
%time lr.fit(X_train_tv,y_train)
# Score the fitted model on the held-out test split.
lr.score(X_test_tv,y_test)

- case 3) Stop_words filtering, max_df=300, ngram_range = (1,2) 

In [34]:
def auto(model, vect, i, max_df=1.0):
    """Vectorize the corpus, train ``model`` on it and return its test score.

    Builds a vectorizer with English stop-word filtering over 1- to ``i``-grams,
    fits it on the notebook-level ``X_train``, transforms ``X_train`` and
    ``X_test``, fits ``model`` against ``y_train`` and scores it against
    ``y_test``.

    Side effect: the fitted vectorizer and both feature matrices are stored as
    notebook globals named ``{vect}{i}``, ``X_train_{vect}{i}`` and
    ``X_test_{vect}{i}`` (for example ``tvect2``, ``X_train_tvect2``,
    ``X_test_tvect2``) so later cells can inspect them.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
    vect : str
        ``'cvect'`` selects CountVectorizer, ``'tvect'`` selects TfidfVectorizer.
    i : int
        Upper bound of the n-gram range; the range used is ``(1, i)``.
    max_df : float or int, default 1.0
        Forwarded to the vectorizer's ``max_df``. The default equals the
        vectorizer's own default, so existing calls behave as before.

    Returns
    -------
    The value returned by ``model.score`` on the transformed test set.

    Raises
    ------
    ValueError
        If ``vect`` is not ``'cvect'`` or ``'tvect'``.
    """
    # Validate first: previously an unknown name skipped both branches and then
    # either raised an opaque KeyError or silently reused a stale global.
    if vect not in ('cvect', 'tvect'):
        raise ValueError(f"vect must be 'cvect' or 'tvect', got {vect!r}")

    vectorizer_cls = CountVectorizer if vect == 'cvect' else TfidfVectorizer
    vectorizer = vectorizer_cls(
        stop_words='english',
        ngram_range=(1, i),
        max_df=max_df,
    )

    # Learn the vocabulary and vectorize the training set in a single pass.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Publish the artifacts under the same global names as before.
    g = globals()
    g[f'{vect}{i}'] = vectorizer
    g[f'X_train_{vect}{i}'] = X_train_vec
    g[f'X_test_{vect}{i}'] = X_test_vec

    model.fit(X_train_vec, y_train)
    return model.score(X_test_vec, y_test)

In [None]:
lr = LogisticRegression(max_iter=300)
%time lr.fit(X_train_tv2,y_train)
lr.score(X_test_tv2,y_test)

- Case 4) Case 3에서 LogisticRegression의 C 값을 10으로 변경

In [None]:
# TF-IDF over unigrams and bigrams, with C raised to 10 on the classifier.
auto(
    model=LogisticRegression(C=10, max_iter=100),
    vect='tvect',
    i=2,
)

- Pipeline

In [42]:
from sklearn.pipeline import Pipeline
# Chain TF-IDF vectorization (English stop words removed) with the classifier
# so both stages can be tuned together by a single search.
pipeline = Pipeline(steps=[
    ('tvect', TfidfVectorizer(stop_words='english')),
    ('lr', LogisticRegression()),
])

In [43]:
# Hyperparameter grid for the pipeline search. Keys follow the
# '<step name>__<parameter name>' convention, so the n-gram setting must be
# spelled 'tvect__ngram_range' (single underscore inside the parameter name);
# the previous 'tvect__ngram__range' is not a valid parameter and made the
# search fail.
params = {
    'tvect__max_df': [300, 700],
    'tvect__ngram_range': [(1, 1), (1, 2)],
    'lr__C': [1, 10],
}

In [44]:
from sklearn.model_selection import GridSearchCV

In [45]:
# 3-fold cross-validated grid search over the pipeline, ranked by accuracy and
# run on all available cores.
grid_pipe = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the raw training texts; vectorization happens inside the pipeline.
grid_pipe.fit(X_train,y_train)

In [None]:
# Parameter combination selected by the search.
grid_pipe.best_params_

In [None]:
# Score the best pipeline found by the search on the raw test documents.
grid_pipe.best_estimator_.score(X_test,y_test)