# 20 뉴스그룹 분류

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset="all", random_state=2021)

## 데이터 탐색

In [3]:
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
pd.Series(news.target).value_counts().sort_index()

0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64

In [7]:
len(news.data)

18846

In [21]:
print(news.data[0])

From: dagibbs@quantum.qnx.com (David Gibbs)
Subject: Re: Countersteering sans Hands
Organization: QNX Software Systems, Ltd.
Lines: 22

In article <1993Apr20.203344.8417@cs.cornell.edu> karr@cs.cornell.edu (David Karr) writes:
>In article <Clarke.6.735328328@bdrc.bd.com> Clarke@bdrc.bd.com (Richard Clarke) writes:
>>So how do I steer when my hands aren't on the bars? (Open Budweiser in left 
>>hand, Camel cigarette in the right, no feet allowed.) 
>
>>If I lean, and the 
>>bike turns, am I countersteering?
>
>No, the bars would turn only *toward* the direction of turn in
>no-hands steering.

Just in case the original poster was looking for a serious answer,
I'll supply one.

Yes, even when steering no hands you do something quite similar
to countersteering.  Basically to turn left, you to a quick wiggle
of the bike to the right first, causing a counteracting lean to
occur to the left.  It is a lot more difficult to do on a motorcycle
than a bicycle though, because of the extra weight. 

## 훈련/테스트용 데이터 추출

In [24]:
train_news = fetch_20newsgroups(
    subset="test", random_state=2021,
    remove=("headers","footers","quotes")
)
len(train_news.data)

7532

In [22]:
test_news = fetch_20newsgroups(
    subset='test', random_state=2021,
    remove=('headers', 'footers', 'quotes')
)
len(test_news.data)

7532

In [None]:
print(news.data[0])

In [23]:
print(train_news.data[1])

There are chips which perform the voice compression/expansion.  They can't
be expensive, because they exist in many phones connected to PBXs or on the
PBX line cards, as well as in a lot of equipment which compresses
voice-grade circuits to save the cost of long-distance, leased T1s or
satellite circuits.

I can't remember the generic term for these chips.  My impression is that
this was a big deal 10 years ago, but circuits have gotten so cheap that
it isn't done much now.

Lew



## 텍스트 데이터에 대해서 전처리

In [25]:
train_df = pd.DataFrame({"article": train_news.data})
test_df = pd.DataFrame({"article": test_news.data})

- train dataset

In [27]:
# 특수문자 제거
#train_df['article'] = train_df.article.str.replace('[^A-Za-z\$0-9]',' ')
train_df['article'] = train_df.article.str.replace('[^A-Za-z]', ' ')
train_df.article[1]

  train_df['article'] = train_df.article.str.replace('[^A-Za-z]', ' ')


'There are chips which perform the voice compression expansion   They can t be expensive  because they exist in many phones connected to PBXs or on the PBX line cards  as well as in a lot of equipment which compresses voice grade circuits to save the cost of long distance  leased T s or satellite circuits   I can t remember the generic term for these chips   My impression is that this was a big deal    years ago  but circuits have gotten so cheap that it isn t done much now   Lew '

In [28]:
# 길이가 3 이하인 단어 제거
train_df['article'] = train_df.article.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
train_df.article[1]

'There chips which perform voice compression expansion They expensive because they exist many phones connected PBXs line cards well equipment which compresses voice grade circuits save cost long distance leased satellite circuits remember generic term these chips impression that this deal years circuits have gotten cheap that done much'

In [18]:
s = "a ab abc abcd abcde"
x = []
for w in s.split():
    if len(w) > 3:
        x.append(w)
s = " ".join([w for w in s.split() if len(w) > 3])
s

'abcd abcde'

In [None]:
train_df['article'] = train_df['article'].astype(str).apply(lambda x : re.sub('r[^A-Za-z0-9]', '', x))

In [None]:
# import re
# shortword = re.compile(r"\W*\b\s{1,2}\b")
# print(soortword.sub('', train_df.article[1]))

In [None]:
# 소문자로 변환
train_df['article'] = train_df.article.apply(lambda x : x.lower())

In [None]:
# 소문자로 변환하고 길이가 3 이하인 단어 제거
train_df['article'] = train_df.article.apply(lambda x : " ".join([w.lower() for w in x.split() if len(w)>3]))

## 텍스트 변환

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# 학습 시키기
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(train_df.article)

TfidfVectorizer(stop_words='english')

In [32]:
X_train = tvect.transform(train_df.article)
X_test = tvect.transform(test_df.article)
X_train.shape, X_test.shape

((7532, 48501), (7532, 48501))

In [33]:
y_train = train_news.target
y_test = test_news.target

## 훈련/예측/평가

In [34]:
# Support Vector Machine의 Classifier 사용
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [35]:
pred = svc.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9669410515135423