### 1) 뉴스그룹 데이터에 대한 이해
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
<br>https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

In [13]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'foters', 'quotes'))
documents = dataset.data
print('샘플의 수:', len(documents))

샘플의 수: 11314


In [73]:
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [75]:
dataset.target_names # The names of target classes

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [99]:
len(dataset.target) # The target labels

11314

In [95]:
dataset.target

array([17,  0, 17, ...,  9,  4,  9])

In [96]:
type(dataset)

sklearn.utils.Bunch

In [14]:
documents[1]

'\n\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can\'t pity you, Jim.  And I\'m sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won\'t be bummin\' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don\'t forget your Flintstone\'s Chewables!  :) \n--\nBake Timmons, III\n\n-- "...there\'s nothing higher, stronger, more wholesome and more useful in life\nthan some good memory..." -- Alyosha in Brothers Karamazov (Dostoevsky)\n'

In [15]:
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


### 2) 텍스트 전처리

In [19]:
news_df = pd.DataFrame({'document': documents})

# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', " ")

# 길이가 3 이하인 단어는 제거
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  after removing the cwd from sys.path.


In [20]:
news_df.head()

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\n\nYeah, do you expect people to ...",yeah expect people read actually accept hard a...
2,\n Although I realize that principle is not ...,although realize that principle your strongest...
3,\n Notwithstanding all the legitimate fuss ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...


In [21]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons there nothing higher stronger more wholesome more useful life than some good memory alyosha brothers karamazov dostoevsky'

In [22]:
# NLTK로부터 불용어를 받아온다.
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # 토큰화
# 불용어 제거
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

In [26]:
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons', 'nothing', 'higher', 'stronger', 'wholesome', 'useful', 'life', 'good', 'memory', 'alyosha', 'brothers', 'karamazov', 'dostoevsky']


In [28]:
print(stop_words); print(len(stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### 3) TF-IDF 행렬 만들기
불용어 제거를 위해 토큰화 작업을 수행하였지만, TfidfVectorizer는 기본적으로 토큰화가 되어있지 않은 텍스트 데이터를 입력으로 사용함. 다시 토큰화 작업을 역으로 취소하는 작업(역토큰화, Detokenization)을 수행해보자.

In [29]:
# 역토큰화(토큰화 작업을 역으로 되돌림)
detokenized_doc = []

for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
    
news_df['clean_doc'] = detokenized_doc

In [30]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons nothing higher stronger wholesome useful life good memory alyosha brothers karamazov dostoevsky'

In [31]:
# 상위 1000개의 단어를 보존
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, max_df=0.5, smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])

# TF-IDF 행렬의 크기 확인
print('TF-IDF 행렬의 크기:', X.shape)

TF-IDF 행렬의 크기: (11314, 1000)


### 4) 토픽 모델링(Topic Modeling)
사이킷런의 절단된 SVD(Truncated SVD)를 사용해서 TF-IDF 행렬을 다수의 행렬로 분해하자.
<br>뉴스그룹 데이터가 20개의 카테고리를 가지고 있었기 때문에, 20개의 토픽을 가졌다고 가정하고 토픽 모델링을 시도해보자.

In [33]:
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)

svd_model.fit(X)
len(svd_model.components_) # LSA에서 VT(n)에 해당

20

In [36]:
import numpy as np

In [37]:
np.shape(svd_model.components_) # 토픽의 수 t × 단어의 수의 크기

(20, 1000)

In [97]:
svd_model.components_[:, 0]

array([ 1.43556996e-02, -4.85774011e-03,  1.23027268e-03,  6.01449701e-03,
       -6.62011022e-03,  4.01486564e-06, -5.73294383e-03,  6.41457120e-03,
        6.37828836e-03, -4.73308808e-03, -8.94672526e-04, -3.61816924e-03,
       -2.22749189e-03,  8.96920255e-04,  7.34999788e-04, -5.85385468e-03,
       -1.00605485e-02, -4.40995739e-03, -1.99402531e-03,  9.91798532e-04])

In [100]:
terms = vectorizer.get_feature_names() # 단어 집합. 1000개의 단어가 저장됨

# 각 20개의 행의 각 1,000개의 열 중 가장 값이 큰 5개의 값을 찾아서 단어로 출력하는 함수
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), \
              [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])
        
get_topics(svd_model.components_, terms)

Topic 1: [('like', 0.20505), ('know', 0.18838), ('people', 0.18376), ('think', 0.16767), ('good', 0.14274)]
Topic 2: [('thanks', 0.3379), ('windows', 0.27465), ('mail', 0.17725), ('card', 0.17113), ('drive', 0.15578)]
Topic 3: [('game', 0.38223), ('team', 0.32242), ('year', 0.27387), ('games', 0.24544), ('season', 0.18665)]
Topic 4: [('drive', 0.51326), ('scsi', 0.20344), ('disk', 0.15638), ('hard', 0.15618), ('card', 0.15153)]
Topic 5: [('thanks', 0.37204), ('drive', 0.3638), ('know', 0.25132), ('scsi', 0.13857), ('advance', 0.12312)]
Topic 6: [('windows', 0.34853), ('know', 0.23487), ('like', 0.1898), ('think', 0.17901), ('file', 0.12958)]
Topic 7: [('like', 0.55178), ('bike', 0.1782), ('know', 0.17522), ('chip', 0.11768), ('sounds', 0.079)]
Topic 8: [('know', 0.24374), ('thanks', 0.22401), ('government', 0.21558), ('people', 0.18357), ('israel', 0.12575)]
Topic 9: [('card', 0.51616), ('video', 0.2482), ('monitor', 0.15725), ('sale', 0.15), ('drivers', 0.13072)]
Topic 10: [('like', 0

