## from https://wikidocs.net/30707


In [1]:
!curl -O https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv

# 15년간 발행된 뉴스의 기사 제목 데이터

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 51.5M  100 51.5M    0     0   150M      0 --:--:-- --:--:-- --:--:--  150M


In [2]:
import pandas as pd

In [4]:
# Load the downloaded headline CSV and preview the first rows (head() shows 5 by default).
df_data = pd.read_csv('./abcnews-date-text.csv')
df_data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [25]:
# Keep only the first 10,000 headlines to keep the later steps fast.
# NOTE(review): this rebinds `df_data`, so once the cell has run with a smaller limit,
# raising the limit has no effect until the CSV is reloaded (the recorded TF-IDF shape
# further down shows 1000 rows, which looks like exactly that).
df_data = df_data.iloc[:10000]

In [26]:
# Keep only the headline column as a one-column DataFrame.
# Take an explicit copy: later cells add new columns to `head_text`, and assigning into
# a frame that was selected out of `df_data` triggers pandas' SettingWithCopyWarning.
head_text = df_data[['headline_text']].copy()
type(head_text)

pandas.core.frame.DataFrame

## Data preprocessing

In [27]:
import nltk
# Download the 'punkt' tokenizer data (presumably what nltk.word_tokenize, used in the
# tokenization cell below, relies on — confirm against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
# Series.apply behaves like map(): the tokenizer is called once per headline string.
head_text['title_text'] = head_text['headline_text'].apply(nltk.word_tokenize)
# Preview the tokenized headlines.
head_text.head(3)

Unnamed: 0,headline_text,title_text
0,aba decides against community broadcasting lic...,"[aba, decides, against, community, broadcastin..."
1,act fire witnesses must be aware of defamation,"[act, fire, witnesses, must, be, aware, of, de..."
2,a g calls for infrastructure protection summit,"[a, g, calls, for, infrastructure, protection,..."


In [30]:
from nltk.corpus import stopwords
# Download the stopword corpus that `stopwords.words(...)` reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
# English stopword list; the filtering cells below test every token against it.
stop = stopwords.words('english')
print(stop)  # prints the whole list

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

길이가 3 이하인 것도 불용어 처리하기
```
def stopWord(head_text, stop_words=None):
    """Flatten the tokenized headlines into one list, dropping stopwords and short tokens.

    A token is kept only when it is NOT a stopword AND is longer than 3 characters.
    (The earlier `or` kept every token that satisfied either condition, so long
    stopwords and short non-stopwords both slipped through.)

    Args:
        head_text: DataFrame (or mapping) whose 'title_text' entry iterates over
            lists of tokens, one list per headline.
        stop_words: Iterable of words to exclude. Defaults to the notebook-level
            `stop` list when omitted.

    Returns:
        A single flat list with the surviving tokens in their original order.
    """
    # Build the lookup once; set membership avoids rescanning a list for every token.
    excluded = frozenset(stop if stop_words is None else stop_words)
    return [
        word
        for title_tokens in head_text['title_text']
        for word in title_tokens
        if word not in excluded and len(word) > 3
    ]

def callStopWord(head_text, stop_words=None):
    """Return one filtered token list per headline.

    The earlier body appended the undefined name `word`, which raises NameError.
    This version keeps, for every headline, the tokens that are longer than
    3 characters and are not stopwords — the same rule as the lambda-based
    filtering cell in this notebook.

    Args:
        head_text: DataFrame (or mapping) whose 'title_text' entry iterates over
            lists of tokens, one list per headline.
        stop_words: Iterable of words to exclude. Defaults to the notebook-level
            `stop` list when omitted.

    Returns:
        A list with one list of surviving tokens per headline, in input order.
    """
    excluded = frozenset(stop if stop_words is None else stop_words)
    titles = []
    for title_tokens in head_text['title_text']:
        titles.append(
            [word for word in title_tokens if len(word) > 3 and word not in excluded]
        )
    return titles
  ```

In [54]:
# Also treat tokens of length 3 or less as stopwords (lambda version).
# Build the lookup once: testing against the `stop` list rescans it for every token,
# while a frozenset gives constant-time membership checks.
stop_lookup = frozenset(stop)
head_text['title'] = head_text['title_text'].apply(
    lambda words: [word for word in words if len(word) > 3 and word not in stop_lookup]
)
head_text.head(5)

Unnamed: 0,headline_text,title_text,title
0,aba decides against community broadcasting lic...,"[aba, decides, against, community, broadcastin...","[decides, community, broadcasting, licence]"
1,act fire witnesses must be aware of defamation,"[act, fire, witnesses, must, be, aware, of, de...","[fire, witnesses, must, aware, defamation]"
2,a g calls for infrastructure protection summit,"[a, g, calls, for, infrastructure, protection,...","[calls, infrastructure, protection, summit]"
3,air nz staff in aust strike for pay rise,"[air, nz, staff, in, aust, strike, for, pay, r...","[staff, aust, strike, rise]"
4,air nz strike to affect australian travellers,"[air, nz, strike, to, affect, australian, trav...","[strike, affect, australian, travellers]"


In [55]:
# Spot-check the filtered tokens of the row labelled 3.
head_text.loc[3, 'title']

['staff', 'aust', 'strike', 'rise']

In [56]:
# Join each headline's filtered words back into a single space-separated string,
# giving one text document per headline.
tokens = [' '.join(words) for words in head_text['title']]

tokens[3:5]

['staff aust strike rise', 'strike affect australian travellers']

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
# Build TF-IDF features from the space-joined headlines, with the vocabulary capped
# at 1,000 terms via max_features.
# NOTE(review): the recorded output shape shows 1000 rows although `df_data` is cut to
# 10000 rows earlier — the output looks stale; re-run from a fresh kernel to confirm.
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(tokens)
X.shape

(1000, 1000)

In [59]:
# X[4].toarray()

In [60]:
from sklearn.decomposition import LatentDirichletAllocation  # 단어 중요도 나타내기

In [74]:
# n_components is the number of topics; max_iter is the number of passes over the
# training data (comparable to epochs).
# random_state pins the random initialization so the topics below are reproducible
# on Restart & Run All instead of changing from run to run.
lda_model = LatentDirichletAllocation(max_iter=10, n_components=4, random_state=42)

In [75]:
# Fit the LDA model on the TF-IDF matrix and keep the transformed result
# (presumably one row of topic weights per headline — confirm with `lda_top.shape`).
lda_top = lda_model.fit_transform(X)

In [76]:
# Inspect the fitted topic-term weights: one row per topic, one column per vocabulary term.
lda_model.components_.shape,  lda_model.components_  # the importance weights are stored in a NumPy array

((4, 1000),
 array([[0.25028574, 0.25015379, 0.25045747, ..., 1.33722865, 0.25018368,
         1.94468439],
        [0.25026601, 0.25272886, 0.76997497, ..., 0.25880556, 0.25016465,
         0.25047884],
        [1.71641858, 0.25015515, 0.88686109, ..., 0.74087848, 0.25017042,
         0.74953161],
        [0.25278643, 1.60914125, 0.25613691, ..., 0.25043801, 1.43503111,
         2.10016754]]))

In [77]:
# Vocabulary terms, in the same order as the columns of X / lda_model.components_.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = list(tfidf.get_feature_names_out())  # list() keeps the previous list type
else:
    terms = tfidf.get_feature_names()

In [78]:
n = 5   # number of top terms to show per topic
for topic in lda_model.components_:
    # argsort is ascending, so reverse it and take the first n indices (highest weights first).
    top_indices = topic.argsort()[::-1][:n]
    print([(terms[i], topic[i]) for i in top_indices])

# Top words per topic -> many of the articles appear to have negative content.

[('murder', 7.176543782573748), ('still', 6.329541466692143), ('jailed', 4.471502771166158), ('takes', 4.345426193157762), ('funds', 4.336206406231054)]
[('fire', 7.835037243693708), ('says', 7.622906316855529), ('govt', 7.348400394351069), ('iraq', 6.064753554925244), ('police', 5.991916552128032)]
[('face', 6.783130858520391), ('death', 6.625183851146364), ('water', 5.7811805730443115), ('back', 5.706433590067808), ('warne', 5.612224303008051)]
[('council', 9.61187249991279), ('rain', 9.568336428930827), ('drought', 7.498264120048217), ('world', 5.8495079556773835), ('iraqi', 4.863659101182978)]
