In [1]:
import pandas as pd
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)

In [2]:
print(len(data))

1082168


In [3]:
print(data.head(5))

   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


In [4]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [6]:
import nltk

In [7]:
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
print(text.head(5))

                                       headline_text
0  [aba, decides, against, community, broadcastin...
1  [act, fire, witnesses, must, be, aware, of, de...
2  [a, g, calls, for, infrastructure, protection,...
3  [air, nz, staff, in, aust, strike, for, pay, r...
4  [air, nz, strike, to, affect, australian, trav...


In [9]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [10]:
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
print(text.head(5))

                                       headline_text
0   [aba, decides, community, broadcasting, licence]
1    [act, fire, witnesses, must, aware, defamation]
2     [g, calls, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


In [12]:
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])

In [13]:
print(tokenized_doc[:5])

0    [decides, community, broadcasting, licence]
1     [fire, witnesses, must, aware, defamation]
2    [calls, infrastructure, protection, summit]
3                    [staff, aust, strike, rise]
4       [strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [14]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [15]:
text['headline_text'][:5]

0    decides community broadcasting licence
1      fire witnesses must aware defamation
2    calls infrastructure protection summit
3                    staff aust strike rise
4       strike affect australian travellers
Name: headline_text, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000) # 상위 1,000개의 단어를 보존 
X = vectorizer.fit_transform(text['headline_text'])
X.shape # TF-IDF 행렬의 크기 확인

(1082168, 1000)

In [17]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=777,max_iter=1)

In [18]:
lda_top=lda_model.fit_transform(X)

In [19]:
print(lda_model.components_)
print(lda_model.components_.shape) 

[[1.00000524e-01 1.00000945e-01 1.00003031e-01 ... 1.00010614e-01
  1.00003498e-01 1.00001781e-01]
 [1.00003244e-01 1.00006985e-01 1.00018485e-01 ... 1.00009374e-01
  1.00004140e-01 1.00002710e-01]
 [1.00000613e-01 1.00001003e-01 1.00001719e-01 ... 1.00010032e-01
  1.00003629e-01 1.00002942e-01]
 ...
 [1.00002359e-01 1.00002223e-01 1.00008046e-01 ... 2.25245378e+03
  1.00004394e-01 1.00001389e-01]
 [1.00001729e-01 1.00001155e-01 3.56084764e+03 ... 1.00006474e-01
  1.00003539e-01 1.00001515e-01]
 [1.00001008e-01 1.00001333e-01 1.00004934e-01 ... 1.00015214e-01
  1.00003583e-01 1.00001347e-01]]
(10, 1000)


In [20]:
terms = vectorizer.get_feature_names() # 단어 집합. 1,000개의 단어가 저장됨.

In [21]:
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)

Topic 1: [('trump', 10983.42), ('state', 4433.33), ('league', 4147.04), ('open', 4035.79), ('funding', 3547.71)]
Topic 2: [('court', 7730.97), ('world', 7084.24), ('canberra', 6440.95), ('interview', 5813.89), ('country', 5368.42)]
Topic 3: [('police', 12780.13), ('government', 9184.56), ('sydney', 8885.32), ('woman', 5633.17), ('north', 5261.69)]
Topic 4: [('election', 8013.99), ('south', 6909.67), ('test', 3978.44), ('missing', 3629.12), ('power', 3624.58)]
Topic 5: [('melbourne', 7830.58), ('years', 5286.89), ('calls', 4858.96), ('final', 3758.86), ('accused', 3747.29)]
Topic 6: [('australia', 14376.71), ('death', 6201.75), ('2016', 5731.04), ('turnbull', 4364.71), ('people', 4166.9)]
Topic 7: [('adelaide', 7079.47), ('perth', 6705.92), ('charged', 5845.25), ('dies', 4730.09), ('indigenous', 4334.57)]
Topic 8: [('australian', 11564.41), ('queensland', 7981.64), ('year', 5768.28), ('brisbane', 5103.73), ('tasmania', 4820.45)]
Topic 9: [('coast', 5606.92), ('tasmanian', 5034.86), ('sc

In [23]:
########### gensim을 통한 LDA 시각화

from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

In [27]:
print(corpus[1])

[(4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


In [28]:
print(dictionary[6])

fire
