In [1]:
import pandas as pd
import numpy as np

import urllib.request
import os


## Datasets

In [2]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")

('abcnews-date-text.csv', <http.client.HTTPMessage at 0x14d45c21fd0>)

In [2]:
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
data.shape

(1082168, 2)

In [5]:
data.isna().sum()

publish_date     0
headline_text    0
dtype: int64

In [8]:
text = data[['headline_text']]

## Word_tokenize

In [9]:
from nltk.tokenize import word_tokenize
text['headline_text'] = text.apply(lambda row : word_tokenize(row['headline_text']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [10]:
text

Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."
...,...
1082163,"[when, is, it, ok, to, compliment, a, womans, ..."
1082164,"[white, house, defends, trumps, tweet]"
1082165,"[winter, closes, in, on, tasmania, as, snow, i..."
1082166,"[womens, world, cup, australia, wins, despite,..."


## Stopwords

In [11]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [12]:
text['headline_text'] = text['headline_text'].apply(lambda x : [word for word in x if word not in stop])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


## Lemmatize

In [15]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

In [16]:
text['headline_text'] = text['headline_text'].apply(lambda x : [lemma.lemmatize(word,'v') for word in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [22]:
tokenized_doc = text['headline_text'].apply(lambda x : [word for word in x if len(word)>2])

In [23]:
tokenized_doc

0               [aba, decide, community, broadcast, licence]
1              [act, fire, witness, must, aware, defamation]
2                 [call, infrastructure, protection, summit]
3                      [air, staff, aust, strike, pay, rise]
4              [air, strike, affect, australian, travellers]
                                 ...                        
1082163                   [compliment, womans, smile, guide]
1082164                 [white, house, defend, trump, tweet]
1082165           [winter, close, tasmania, snow, ice, fall]
1082166    [womens, world, cup, australia, win, despite, ...
1082167           [youtube, stunt, death, foreshadow, tweet]
Name: headline_text, Length: 1082168, dtype: object

## Tfidf

In [24]:
d_token = []

for i in range(len(tokenized_doc)):
    d_token.append(' '.join(tokenized_doc[i]))
    
text['headline_text'] = d_token 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [25]:
text

Unnamed: 0,headline_text
0,aba decide community broadcast licence
1,act fire witness must aware defamation
2,call infrastructure protection summit
3,air staff aust strike pay rise
4,air strike affect australian travellers
...,...
1082163,compliment womans smile guide
1082164,white house defend trump tweet
1082165,winter close tasmania snow ice fall
1082166,womens world cup australia win despite atapatt...


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
tf = TfidfVectorizer(stop_words='english', max_features=1000) 

In [30]:
x = tf.fit_transform(text['headline_text']) 

In [32]:
x.shape

(1082168, 1000)

## LDA
- tfidf or DTM을 feature로 받는다

In [33]:
from sklearn.decomposition import LatentDirichletAllocation

In [34]:
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', n_jobs=-1, random_state=42, max_iter=1)
### topic을 10개로 설정해주었다

In [35]:
lda_top = lda_model.fit_transform(x)

In [38]:
lda_model.components_.shape ## 문서내의 10개의 토픽이있다고 설정(그 토픽에 대하여 단어들의 유의미한 확률을 반환해준다)

(10, 1000)

In [39]:
terms = tf.get_feature_names()

In [77]:
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
        print('='*60)
        
get_topics(lda_model.components_,terms)

Topic 1: [('trump', 11599.88), ('election', 7343.21), ('make', 5394.21), ('live', 5305.5), ('plan', 4725.64)]
Topic 2: [('police', 11448.37), ('world', 6255.91), ('death', 5783.64), ('country', 5039.97), ('warn', 4963.55)]
Topic 3: [('adelaide', 6535.51), ('school', 5271.77), ('2016', 5126.33), ('years', 4884.03), ('drug', 4129.94)]
Topic 4: [('new', 12723.8), ('attack', 6705.8), ('hit', 4942.64), ('shoot', 4284.76), ('qld', 4233.77)]
Topic 5: [('win', 8420.81), ('government', 8367.32), ('canberra', 5876.62), ('test', 4786.73), ('people', 4000.04)]
Topic 6: [('say', 12743.55), ('sydney', 8087.36), ('court', 7263.05), ('interview', 5924.27), ('change', 5627.23)]
Topic 7: [('melbourne', 7136.92), ('man', 6718.01), ('home', 5443.39), ('open', 5438.24), ('crash', 4915.54)]
Topic 8: [('queensland', 7496.03), ('house', 5916.87), ('report', 5437.09), ('coast', 5180.63), ('north', 4838.26)]
Topic 9: [('australia', 13124.81), ('australian', 10711.53), ('charge', 7952.49), ('day', 6753.22), ('ma