In [2]:
import pandas as pd
import urllib.request
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/19.%20Topic%20Modeling%20(LDA%2C%20BERT-Based)/dataset/abcnews-date-text.csv")

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
print('뉴스 제목 개수 :',len(data))


HTTPError: HTTP Error 404: Not Found

In [3]:
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
print('뉴스 제목 개수 :',len(data))

TypeError: read_csv() got an unexpected keyword argument 'error_bad_lines'

In [49]:
data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='skip')
print('뉴스 제목 개수 :',len(data))

뉴스 제목 개수 : 1244184


In [50]:
print(data.head(5))


   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


In [51]:
text = data[['headline_text']]
text.head(5)


Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [52]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [53]:
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)


In [54]:
print(text.head(5))


                                       headline_text
0  [aba, decides, against, community, broadcastin...
1  [act, fire, witnesses, must, be, aware, of, de...
2  [a, g, calls, for, infrastructure, protection,...
3  [air, nz, staff, in, aust, strike, for, pay, r...
4  [air, nz, strike, to, affect, australian, trav...


In [55]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
stop_words = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])


In [57]:
print(text.head(5))

                                       headline_text
0   [aba, decides, community, broadcasting, licence]
1    [act, fire, witnesses, must, aware, defamation]
2     [g, calls, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


In [58]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
print(text.head(5))


                                       headline_text
0       [aba, decide, community, broadcast, licence]
1      [act, fire, witness, must, aware, defamation]
2      [g, call, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])


In [60]:
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
print(tokenized_doc[:5])


0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [61]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

# 다시 text['headline_text']에 재저장
text['headline_text'] = detokenized_doc


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = detokenized_doc


In [62]:
text['headline_text'][:5]


Unnamed: 0,headline_text
0,decide community broadcast licence
1,fire witness must aware defamation
2,call infrastructure protection summit
3,staff aust strike rise
4,strike affect australian travellers


In [63]:
# 상위 1,000개의 단어를 보존
vectorizer = TfidfVectorizer(stop_words='english', max_features= 1000)
X = vectorizer.fit_transform(text['headline_text'])

# TF-IDF 행렬의 크기 확인
print('TF-IDF 행렬의 크기 :',X.shape)


TF-IDF 행렬의 크기 : (1244184, 1000)


In [64]:
lda_model = LatentDirichletAllocation(n_components=10,learning_method='online',random_state=777,max_iter=1)
lda_top = lda_model.fit_transform(X)
print(lda_model.components_)
print(lda_model.components_.shape)


[[1.00000685e-01 1.00000156e-01 1.00000751e-01 ... 1.00012161e-01
  1.00002507e-01 1.00010956e-01]
 [1.00002081e-01 1.63489327e+02 5.30598045e+02 ... 1.00009630e-01
  1.00005759e-01 1.00006963e-01]
 [1.00000479e-01 1.00000253e-01 1.00001101e-01 ... 1.00005250e-01
  1.00002314e-01 1.00008790e-01]
 ...
 [1.00000478e-01 1.00001944e-01 1.00000917e-01 ... 1.00004778e-01
  1.00002522e-01 1.00004187e-01]
 [1.00001023e-01 1.00000823e-01 1.00000911e-01 ... 1.00004481e-01
  1.00000892e-01 1.00006699e-01]
 [1.00001227e-01 1.00000112e-01 1.00000712e-01 ... 1.00006545e-01
  1.00001215e-01 1.00005108e-01]]
(10, 1000)


In [65]:
# 단어 집합. 1,000개의 단어가 저장됨.
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])

get_topics(lda_model.components_,terms)


AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [66]:
# 단어 집합. 1,000개의 단어가 저장됨.
terms = vectorizer.get_feature_names_out()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])

get_topics(lda_model.components_,terms)


Topic 1: [('australia', np.float64(20547.3)), ('sydney', np.float64(11217.02)), ('court', np.float64(7717.43)), ('change', np.float64(7573.85)), ('south', np.float64(7105.09))]
Topic 2: [('coronavirus', np.float64(35122.91)), ('covid', np.float64(28869.18)), ('queensland', np.float64(13375.0)), ('open', np.float64(6906.54)), ('world', np.float64(6321.66))]
Topic 3: [('border', np.float64(6860.6)), ('kill', np.float64(6209.25)), ('miss', np.float64(4657.9)), ('care', np.float64(4523.07)), ('interview', np.float64(4041.95))]
Topic 4: [('donald', np.float64(8534.87)), ('death', np.float64(6828.42)), ('people', np.float64(6649.43)), ('restrictions', np.float64(6435.2)), ('state', np.float64(6080.56))]
Topic 5: [('news', np.float64(8556.58)), ('vaccine', np.float64(8035.9)), ('live', np.float64(7519.57)), ('coast', np.float64(6001.4)), ('woman', np.float64(5637.57))]
Topic 6: [('trump', np.float64(14885.11)), ('case', np.float64(13136.88)), ('victoria', np.float64(11694.09)), ('government',