<a href="https://colab.research.google.com/github/pdh93621/Deep-learning/blob/main/LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LSA(잠재 의미 분석)


In [44]:
import numpy as np
import pandas as pd
import urllib.request

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [47]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [48]:
# Download the NLTK data packages this notebook relies on:
# 'punkt' (tokenizer models), 'wordnet' (lemmatizer dictionary), 'stopwords' (stop-word lists).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
# Fetch the ABC News headline CSV from GitHub and store it under /content.
# Note the local file name ("...-data-text.csv") differs slightly from the remote one ("...-date-text.csv").
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv",
                           filename="/content/abcnews-data-text.csv")


('/content/abcnews-data-text.csv', <http.client.HTTPMessage at 0x7f639d574c90>)

In [50]:
# Local path of the CSV downloaded in the previous cell.
csv_filename = '/content/abcnews-data-text.csv'

In [51]:
# Load the headlines, skipping malformed rows instead of raising.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported equivalent (requires pandas >= 1.3).
data = pd.read_csv(csv_filename, on_bad_lines='skip')

In [52]:
# Inspect which columns the dataset provides.
data.columns

Index(['publish_date', 'headline_text'], dtype='object')

In [53]:
# Keep only the headline column. The explicit .copy() makes `text` an independent frame,
# so later in-place edits on it do not raise SettingWithCopyWarning.
text = data[['headline_text']].copy()

In [54]:
# Preview the headline frame (pandas truncates the display of long frames).
text

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers
...,...
1082163,when is it ok to compliment a womans smile a g...
1082164,white house defends trumps tweet
1082165,winter closes in on tasmania as snow ice falls
1082166,womens world cup australia wins despite atapat...


In [55]:
# Count distinct headlines to see how many rows are duplicates.
text.nunique()

headline_text    1054983
dtype: int64

In [56]:
# Remove duplicate headlines and renumber the rows.
# A non-mutating chain is used instead of `inplace=True`: mutating a frame that was
# sliced from `data` is what triggers pandas' SettingWithCopyWarning.
text = text.drop_duplicates().reset_index(drop=True)
print(len(text))

1054983


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


데이터 정제 및 정규화

In [57]:
# Tokenize every headline into a list of words.
# Applying the tokenizer to the column directly avoids the row-wise DataFrame.apply(axis=1),
# which builds a Series object per row and is much slower on a frame this large.
text['headline_text'] = text['headline_text'].apply(nltk.word_tokenize)

In [58]:
# Remove stop words
stop_words = stopwords.words('english')
# Membership tests against a set are O(1); the list version rescans every stop word for every token.
stop_word_set = set(stop_words)
text['headline_text'] = text['headline_text'].apply(lambda tokens: [word for word in tokens if word not in stop_word_set])

In [59]:
# Check the first rows after stop-word removal.
text.head()

Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [60]:
# Word normalization: lemmatize every token as a verb (pos='v'), e.g. third-person
# singular and past-tense forms are mapped to the base form of the verb.
# The lemmatizer is created once and reused rather than being re-instantiated for every token.
lemmatizer = WordNetLemmatizer()
text['headline_text'] = text['headline_text'].apply(lambda tokens: [lemmatizer.lemmatize(word, pos='v') for word in tokens])

In [61]:
# Drop tokens that are only 1-2 characters long.
# Note: from here on `text` is the Series of token lists, no longer the DataFrame.
text = text['headline_text'].map(lambda tokens: [token for token in tokens if len(token) > 2])
print(text.head(5))

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [62]:
# Detokenize: join each token list back into a single space-separated string.
# Iterating the Series directly avoids one label lookup (`text[i]`) per row and
# does not depend on the index being a default 0..n-1 range.
detokenized_doc = [' '.join(tokens) for tokens in text]

train_data = detokenized_doc


In [63]:
# Peek at the first five detokenized headlines.
train_data[:5]

['aba decide community broadcast licence',
 'act fire witness must aware defamation',
 'call infrastructure protection summit',
 'air staff aust strike pay rise',
 'air strike affect australian travellers']

In [64]:
# Build the document-term matrix, keeping at most 5000 vocabulary terms (max_features)
c_vectorizer = CountVectorizer(stop_words ='english', max_features = 5000)
document_term_matrix = c_vectorizer.fit_transform(train_data)

In [65]:
# Shape of the document-term matrix (DTM)
print(f'행렬의 크기: {document_term_matrix.shape}') # number of documents x vocabulary size

행렬의 크기: (1054983, 5000)


## scikit-learn TruncatedSVD 활용

In [66]:
from sklearn.decomposition import TruncatedSVD

n_topics = 10
# TruncatedSVD's default solver is randomized; pin the seed so the extracted topics
# are reproducible across runs (same seed value as the LDA model further down).
lsa_model = TruncatedSVD(n_components = n_topics, random_state = 777)
# fit_transform fits the model and returns the reduced document representation shown as the cell output.
lsa_model.fit_transform(document_term_matrix)

array([[ 1.20387553e-02, -3.66461716e-03,  1.84857705e-02, ...,
         5.92283300e-03,  1.00999177e-03,  1.63185093e-02],
       [ 2.90696575e-02, -1.09895684e-02,  1.82141845e-02, ...,
         2.48063812e-03, -1.00012290e-02,  5.74007373e-04],
       [ 5.03771554e-03, -2.03766817e-03,  9.82190391e-03, ...,
        -1.20021287e-03,  2.68521612e-03,  4.85887241e-03],
       ...,
       [ 2.95510130e-02,  5.12081512e-03,  2.53073083e-02, ...,
         2.86190546e-02,  1.80304744e-02,  1.77090174e-02],
       [ 6.21626939e-02, -9.17194009e-03,  1.31732484e-01, ...,
         9.31510895e-01,  8.01212719e-01, -4.64605005e-01],
       [ 7.14991972e-02,  2.72351890e-02, -1.10558444e-03, ...,
         5.14503351e-02,  3.28881908e-02, -3.45196857e-02]])

In [67]:
# Shape of the fitted components matrix (the recorded output is 10 topics x 5000 terms).
print(np.shape(lsa_model.components_))

(10, 5000)


In [68]:
# Vocabulary terms in column order of the document-term matrix.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()` replaces it
# (available since 1.0). `.tolist()` keeps `term` a plain list of Python strings, as before.
term = c_vectorizer.get_feature_names_out().tolist()

In [69]:
def get_topics(components, feature_names, n =5):
  """Print, for each topic, its `n` highest-weighted terms with their weights.

  components: iterable of 1-D NumPy arrays (e.g. a 2-D array), one row of term weights per topic.
  feature_names: sequence mapping a column index to its term.
  n: number of top terms printed per topic.
  """
  for topic_no, weights in enumerate(components, start=1):
    # argsort is ascending, so reverse it and keep the first n indices
    top_indices = weights.argsort()[::-1][:n]
    top_terms = [(feature_names[j], weights[j].round(5)) for j in top_indices]
    print('Topic %d' % topic_no, top_terms)

In [70]:
# Show the five highest-weighted terms of each LSA topic (n defaults to 5).
get_topics(lsa_model.components_,term)

Topic 1 [('police', 0.74635), ('man', 0.45357), ('charge', 0.21092), ('new', 0.1409), ('court', 0.11141)]
Topic 2 [('man', 0.69424), ('charge', 0.30041), ('court', 0.16911), ('face', 0.11246), ('murder', 0.10686)]
Topic 3 [('new', 0.83643), ('plan', 0.23659), ('say', 0.18243), ('govt', 0.1095), ('council', 0.1085)]
Topic 4 [('say', 0.74017), ('plan', 0.35896), ('govt', 0.16576), ('council', 0.12645), ('urge', 0.07243)]
Topic 5 [('plan', 0.73443), ('council', 0.17463), ('govt', 0.14345), ('urge', 0.08023), ('water', 0.06701)]
Topic 6 [('govt', 0.54038), ('court', 0.27435), ('urge', 0.24431), ('fund', 0.19284), ('nsw', 0.16639)]
Topic 7 [('charge', 0.52949), ('court', 0.43207), ('face', 0.35924), ('murder', 0.11759), ('plan', 0.11393)]
Topic 8 [('win', 0.59268), ('court', 0.3663), ('kill', 0.18065), ('crash', 0.14694), ('face', 0.11462)]
Topic 9 [('win', 0.59908), ('charge', 0.47901), ('australia', 0.07099), ('qld', 0.06368), ('cup', 0.06358)]
Topic 10 [('council', 0.70156), ('kill', 0.2

## LDA

In [71]:
# Build the TF-IDF matrix from the detokenized headlines.
# max_features=5000 caps the vocabulary at 5000 terms; English stop words are dropped.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tf_idf_matrix = tfidf_vectorizer.fit_transform(train_data)

print('matrix size:', tf_idf_matrix.shape)

matrix size: (1054983, 5000)


In [72]:
from sklearn.decomposition import LatentDirichletAllocation
# 10 topics, online learning, seeded for repeatability; max_iter = 1 limits training to a single iteration.
lda_model = LatentDirichletAllocation(n_components = 10, learning_method = 'online', random_state = 777, max_iter = 1)
# fit_transform trains the model and returns the per-document topic distribution shown as the cell output.
lda_model.fit_transform(tf_idf_matrix)

array([[0.0335099 , 0.0335099 , 0.0335099 , ..., 0.17024867, 0.0335099 ,
        0.0335099 ],
       [0.03365631, 0.03365631, 0.03365631, ..., 0.03365631, 0.03365631,
        0.03365631],
       [0.25184095, 0.0366096 , 0.0366096 , ..., 0.0366096 , 0.0366096 ,
        0.0366096 ],
       ...,
       [0.26687206, 0.02914502, 0.02914502, ..., 0.13007484, 0.02916018,
        0.28739608],
       [0.10378115, 0.02637829, 0.12325014, ..., 0.02637829, 0.02637829,
        0.02637829],
       [0.03376055, 0.03376055, 0.2255442 , ..., 0.03376055, 0.03376055,
        0.03376055]])

In [73]:
# Shape of the fitted LDA components matrix (the recorded output is 10 topics x 5000 terms).
print(np.shape(lda_model.components_))

(10, 5000)


In [74]:
def get_topic2(components, feature_names, n =5):
  """Print, for each topic, its `n` highest-weighted terms with their weights.

  components: iterable of 1-D NumPy arrays (e.g. a 2-D array), one row of term weights per topic.
  feature_names: sequence mapping a column index to its term.
  n: number of top terms printed per topic.
  """
  for idx, topic in enumerate(components):
    # argsort is ascending; the reversed slice yields the n largest weights, biggest first
    print('Topic %d' %(idx+1),[(feature_names[i],topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])

# The LDA model was fitted on the TF-IDF matrix, so label its columns with the TF-IDF
# vectorizer's own vocabulary rather than the count vectorizer's `term` list, and
# actually call the helper defined in this cell (it was previously defined but never used).
tfidf_terms = tfidf_vectorizer.get_feature_names_out().tolist()
get_topic2(lda_model.components_, tfidf_terms)

Topic 1 [('australia', 9359.06334), ('sydney', 5854.97288), ('attack', 4784.76322), ('change', 4193.63035), ('year', 3924.88997)]
Topic 2 [('government', 6344.07413), ('charge', 5947.12292), ('man', 4519.7974), ('state', 3658.16422), ('live', 3625.10473)]
Topic 3 [('australian', 7666.65651), ('say', 7561.01807), ('police', 5513.22932), ('home', 4048.38409), ('report', 3796.04446)]
Topic 4 [('melbourne', 5298.35047), ('south', 4844.59835), ('death', 4281.78433), ('china', 3214.44581), ('women', 3029.28443)]
Topic 5 [('win', 5704.0914), ('canberra', 4322.0963), ('die', 4025.63057), ('open', 3771.65243), ('warn', 3577.47151)]
Topic 6 [('court', 5246.3124), ('world', 4536.86331), ('country', 4166.34794), ('woman', 3983.97748), ('crash', 3793.50267)]
Topic 7 [('election', 5418.5038), ('adelaide', 4864.95604), ('house', 4478.6135), ('school', 3966.82676), ('2016', 3955.11155)]
Topic 8 [('trump', 8189.58575), ('new', 6625.2724), ('north', 3705.40987), ('rural', 3521.42659), ('donald', 3356.26