### NMF (Non-Negative Matrix Factorization, 음수 미포함 행렬 분해)

* 음수를 포함하지 않는 행렬 X를 음수를 포함하지 않는 행렬 W와 H의 곱으로 분해하는 알고리즘.
* 수식으로 표현하면, X ≈ W × H (W와 H의 곱으로 X를 근사함)

* 장점
* 음수값이 포함되지 않은 데이터를 설명할 때 유용함.
* feature 벡터들이 서로 직교(하나가 바뀌어도 나머지에 어떤 영향도 주지 않음)하도록 제약되면 데이터셋의 실제 데이터 구조를 잘 반영하지 못하게 될 수도 있다. NMF는 이러한 직교 제약을 두지 않아 이 문제를 해결함.
* -> feature들 사이의 관계를 잘 포착(catch)하여 데이터 구조를 잘 반영함.

In [1]:
import pandas as pd
npr = pd.read_csv('npr.csv')
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [2]:
npr.info() # summary of the articles DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11992 entries, 0 to 11991
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  11992 non-null  object
dtypes: object(1)
memory usage: 93.8+ KB


In [3]:
# Training uses only the articles; there are no labels, so this is unsupervised learning.
# Unsupervised learning - LDA, clustering, dimensionality reduction

### Text Preprocessing

In [4]:
# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# max_df / min_df are document-frequency thresholds:
# either a float in 0.0 ~ 1.0 or an int (default 1).
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [8]:
# Document-Term Matrix (DTM): fit the vectorizer on the articles and
# transform them into TF-IDF weights in one step.
dtm = tfidf.fit_transform(npr['Article'])

In [9]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [10]:
# NMF (a representative topic-modeling algorithm)
from sklearn.decomposition import NMF

In [11]:
# n_components -> number of topics to extract
# NOTE(review): this rebinds the name NMF from the imported class to the model
# instance, so re-running this cell fails ('NMF' object is not callable) until
# the import cell is run again. Later cells reference the instance as NMF, so
# the name is left unchanged here; consider renaming it everywhere (e.g.
# nmf_model) in one pass.
NMF = NMF(n_components = 7, random_state = 42)

In [12]:
NMF.fit(dtm)



NMF(n_components=7, random_state=42)

In [13]:
# Vocabulary learned by the vectorizer, in column order of the DTM.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns an ndarray, so wrap it in list() to display it
# as a plain list of strings.
list(tfidf.get_feature_names_out())

['00',
 '000',
 '00000',
 '000s',
 '000th',
 '002',
 '004',
 '007',
 '009',
 '00s',
 '01',
 '011',
 '012',
 '015',
 '02',
 '021',
 '024',
 '029',
 '03',
 '032',
 '033',
 '04',
 '042',
 '05',
 '050',
 '054',
 '058',
 '06',
 '060',
 '062',
 '064',
 '065',
 '068',
 '07',
 '075',
 '08',
 '080',
 '088',
 '09',
 '094',
 '098',
 '0_hellofriend',
 '10',
 '100',
 '1000',
 '100th',
 '101',
 '101st',
 '102',
 '103',
 '104',
 '1040',
 '105',
 '105th',
 '106',
 '1066',
 '107',
 '1070',
 '108',
 '109',
 '10k',
 '10s',
 '10th',
 '11',
 '110',
 '111',
 '112',
 '113',
 '113th',
 '114',
 '114th',
 '115',
 '115th',
 '116',
 '117',
 '118',
 '119',
 '11th',
 '12',
 '120',
 '1200',
 '121',
 '122',
 '123',
 '1234',
 '124',
 '125',
 '125th',
 '126',
 '127',
 '128',
 '129',
 '12th',
 '13',
 '130',
 '1300',
 '1300s',
 '131',
 '131st',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '139',
 '13th',
 '14',
 '140',
 '1400s',
 '141',
 '142',
 '143',
 '143rd',
 '144',
 '145',
 '146',
 '147',
 '148',
 '149',

In [14]:
# Vocabulary size (number of DTM columns).
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
len(tfidf.get_feature_names_out())

54777

In [15]:
import random

# Print 10 randomly chosen vocabulary terms.
# The vocabulary is fetched once (not on every iteration) and the upper bound
# is derived from its length instead of being hard-coded.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
feature_names = tfidf.get_feature_names_out()
for i in range(10): # 0 ~ 9
    random_word_id = random.randint(0, len(feature_names) - 1)
    print(feature_names[random_word_id])

flowery
jaffe
011
behaving
straus
darien
hanning
androgynous
beckoning
florist


In [16]:
# Print another 10 randomly chosen vocabulary terms.
# The vocabulary is fetched once (not on every iteration) and the upper bound
# is derived from its length instead of being hard-coded.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
feature_names = tfidf.get_feature_names_out()
for i in range(10):
    random_word_id = random.randint(0, len(feature_names) - 1)
    print(feature_names[random_word_id])

gratification
5c
month
antarctica
raghunathan
ncse
tudors
parishes
timber
garb


In [17]:
len(NMF.components_) # topic count

7

In [18]:
NMF.components_ # (7, 54777)

array([[0.00000000e+00, 2.49950821e-01, 0.00000000e+00, ...,
        1.70313822e-03, 2.37544362e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.22048918e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 3.12379960e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.89723338e-03, 0.00000000e+00, 1.50186440e-03, ...,
        7.06428924e-04, 5.85500542e-04, 6.89536542e-04],
       [4.01763234e-03, 5.31643833e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [19]:
len(NMF.components_[0])

54777

In [20]:
single_topic = NMF.components_[0] # first topic (row 0 of components_)

In [21]:
single_topic

array([0.00000000e+00, 2.49950821e-01, 0.00000000e+00, ...,
       1.70313822e-03, 2.37544362e-04, 0.00000000e+00])

In [22]:
single_topic.argsort() # sort

array([    0, 27208, 27206, ..., 36283, 54692, 42993], dtype=int64)

In [23]:
single_topic[2475]

0.0

In [24]:
single_topic[42993]

2.0050551654185766

In [25]:
# argsort() returns the indices that would sort the array in ascending order,
# so the last 10 entries are the indices of the 10 highest-weighted words.
single_topic.argsort()[-10:] # indices of the top 10 words for this topic

array([14441, 36310, 53989, 52615, 47218, 53152, 19307, 36283, 54692,
       42993], dtype=int64)

In [26]:
# Indices of the 10 highest-weighted words in single_topic (ascending by weight).
top_word_indices = single_topic.argsort()[-10:]

In [27]:
# Print the words behind the top-10 indices of the first topic.
# The vocabulary is fetched once instead of being rebuilt on every iteration;
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
feature_names = tfidf.get_feature_names_out()
for index in top_word_indices:
    print(feature_names[index])

disease
percent
women
virus
study
water
food
people
zika
says


In [28]:
# For every topic, print its 15 highest-weighted words (ascending by weight).
# The vocabulary is fetched once up front; the original rebuilt the full
# feature-name list for every word of every topic.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
feature_names = tfidf.get_feature_names_out()
for index, topic in enumerate(NMF.components_):
    print(f'TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


TOPIC #5
['love', 've', 'don', 'album', 'way', 'time', 'song', 'life', 'really', 'know', 'people', 'think', 'just', 'music', 'like']


TOPIC #6
['teacher', 'sta

In [29]:
dtm.shape

(11992, 54777)

In [30]:
len(npr)

11992

In [31]:
# Transform the DTM with the fitted model: one row per document,
# one column per topic.
topic_results = NMF.transform(dtm)
topic_results.shape

(11992, 7)

In [32]:
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [33]:
topic_results[0].round(2)

array([0.  , 0.12, 0.  , 0.06, 0.02, 0.  , 0.  ])

In [34]:
topic_results[0].argmax()

1

In [35]:
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [36]:
# With axis=1, argmax returns the index of the highest value along each row.
topic_results.argmax(axis = 1)

array([1, 1, 1, ..., 0, 4, 3], dtype=int64)

In [37]:
# Store, for each article, the index of its highest-scoring topic.
npr['Topic'] = topic_results.argmax(axis = 1)

In [38]:
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6
