#### LSA (Latent Semantic Analysis)를 이용한 뉴스 데이터 클러스터링

In [1]:
import numpy as np
import re #정규 표현식 사용
import pickle#자료형 변경없이 불러올 수 있음
from nltk.corpus import stopwords
#from sklearn.datasets import fetch_20newsgroups

In [2]:
#newsData = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

In [None]:
#with open('./data/news.data', 'wb') as f:
#    pickle.dump(newsData , f, pickle.HIGHEST_PROTOCOL)

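위 셀들의 주석을 해제하면 20 Newsgroups 데이터를 직접 내려받아 파일로 저장한다. (단, 저장 경로 './data/news.data'와 아래 셀에서 읽는 경로 './news.data'가 서로 다르므로 하나로 맞춰야 한다.)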

In [4]:
# Load the pickled news bundle from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a news.data file that you created yourself or otherwise trust.
# NOTE(review): the commented-out dump cell writes to './data/news.data' while
# this cell reads './news.data' — confirm which path is intended.
with open('./news.data', 'rb') as f:
    newsData  = pickle.load(f)

In [5]:
# Take the document texts from the loaded bundle and preview them.
news = newsData.data
print(len(news))  # number of documents
print(news[0])  # first news document

11314
Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.



In [6]:
# Show the newsgroup label names and how many there are.
print(newsData.target_names)
print(len(newsData.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
20


#### preprocessing
영문자가 아닌 문자를 제거한다.

In [7]:
# Replace every character that is not an ASCII letter with a space, so that
# digits, punctuation and newlines all turn into token separators.
# Regex reminder: inside [...] a leading ^ negates the class, so [^a-zA-Z]
# matches any single character that is NOT in a-z or A-Z. (Outside a class,
# ^ anchors the start, $ the end, + means one or more, * zero or more, and
# {2,5} means 2 to 5 repetitions.)
non_letter = re.compile(r"[^a-zA-Z]")  # compiled once, reused for every document
news1 = [non_letter.sub(" ", doc) for doc in news]

불용어를 제거하고, 모든 단어를 소문자로 변환하고, 길이가 3 이하인 단어를 제거한다.

In [8]:
stop_words = stopwords.words('english')
# Membership is tested for every token of every document; a set makes each
# test O(1) instead of a linear scan over the stop-word list.
stop_word_set = set(stop_words)

# Lower-case each token, keep it only if it is longer than 3 characters and
# is not an English stop word, then re-join the survivors with single spaces.
news2 = []
for doc in news1:
    lowered = (w.lower() for w in doc.split())
    kept = [w for w in lowered if len(w) > 3 and w not in stop_word_set]
    news2.append(' '.join(kept))

In [9]:
# Preview the first document after pre-processing.
print(news2[0])

well sure story seem biased disagree statement media ruin israels reputation rediculous media israeli media world lived europe realize incidences described letter occured media whole seem ignore subsidizing israels existance europeans least degree think might reason report clearly atrocities shame austria daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt away look jews treating races power unfortunate


In [38]:
# Number of pre-processed documents (one entry is appended per input document).
len(news2)

11314

#### TF-IDF matrix 생성

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tf_vector = TfidfVectorizer(max_features = 500)  # create the vectorizer; limit the vocabulary to 500 words

In [12]:
# fit builds the vocabulary (capped at 500 terms by the vectorizer above);
# transform then computes the TF-IDF weights for every document.
tfidf = tf_vector.fit_transform(news2)
# Usually called a term-document matrix; here rows are documents, so it is a
# document-term matrix (DTM).

In [13]:
print(tfidf.shape)  # each (1, 500) row is the TF-IDF vector of one document; there are 11314 rows
#print(tfidf[0].toarray()[0])

(11314, 500)


In [14]:
# Build the vocabulary: the term behind each column of `tfidf`, as a list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tf_vector, 'get_feature_names_out'):
    # Returns an ndarray; tolist() keeps `vocab` a plain list of str as before.
    vocab = tf_vector.get_feature_names_out().tolist()
else:
    vocab = tf_vector.get_feature_names()

In [15]:
# Preview the first 20 vocabulary terms.
print(vocab[:20])

['able', 'access', 'actually', 'address', 'advance', 'agree', 'allow', 'almost', 'already', 'also', 'although', 'always', 'american', 'among', 'anonymous', 'another', 'answer', 'anti', 'anybody', 'anyone']


전처리 끝!

#### Latent Semantic Analysis (LSA) :(U S V^t분해!)

In [5]:
from sklearn.decomposition import TruncatedSVD #주제 개수 만큼 S행렬을 자른다.

In [17]:
# n_components is the number of topics; it is set to the number of newsgroup
# labels (20) here, but that is an arbitrary choice — there is no fixed
# "correct" topic count. n_iter=100 is the number of solver iterations.
# NOTE(review): random_state is not set, so results may differ between runs —
# confirm whether reproducibility is wanted.
svd = TruncatedSVD(n_components = len(newsData.target_names), n_iter=100) 

In [18]:
# Run the decomposition; the fitted estimator then holds the information for
# the U, S and VT matrices used below.
# NOTE(review): a later cell calls svd.fit_transform(tfidf), which fits again —
# confirm whether this separate fit is still needed.
svd.fit(tfidf)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
       random_state=None, tol=0.0)

#### U, S, VT 행렬
U 행렬 ~ 차원 = (문서 개수 X topic 개수) : 문서별 topic 분포<br>
S 행렬 ~ 차원 = (topic 개수 X topic 개수) : 특이값(singular value)으로 이루어진 대각 행렬<br>
VT 행렬 ~ 차원 = (topic 개수 X 단어 개수) : topic별 단어 가중치 (분포)

In [19]:
# Recover U: per the scikit-learn docs, fit_transform returns the reduced
# document matrix (U scaled by the singular values), so dividing column-wise
# by singular_values_ leaves U.
# NOTE(review): this fits `svd` again although the previous cell already
# called svd.fit(tfidf) — confirm the refit is intended.
U = svd.fit_transform(tfidf) / svd.singular_values_
U.shape
# (number of documents, number of topics)

(11314, 20)

In [20]:
# VT is taken from the fitted estimator's components_: one row per topic.
VT = svd.components_
VT.shape
# (number of topics, number of words)

(20, 500)

In [21]:
# Place the singular values on the diagonal of a square matrix.
S = np.diag(svd.singular_values_)
S.shape
# (number of topics, number of topics) — truncated SVD keeps only that many singular values

(20, 20)

#### Topic별로 문서 분류
U 행렬의 각 행(문서)에서 값이 가장 큰 column을 선택한다. 그 column의 번호가 해당 문서의 topic이다.

In [22]:
U[0:1, :]  # first row of U; U is (number of documents, number of topics) = (11314, 20)
# Open question: why do negative values appear here? Their meaning is ambiguous.
# NOTE(review): the original note says "LSD" has no such ambiguity — presumably LDA is meant; confirm.

array([[ 0.00834054, -0.01091546, -0.00328674, -0.00442068, -0.00456436,
         0.00239253,  0.00031934, -0.00100528,  0.00407314, -0.00194108,
        -0.00194613,  0.0092436 ,  0.00233742, -0.00271065, -0.00185346,
         0.00304093, -0.00786485,  0.01936485, -0.01445553,  0.00790409]])

In [23]:
# Look at the first 10 documents only: each one is assigned to the topic whose
# entry in its row of U is largest (topic numbers are reported 1-based).
for i, doc_topics in enumerate(U[:10]):
    print('문서-{:d} : topic = {:d}'.format(i + 1, np.argmax(doc_topics) + 1))
# The U matrix therefore lets us group documents by topic.

# In the recorded output, documents 1 and 3 (not 1 and 2) share a topic.

문서-1 : topic = 18
문서-2 : topic = 1
문서-3 : topic = 18
문서-4 : topic = 6
문서-5 : topic = 4
문서-6 : topic = 9
문서-7 : topic = 1
문서-8 : topic = 7
문서-9 : topic = 8
문서-10 : topic = 13


#### VT 행렬에서 topic 별로 중요 단어를 표시한다

In [24]:
# (number of topics, number of words)
VT.shape

(20, 500)

In [25]:
# For every topic (one row of VT) print the 10 terms with the largest weights,
# most important first.
for topic_no, weights in enumerate(VT, start=1):
    # argsort gives indices in ascending weight order; reverse it, keep 10.
    top_idx = weights.argsort()[::-1][:10]
    header = '토픽-{:2d} : '.format(topic_no)
    words = ''.join('{:s} '.format(vocab[n]) for n in top_idx)
    print(header + words)

# Pre-processing is still imperfect: generic words such as "would", "like"
# and "know" dominate several topics and should be added to the stop words.

토픽- 1 : would like know people think good also could time well 
토픽- 2 : thanks windows please anyone mail card know advance drive file 
토픽- 3 : would thanks anyone know like please could mail someone advance 
토픽- 4 : game team year games good last season players play hockey 
토픽- 5 : would like drive system windows card scsi disk team problem 
토픽- 6 : drive please scsi hard mail sale would email drives people 
토픽- 7 : drive know like anyone scsi drives hard something card think 
토픽- 8 : like please sale mail email offer something send list interested 
토픽- 9 : think windows people please card jesus thanks believe bible mail 
토픽-10 : good card think sale price bike also much looking offer 
토픽-11 : card people video know sale monitor government drivers price offer 
토픽-12 : think chip system could encryption clipper need government space much 
토픽-13 : could thanks right card problem much bike well someone advance 
토픽-14 : good people windows file government files thanks drive would year 
토픽

In [40]:
# Show the original text of document 1 for comparison with document 3.
print(news[0])

Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.



In [27]:
# Show the original text of document 3.
print(news[2])

Although I realize that principle is not one of your strongest
points, I would still like to know why do do not ask any question
of this sort about the Arab countries.

   If you want to continue this think tank charade of yours, your
fixation on Israel must stop.  You might have to start asking the
same sort of questions of Arab countries as well.  You realize it
would not work, as the Arab countries' treatment of Jews over the
last several decades is so bad that your fixation on Israel would
begin to look like the biased attack that it is.

   Everyone in this group recognizes that your stupid 'Center for
Policy Research' is nothing more than a fancy name for some bigot
who hates Israel.


#### A = U x S x VT
이 값은 tfidf matrix를 truncated SVD(topic 20개)로 분해한 뒤 다시 곱해 복원한 근사 행렬이다. 차원은 원래의 tfidf matrix와 같다.

In [28]:
# Rebuild the low-rank approximation: scale the topic rows of VT by the
# singular values first, then project through U.
A = U @ (S @ VT)

In [29]:
# Shape of the reconstructed matrix.
A.shape

(11314, 500)

In [30]:
# Shape of the original TF-IDF matrix, for comparison with A.
tfidf.shape

(11314, 500)

In [31]:
# Display the reconstructed matrix.
A

array([[ 0.00671794,  0.00212366,  0.01166669, ...,  0.01825244,
         0.02951324,  0.02430763],
       [ 0.01198476,  0.00455351,  0.01565654, ...,  0.01271674,
        -0.01521117, -0.00296277],
       [ 0.01327301,  0.00209872,  0.01144589, ...,  0.01295922,
         0.0428024 ,  0.03125753],
       ...,
       [ 0.00355683, -0.00235559,  0.00711601, ...,  0.00302724,
         0.00996596,  0.00567007],
       [ 0.01201571,  0.02172375,  0.00327202, ..., -0.00319135,
        -0.09744651, -0.02149235],
       [ 0.0133031 ,  0.002536  ,  0.02125163, ...,  0.0140308 ,
         0.11475994,  0.0528168 ]])

In [32]:
# Display the original TF-IDF matrix as a dense array, for comparison with A.
tfidf.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.17877864, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.13438696, ..., 0.        , 0.12961864,
        0.24839851]])

#### 문서 유사도 측정

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

#### 문서-1과 문서-2의 유사도

In [34]:
# Cosine similarity of documents 1 and 2 using their rows of the reconstructed matrix A.
cosine_similarity(A[0:1, :], A[1:2, :])

array([[0.26719502]])

In [35]:
# Same comparison on the raw TF-IDF rows of documents 1 and 2.
# cosine_similarity accepts sparse input, so slice the two rows directly
# instead of densifying the whole matrix twice with toarray().
cosine_similarity(tfidf[0:1], tfidf[1:2])

array([[0.01992352]])

#### 문서-1과 문서-3의 유사도
문서-1은 문서-2보다 문서-3과 더 유사하다

In [36]:
# Cosine similarity of documents 1 and 3 using their rows of the reconstructed matrix A.
cosine_similarity(A[0:1, :], A[2:3, :])

array([[0.69869725]])

In [37]:
# Same comparison on the raw TF-IDF rows of documents 1 and 3.
# cosine_similarity accepts sparse input, so slice the two rows directly
# instead of densifying the whole matrix twice with toarray().
cosine_similarity(tfidf[0:1], tfidf[2:3])

array([[0.11930785]])

#### Q) U와 VT에 있는 음수 값은 무엇을 의미하는가?
참고 : Non-negative Matrix Factorisation (NMF)<br>
from sklearn.decomposition import NMF