## LSA(Latent Semantic Analysis)
- 토픽 모델링이라는 분야에 아이디어를 제공한 알고리즘
- 기본적으로 DTM이나 TF-IDF행렬에 절단된 SVD를 사용하여 차원을 축소시키고, 단어들의 잠재된 의미를 이끌어내는 방법

In [2]:
import numpy as np
import pandas as pd

A=np.array([[0,0,0,1,0,1,1,0,0],[0,0,0,1,1,0,1,0,0],[0,1,1,0,2,0,0,0,0],[1,0,0,0,0,0,0,1,1]])
A.shape

(4, 9)

### DTM 생성

In [3]:
pd.DataFrame(A) 

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


### full SVD

In [4]:
U,s,VT = np.linalg.svd(A, full_matrices =True) ## s는 특이값 분해의 결과로 특이값의 리스트를 반환

In [13]:
U.shape, s.shape, VT.shape

((4, 4), (4,), (9, 9))

In [18]:
S = np.zeros((4,9))
S[:4,:4] = np.diag(s)

In [21]:
np.round(S,2)

array([[2.69, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 2.05, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.73, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.77, 0.  , 0.  , 0.  , 0.  , 0.  ]])

In [23]:
np.allclose(A, np.dot(np.dot(U,S), VT)) ## A와 SVD행렬의 곱이 같은 것 확인

True

###  Truncated SVD(t=2)

In [28]:
U_ = U[:,:2]
S_ = S[:2,:2]
VT_= VT[:2,:]

In [30]:
U_.shape, S_.shape, VT_.shape

((4, 2), (2, 2), (2, 9))

In [31]:
T_SVD = np.dot(np.dot(U_,S_), VT_)

In [33]:
T_SVD.shape

(4, 9)

In [35]:
pd.DataFrame(np.round(T_SVD,2))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,-0.17,-0.17,1.08,0.12,0.62,1.08,-0.0,-0.0
1,0.0,0.2,0.2,0.91,0.86,0.45,0.91,0.0,0.0
2,0.0,0.93,0.93,0.03,2.05,-0.17,0.03,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
pd.DataFrame(A)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


## practice

In [37]:
from sklearn.datasets import fetch_20newsgroups

In [38]:
dataset = fetch_20newsgroups(shuffle=True, random_state=42, remove=('headers','footers',' quotes'))

In [49]:
document = dataset.data 

In [55]:
import re
from nltk.corpus import stopwords
stopword = stopwords.words('english')

def cleaning(text):
    text = re.sub('[^A-Za-z]',' ',text)
    text = text.lower()
    text = [i for i in text.split() if len(i)>2]
    
    return text

In [56]:
df = pd.DataFrame({'document': document})

In [57]:
clean_doc = [cleaning(i) for i in df['document']]

In [60]:
df['clean_doc'] = clean_doc 

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english', max_features=1000, max_df=0.5, smooth_idf=True)

In [63]:
x = tf.fit_transform(df['clean_doc'])

In [70]:
pd.DataFrame(x.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.129097,0.000000,0.0,0.0
1,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0
2,0.0,0.000000,0.0,0.094407,0.0,0.0,0.0,0.0,0.0,0.07856,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0
3,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.00000,0.163698,0.0,0.0,0.000000,0.000000,0.0,0.0
4,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.125843,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.0,0.146493,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.130949,0.000000,0.0,0.0
11310,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0
11311,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0
11312,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.10818,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0


In [71]:
from sklearn.decomposition import TruncatedSVD

In [72]:
svd = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=42)

In [73]:
svd.fit(x)

TruncatedSVD(n_components=20, n_iter=100, random_state=42)

In [80]:
pd.DataFrame(svd.components_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.011636,0.039157,0.016833,0.03194,0.016289,0.010696,0.015981,0.013993,0.011573,0.048003,...,0.010639,0.017256,0.043862,0.044729,0.018162,0.070612,0.061281,0.04027,0.014228,0.016013
1,-0.000163,0.026382,-0.013283,0.048487,-0.009475,-0.004309,-0.013601,-0.008511,-0.011574,-0.011486,...,0.009822,0.005051,-0.019574,-0.010132,-0.011332,-0.037134,-0.030641,-0.010683,-0.003274,-0.013034
2,-0.011499,-0.017181,-0.022142,0.00182,-0.008779,-0.006847,-0.015567,-0.016526,-0.015202,-0.0147,...,-0.008297,-0.012875,-0.021286,0.017412,0.002623,0.001908,-0.030445,-0.009346,-0.001695,-0.004957
3,-0.001954,-0.008246,-0.011648,0.045909,0.00135,-0.001646,-0.000345,-0.00439,0.000654,0.00606,...,-0.004336,-0.00991,-0.013738,0.008454,0.007092,-0.02973,0.001902,0.002427,-0.001971,-0.006797
4,0.00153,0.000557,-0.021527,-0.01553,-0.003707,0.000525,-0.010266,-0.001354,-0.010811,0.009721,...,-0.007082,-0.018621,-0.005427,-0.032966,0.016703,0.219792,0.072652,0.00181,0.034594,0.023036
5,0.001874,0.010715,-0.008595,0.025219,-0.006397,-0.002589,0.014918,0.012309,0.008217,0.001349,...,0.002734,-0.003171,-0.006652,-0.015383,-0.005426,-0.077802,0.012735,0.001974,-0.009573,-0.012789
6,-0.003386,-0.000939,0.001349,-0.011112,-0.000747,-0.000181,-0.006604,-0.009002,-0.004121,-0.003891,...,-0.009271,-0.009496,0.001737,0.004229,-0.002778,-0.027371,-0.008274,0.003278,-0.011072,-0.010927
7,-0.008142,-0.008289,-0.004955,-0.019161,-0.003699,0.002378,-0.004174,-0.005438,-0.007878,-0.014791,...,0.005775,-0.00269,-0.018306,-0.006401,-0.008273,-0.017414,0.003592,-0.002867,0.005065,-0.004757
8,0.011022,0.013828,-0.000278,0.040303,0.006931,0.001195,-0.000314,-0.010212,-0.003518,-0.003393,...,-0.002196,0.004811,-0.006614,0.01072,-0.008612,0.036055,-0.017363,-0.001769,0.002302,0.002662
9,-0.006814,9.5e-05,0.009893,0.012619,0.010291,0.008338,0.016199,0.017945,0.013045,-0.018593,...,0.004873,0.009598,-0.004056,-0.002242,-0.008257,0.012578,0.01849,0.001352,0.018165,0.013223


In [79]:
terms = tf.get_feature_names()

In [82]:
topics=[]
for i, topic in enumerate(svd.components_):
    topics.append(topic)
    

In [111]:
def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print(f" Topic {idx+1} ==> {[(feature_names[i], topic[i].round(4)) for i in topic.argsort()[:-n-1:-1]]}")
        print('='*60)

In [112]:
get_topics(svd.components_, terms)

 Topic 1 ==> [('edu', 0.3007), ('com', 0.2055), ('article', 0.2023), ('don', 0.153), ('just', 0.1514)]
 Topic 2 ==> [('windows', 0.3071), ('thanks', 0.2159), ('card', 0.1686), ('drive', 0.1454), ('dos', 0.144)]
 Topic 3 ==> [('edu', 0.6433), ('article', 0.298), ('apr', 0.2023), ('com', 0.1769), ('uiuc', 0.0782)]
 Topic 4 ==> [('com', 0.7923), ('netcom', 0.1382), ('article', 0.1109), ('att', 0.0741), ('key', 0.0595)]
 Topic 5 ==> [('game', 0.2823), ('team', 0.2765), ('year', 0.2198), ('games', 0.1798), ('season', 0.1515)]
 Topic 6 ==> [('key', 0.2582), ('government', 0.1911), ('chip', 0.1764), ('encryption', 0.1605), ('clipper', 0.1424)]
 Topic 7 ==> [('drive', 0.5003), ('scsi', 0.231), ('car', 0.185), ('drives', 0.1419), ('hard', 0.1379)]
 Topic 8 ==> [('thanks', 0.4376), ('mail', 0.2777), ('know', 0.1795), ('advance', 0.1726), ('looking', 0.1566)]
 Topic 9 ==> [('key', 0.3299), ('chip', 0.2356), ('god', 0.2341), ('game', 0.1825), ('team', 0.1723)]
 Topic 10 ==> [('israel', 0.2985), ('