## LDA(Latent Dirichlet Allocation)
- 문서의 집합에서 토픽을 찾아내는 프로세스 즉 토픽 모델링의 대표적 알고리즘
- LDA DTM 혹은 TF-IDF행렬을 입력으로 한다.

## Text data preprocessing

In [6]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=42, remove=('headers','footers',' quotes'))

In [3]:
document = dataset.data 

In [4]:
import re
from nltk.corpus import stopwords
stopword = stopwords.words('english')

def cleaning(text):
    text = re.sub('[^A-Za-z]',' ',text)
    text = text.lower()
    text = [i for i in text.split() if len(i)>2]
    
    return text

In [7]:
df = pd.DataFrame({'document': document})

In [8]:
clean_doc = [cleaning(i) for i in df['document']]

In [9]:
df['clean_doc'] = clean_doc 

In [10]:
df

Unnamed: 0,document,clean_doc
0,I was wondering if anyone out there could enli...,"[was, wondering, anyone, out, there, could, en..."
1,A fair number of brave souls who upgraded thei...,"[fair, number, brave, souls, who, upgraded, th..."
2,"well folks, my mac plus finally gave up the gh...","[well, folks, mac, plus, finally, gave, the, g..."
3,Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> a...,"[robert, kyanko, rob, rjck, uucp, wrote, abrax..."
4,"From article <C5owCB.n3p@world.std.com>, by to...","[from, article, owcb, world, std, com, tombake..."
...,...,...
11309,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,"[from, nyeda, cnsvax, uwec, edu, david, nye, n..."
11310,"I have a (very old) Mac 512k and a Mac Plus, b...","[have, very, old, mac, and, mac, plus, both, w..."
11311,I just installed a DX2-66 CPU in a clone mothe...,"[just, installed, cpu, clone, motherboard, and..."
11312,In article <1qkgbuINNs9n@shelley.u.washington....,"[article, qkgbuinns, shelley, washington, edu,..."


In [11]:
df.clean_doc[:5]

0    [was, wondering, anyone, out, there, could, en...
1    [fair, number, brave, souls, who, upgraded, th...
2    [well, folks, mac, plus, finally, gave, the, g...
3    [robert, kyanko, rob, rjck, uucp, wrote, abrax...
4    [from, article, owcb, world, std, com, tombake...
Name: clean_doc, dtype: object

### gensim library

In [15]:
import warnings
warnings.filterwarnings('ignore')
from gensim import corpora

In [18]:
dictionary = corpora.Dictionary(df.clean_doc)

In [28]:
len(dictionary) ## 단어의 갯수

79250

In [21]:
corpus = [dictionary.doc2bow(text) for text in df.clean_doc]

In [24]:
print(len(corpus))
print(corpus[0]) ## 44,45와 mapping된 단어는 4번이 사용되었다.

11314
[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 4), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 6), (43, 1), (44, 4), (45, 4), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1)]


In [29]:
import gensim

In [30]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics= 20, id2word=dictionary, passes=15)
## topic을 20개로 설정

#### 단어 앞의 수치는 각 토픽에대한 단어의 기여도를 말한다

In [34]:
topics = ldamodel.print_topics()
for topic in topics:
    print(topic)
    print('='*60) ## 

(0, '0.042*"the" + 0.023*"with" + 0.022*"and" + 0.020*"drive" + 0.015*"card" + 0.014*"have" + 0.014*"for" + 0.012*"scsi" + 0.010*"system" + 0.010*"this"')
(1, '0.149*"edu" + 0.120*"writes" + 0.111*"article" + 0.099*"com" + 0.056*"apr" + 0.012*"you" + 0.010*"news" + 0.010*"uiuc" + 0.010*"netcom" + 0.008*"org"')
(2, '0.042*"scx" + 0.017*"chz" + 0.016*"gcx" + 0.013*"sandvik" + 0.012*"rlk" + 0.012*"rck" + 0.010*"kent" + 0.009*"uww" + 0.009*"syx" + 0.009*"mcx"')
(3, '0.074*"the" + 0.020*"and" + 0.014*"team" + 0.013*"game" + 0.012*"for" + 0.010*"will" + 0.010*"was" + 0.009*"year" + 0.008*"but" + 0.008*"games"')
(4, '0.066*"the" + 0.035*"that" + 0.026*"and" + 0.021*"you" + 0.018*"not" + 0.014*"are" + 0.013*"this" + 0.011*"have" + 0.010*"for" + 0.009*"but"')
(5, '0.017*"pat" + 0.016*"digex" + 0.016*"helmet" + 0.014*"lib" + 0.012*"henrik" + 0.009*"libxmu" + 0.009*"xmu" + 0.009*"cyprus" + 0.008*"access" + 0.008*"com"')
(6, '0.026*"pitt" + 0.022*"gordon" + 0.020*"banks" + 0.016*"geb" + 0.013*"sur

### Topic distribution by text

In [52]:
for idx, topic_list in enumerate(ldamodel[corpus]):
    if idx == 10:
        break
        
    print(f" text_num : {idx} : {topic_list}")
    print('='*60)

 text_num : 0 : [(11, 0.087477356), (15, 0.89907825)]
 text_num : 1 : [(0, 0.22481254), (4, 0.1085002), (11, 0.32920027), (15, 0.23399226), (18, 0.09359312)]
 text_num : 2 : [(0, 0.05326696), (3, 0.030249566), (4, 0.055214375), (11, 0.19569851), (15, 0.6621454)]
 text_num : 3 : [(0, 0.09049226), (1, 0.1026771), (11, 0.31038532), (15, 0.44106385), (17, 0.04170013)]
 text_num : 4 : [(1, 0.10365048), (4, 0.29804382), (6, 0.027303578), (10, 0.0122889625), (11, 0.27612337), (15, 0.23263854), (18, 0.044315405)]
 text_num : 5 : [(1, 0.1236303), (4, 0.3717101), (6, 0.04107518), (10, 0.026564334), (15, 0.14156517), (18, 0.29312345)]
 text_num : 6 : [(11, 0.31914705), (14, 0.016620371), (15, 0.55174357), (18, 0.094680406)]
 text_num : 7 : [(0, 0.5385661), (1, 0.01861356), (4, 0.19817573), (11, 0.09643086), (12, 0.09988304), (15, 0.025428852), (19, 0.020625718)]
 text_num : 8 : [(11, 0.82249403), (15, 0.14146975)]
 text_num : 9 : [(0, 0.23261598), (11, 0.30091503), (12, 0.025311721), (15, 0.43616

#### n번째  text에서 토픽이 차지하는 비율

In [53]:
def topictable(ldamodel, corpus):
    topic_table = pd.DataFrame()
    
    for idx, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        doc = sorted(doc, key = lambda x: x[1], reverse = True) ## 내림차순 정렬

        for j,(topic_num, score) in enumerate(doc):
            if j == 0:
                topic_table = topic_table.append(pd.Series([int(topic_num), round(score,4), topic_list]), ignore_index=True)
                
            else:
                break
                
    return topic_table

In [54]:
df = topictable(ldamodel, corpus)

In [55]:
df = df.reset_index()

In [56]:
df.columns = ['문서 번호', '가장 비중높은 토픽', '비중 높은 토픽의 비중','각 토픽의 비중']

In [58]:
df

Unnamed: 0,문서 번호,가장 비중높은 토픽,비중 높은 토픽의 비중,각 토픽의 비중
0,0,15.0,0.8991,"[(11, 0.08743862), (15, 0.89911693)]"
1,1,11.0,0.3292,"[(0, 0.22483087), (4, 0.108510405), (11, 0.329..."
2,2,15.0,0.6621,"[(0, 0.053271353), (3, 0.030250825), (4, 0.055..."
3,3,15.0,0.4410,"[(0, 0.09042129), (1, 0.10267393), (11, 0.3105..."
4,4,4.0,0.2982,"[(1, 0.10364907), (4, 0.29824147), (6, 0.02730..."
...,...,...,...,...
11309,11309,15.0,0.5499,"[(4, 0.28352305), (10, 0.023721097), (14, 0.13..."
11310,11310,15.0,0.6115,"[(0, 0.35070455), (14, 0.028084742), (15, 0.61..."
11311,11311,15.0,0.7545,"[(0, 0.23475595), (15, 0.75451803)]"
11312,11312,4.0,0.4717,"[(1, 0.032270037), (4, 0.47173855), (10, 0.030..."
