In [1]:
import pandas as pd
import numpy as np
from khaiii import KhaiiiApi
import gensim
from pprint import pprint

In [2]:
# Load the paper corpus. The kor_full column is the text that the tokenization
# cells below feed to khaiiiTokenizer.
data = pd.read_csv("./modi_data/kor_full.csv")
data.head()

Unnamed: 0,no,year,title,kor_full
0,1,1997,멀티미디어 원격교육에 관한 연구,멀티미디어 원격교육에 관한 연구
1,2,1997,교육용 하이퍼미디어 자료 편집기에 관한 연구,교육용 하이퍼미디어 자료 편집기에 관한 연구
2,3,1997,인터넷 기반의 코스웨어의 설계 및 구현,인터넷 기반의 코스웨어의 설계 및 구현
3,4,1997,Web에서의 협력 환경 구축 방안 연구,Web에서의 협력 환경 구축 방안 연구
4,5,1997,열린교육에서의 개별화수업과 CAI,열린교육에서의 개별화수업과 CAI


In [3]:
# Correction tables applied to the Khaiii output. Both sheets are read through
# their "기존" (original token) and "수정" (correction) columns.
dfWordList = pd.read_excel("./khaiii_word_cor.xlsx")
dfWordList2 = pd.read_excel("./khaiii_word_cor_etc.xlsx")

# First sheet: rows whose correction is "삭제" (delete) list tokens to drop,
# every other row maps an original token to its replacement.
dfWordDel = dfWordList[dfWordList["수정"] == "삭제"]
dfWordMod = dfWordList[dfWordList["수정"] != "삭제"]
# Second sheet: tokens that are replaced by several tokens.
dfWordDiv = dfWordList2

# Tokens to remove.
seriesDelete = dfWordDel["기존"]
stopword = list(seriesDelete.values)

# Parallel lists: modiword[k] is replaced by modiword2[k].
seriesModify = dfWordMod["기존"]
modiword = list(seriesModify.values)

seriesModify2 = dfWordMod["수정"]
modiword2 = list(seriesModify2.values)

# Parallel lists: divword[k] is replaced by the token list divword2[k],
# obtained by splitting the correction cell on ", ".
seriesDivide = dfWordDiv["기존"]
divword = list(seriesDivide.values)

seriesDivide2 = dfWordDiv["수정"]
divword2 = [words.split(', ') for words in seriesDivide2.values]

In [4]:
api = KhaiiiApi()
def khaiiiTokenizer(raw, stopword=stopword, pos=['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL'], minLen=1):
    """Tokenize ``raw`` with Khaiii and apply the project's token corrections.

    Args:
        raw: Text passed to ``api.analyze``.
        stopword: Morphemes to drop (defaults to the module-level ``stopword``
            list as it exists when this function is defined).
        pos: Part-of-speech tags to keep. The default set is common noun,
            proper noun, dependent noun, pronoun, numeral and foreign word.
        minLen: A morpheme is kept only if ``len(lex) > minLen``, so the
            default of 1 drops single-character morphemes.

    Returns:
        A list of tokens in analysis order. Kept morphemes are rewritten as
        follows:
          * an ``'인공'`` morpheme directly followed by ``'지능'`` inside the
            same analysed word is emitted as the single token ``'인공지능'``,
            bypassing the tag/length/stopword filter;
          * ``SL`` morphemes are lower-cased (after the stopword check);
          * a morpheme found in ``divword`` is replaced by all tokens of the
            matching ``divword2`` entry;
          * otherwise a morpheme found in ``modiword`` is replaced by the
            matching ``modiword2`` entry.
    """
    tokens = []          # named ``tokens`` so the builtin ``list`` is not shadowed
    skip_next = False    # True right after '인공' consumed the following '지능'

    for word in api.analyze(raw):
        morphs = word.morphs

        for i, morph in enumerate(morphs):
            if skip_next:
                # This morpheme is the '지능' that was already merged.
                skip_next = False
                continue

            # Work on a local copy so the analyzer's morph objects stay untouched.
            lex = morph.lex

            if lex == '인공' and i + 1 < len(morphs) and morphs[i + 1].lex == "지능":
                tokens.append(lex + morphs[i + 1].lex)
                skip_next = True
                continue

            if len(lex) <= minLen or morph.tag not in pos or lex in stopword:
                continue

            if morph.tag == 'SL':
                lex = lex.lower()

            if lex in divword:
                # One morpheme expands into several tokens.
                tokens.extend(divword2[divword.index(lex)])
            elif lex in modiword:
                tokens.append(modiword2[modiword.index(lex)])
            else:
                tokens.append(lex)

    return tokens

In [5]:
# Tokenize every document of the kor_full column with the tokenizer defaults.
tokenized = data["kor_full"].apply(khaiiiTokenizer)
print(tokenized)
#tokenized.to_csv("./modi_data/token_khaiii.csv")
print("========= tokenization completed =========")

0                                     [멀티미디어, 원격, 교육, 연구]
1                               [교육, 하이퍼미디어, 자료, 편집기, 연구]
2                                     [인터넷, 코스웨어, 설계, 구현]
3                               [web, 협력, 환경, 구축, 방안, 연구]
4                                       [교육, 개별, 수업, cai]
                              ...                        
1144    [개정, 교과서, 소프트웨어, 교육, 단원, 탐구, 비교, 분석, 교육, 과정, 교...
1145    [이러닝, 콘텐츠, 사용자, 경험, ux, 평가, 이러닝, 대리, 상호, 작용, 사...
1146    [초등, 데이터, 리터러시, 함양, ai, 데이터, 과학, 교육, 프로그램, 개발,...
1147    [초등, 예비, 교사, 소프트웨어, 교육, 온라인, 교육, 효과, 분석, 소프트웨어...
1148    [초등, 교과서, 소프트웨어교육, 영역, 컴퓨팅, 사고력, 요소, 분석, 소프트웨어...
Name: kor_full, Length: 1149, dtype: object


In [6]:
# LDA inputs: vocabulary built from the token lists, then one bag-of-words
# vector per tokenized document.
id2word = gensim.corpora.Dictionary(tokenized)

corpus = list(map(id2word.doc2bow, tokenized))
print("# words in total : ", len(id2word))
print("# documents : ", len(corpus))

# words in total :  5086
# documents :  1149


In [7]:
# Train the LDA topic model on the bag-of-words corpus.
# num_topics is hard-coded here; the scoring cells further down repeat the same
# number in their range(13) loops, so the values have to be changed together.
# random_state is fixed at 100.
optimal_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=13, # 13 or 11
                                                random_state=100,
                                                update_every=1,
                                                iterations=1000,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                eta='auto',
                                                per_word_topics=True)
# NOTE(review): model_topics is not read by any other cell visible in this notebook.
model_topics = optimal_model.show_topics(formatted=False)

# Pretty-print the topics returned by print_topics().
pprint(optimal_model.print_topics())

[(0,
  '0.072*"수업" + 0.052*"활용" + 0.050*"문제" + 0.041*"학생" + 0.041*"대상" + 0.040*"효과" '
  '+ 0.036*"적용" + 0.029*"연구" + 0.027*"모형" + 0.026*"중심"'),
 (1,
  '0.120*"인공지능" + 0.094*"ai" + 0.078*"데이터" + 0.078*"창의" + 0.071*"알고리즘" + '
  '0.040*"융합" + 0.037*"프로그램" + 0.028*"모델" + 0.026*"탐색" + 0.020*"분야"'),
 (2,
  '0.138*"측정" + 0.072*"정도" + 0.067*"it" + 0.036*"문서" + 0.030*"타당도" + '
  '0.030*"산출" + 0.021*"범주" + 0.015*"xml" + 0.013*"상관관계" + 0.011*"밀접"'),
 (3,
  '0.318*"평가" + 0.078*"유형" + 0.060*"수행" + 0.042*"플랫폼" + 0.033*"단원" + '
  '0.029*"이러닝" + 0.027*"절차" + 0.021*"발표" + 0.021*"관찰" + 0.017*"iptv"'),
 (4,
  '0.117*"로봇" + 0.081*"영향" + 0.047*"분석" + 0.040*"교과서" + 0.033*"유의미" + '
  '0.029*"빅데이터" + 0.028*"긍정" + 0.027*"사용" + 0.026*"연수" + 0.025*"통계"'),
 (5,
  '0.106*"컴퓨팅" + 0.105*"프로그래밍" + 0.087*"사고력" + 0.046*"프로그램" + 0.043*"교육" + '
  '0.039*"초등" + 0.024*"사고" + 0.023*"게임" + 0.022*"소프트웨어교육" + 0.021*"개발"'),
 (6,
  '0.060*"시스템" + 0.046*"온라인" + 0.041*"집단" + 0.032*"확인" + 0.029*"기준" + '
  '0.028*"이용" + 0.025*"전문가" 

In [8]:
# get p(word|topic)
def get_topic_word_prob(lda_model):
    """Return p(word | topic) for every topic of ``lda_model``.

    The matrix returned by ``lda_model.state.get_lambda()`` is divided row by
    row by its row sum, so every row of the result sums to 1.

    Args:
        lda_model: Object exposing ``state.get_lambda()`` that returns a 2-D
            array with one row per topic.

    Returns:
        Array of the same shape as the lambda matrix, normalised per row.
    """
    lam = lda_model.state.get_lambda()
    row_totals = lam.sum(axis=1, keepdims=True)
    return lam / row_totals

In [9]:
topic_word_prob = get_topic_word_prob(optimal_model)
print(topic_word_prob.shape) # (#topics, #words)

# Vocabulary listed in token-id order, used as the column labels of the
# topic-word table (row = topic, column = word).
wordlist = [id2word[token_id] for token_id in range(len(id2word))]
seriesWordlist = pd.Series(wordlist)

topic_word = pd.DataFrame(data=topic_word_prob, columns=seriesWordlist)

print(topic_word)

(13, 5086)
          교육     멀티미디어        연구        원격        자료       편집기    하이퍼미디어  \
0   0.002215  0.000195  0.028537  0.000005  0.000011  0.000005  0.000005   
1   0.001012  0.000014  0.000014  0.000014  0.000014  0.000014  0.000014   
2   0.000099  0.000099  0.000099  0.000099  0.000099  0.000099  0.000099   
3   0.000032  0.000032  0.000032  0.000032  0.000032  0.000032  0.000032   
4   0.000014  0.000014  0.012572  0.000014  0.000014  0.000014  0.000014   
5   0.043205  0.000007  0.011587  0.000007  0.000007  0.000097  0.000007   
6   0.000011  0.000011  0.002519  0.000011  0.003488  0.000011  0.000011   
7   0.000018  0.004570  0.001044  0.000018  0.000018  0.000018  0.000612   
8   0.000051  0.000051  0.000051  0.000051  0.000051  0.000051  0.000051   
9   0.000005  0.000005  0.020482  0.000005  0.009867  0.000005  0.000005   
10  0.156253  0.000002  0.039828  0.000002  0.005235  0.000002  0.000002   
11  0.000058  0.000058  0.000058  0.049458  0.000058  0.000058  0.000058   
1

In [10]:
# Second tokenization pass used for the n-gram label candidates: compared with
# the defaults it additionally keeps the VV, VA and MM tags, and minLen=0 lets
# single-character morphemes through.
tokenized2 = data['kor_full'].apply(lambda row: khaiiiTokenizer(row, pos=['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL', 'VV', 'VA', 'MM'], minLen=0))
print(tokenized2)
print("========= tokenization completed =========")

0                                 [멀티미디어, 원격, 교육, 관하, 연구]
1                           [교육, 하이퍼미디어, 자료, 편집기, 관하, 연구]
2                                     [인터넷, 코스웨어, 설계, 구현]
3                               [web, 협력, 환경, 구축, 방안, 연구]
4                                   [열리, 교육, 개별, 수업, cai]
                              ...                        
1144    [개정, 실, 교과서, 소프트웨어, 교육, 단원, 탐구, 비교, 분석, 개, 정, ...
1145    [화, 이러닝, 콘텐츠, 관하, 사용자, 경험, ux, 질, 평가, 화, 이러닝, ...
1146    [초등, 데이터, 리터러시, 함양, 위하, ai, 데이터, 과학, 교육, 프로그램,...
1147    [초등, 예비, 교사, 위하, 소프트웨어, 교육, 대하, 온라인, 교육, 효과, 분...
1148    [초등, 실, 교과서, 나, 소프트웨어교육, 영역, 나타나, 컴퓨팅, 사고력, 요소...
Name: kor_full, Length: 1149, dtype: object


In [11]:
# n-gram candidates

def get_ngrams(raw, n_range=(1,3)):
    """List every contiguous n-gram of ``raw`` for n from ``n_range[0]`` to ``n_range[1]``.

    Args:
        raw: Sequence of tokens (sliced with ``raw[start:start + n]``).
        n_range: ``(smallest n, largest n)``, both inclusive.

    Returns:
        A flat list of strings. Each n-gram is stored as ``str(tuple(tokens))``,
        e.g. ``"('a',)"`` for a unigram and ``"('a', 'b')"`` for a bigram. All
        n-grams of the smallest n come first, in document order, followed by
        the next n, and so on.
    """
    def to_ngrams(words, n):
        # No window fits when the sequence is shorter than n -> empty list.
        return [str(tuple(words[start:start + n]))
                for start in range(len(words) - n + 1)]

    n_begin, n_end = n_range
    return [
        ngram
        for n in range(n_begin, n_end + 1)
        for ngram in to_ngrams(raw, n)
    ]


In [12]:
# Expand each token list of the second pass into its n-gram strings
# (get_ngrams defaults, i.e. 1- to 3-grams).
ngrams = tokenized2.apply(get_ngrams)

print(ngrams)
#ngrams.to_csv('./modi_data/ngrams.csv')

0       [('멀티미디어',), ('원격',), ('교육',), ('관하',), ('연구',...
1       [('교육',), ('하이퍼미디어',), ('자료',), ('편집기',), ('관하...
2       [('인터넷',), ('코스웨어',), ('설계',), ('구현',), ('인터넷'...
3       [('web',), ('협력',), ('환경',), ('구축',), ('방안',),...
4       [('열리',), ('교육',), ('개별',), ('수업',), ('cai',),...
                              ...                        
1144    [('개정',), ('실',), ('교과서',), ('소프트웨어',), ('교육',...
1145    [('화',), ('이러닝',), ('콘텐츠',), ('관하',), ('사용자',)...
1146    [('초등',), ('데이터',), ('리터러시',), ('함양',), ('위하',...
1147    [('초등',), ('예비',), ('교사',), ('위하',), ('소프트웨어',...
1148    [('초등',), ('실',), ('교과서',), ('나',), ('소프트웨어교육'...
Name: kor_full, Length: 1149, dtype: object


In [13]:
# get score(l, t)
# score(l, t) = sum_w( p(w|t) * PMI(w, l | c) )

def get_score(unigram_counter, ngram_candidates, candidate, topic, topic_word_prob, ngram_docs):
    """Score how well the label ``candidate`` describes ``topic``.

    For every word ``w`` (a column of ``unigram_counter``) the function
    multiplies ``topic_word_prob.loc[topic, w]`` by an association value

        (#documents containing both the candidate and the unigram w)
        / (unigram_counter.loc[0, w] * count of the candidate)

    and sums the products. Note that the association value is a plain ratio;
    no logarithm is taken.

    Args:
        unigram_counter: DataFrame with one column per word; row 0 holds the
            word's count.
        ngram_candidates: DataFrame with a '후보명' column (candidate label
            string) and a 'count' column.
        candidate: Candidate label, in the same string form as the entries of
            ``ngram_docs`` (e.g. ``"('a', 'b')"``).
        topic: Row label of ``topic_word_prob`` to score against.
        topic_word_prob: DataFrame indexed by topic with one column per word.
        ngram_docs: Iterable of documents, each a collection of n-gram strings.

    Returns:
        The summed score multiplied by 1000 and rounded to 5 decimals.

    Raises:
        ValueError: If ``candidate`` does not occur in
            ``ngram_candidates['후보명']`` (previously this surfaced as an
            unbound-local error inside the scoring loop).
    """
    def as_lookup(doc):
        """Return a set view of ``doc`` for fast ``in`` tests when that is safe."""
        if isinstance(doc, str):
            return doc  # keep the substring meaning of ``in`` for plain strings
        try:
            return set(doc)
        except TypeError:
            return doc  # unhashable items: fall back to the original container

    def get_pmi(w, l_freq, unigram_counter, docs):
        # Unigrams are stored in the same "('word',)" string form as in get_ngrams.
        word = "('" + w + "',)"
        count = sum(1 for doc in docs if word in doc)
        return count / (unigram_counter.loc[0, w] * l_freq)

    # Look the candidate up with a mask instead of scanning .loc[i] row by row;
    # this does not depend on the frame having a 0..n-1 index.
    matches = ngram_candidates.loc[ngram_candidates['후보명'] == candidate, 'count']
    if matches.empty:
        raise ValueError("candidate %r not found in ngram_candidates" % (candidate,))
    candi_count = matches.iloc[-1]  # last match wins, as in the original scan

    # Only documents that contain the candidate contribute to the co-occurrence
    # counts. They are converted once so each per-word test is O(1).
    candi_docs = [as_lookup(doc) for doc in ngram_docs if candidate in doc]

    score = 0
    for word in unigram_counter.columns:
        tw_prob = topic_word_prob.loc[topic, word]
        pmi = get_pmi(word, candi_count, unigram_counter, candi_docs)
        score += tw_prob * pmi

    return round(score * 1000, 5)
    

In [14]:
# nc: n-gram label candidates ('후보명' and 'count' columns are read by get_score).
# uc: unigram counts (one column per word, counts in row 0, as read by get_score).
# The 'Unnamed: 0' column of both sheets is dropped
# (presumably a saved index column - TODO confirm).
nc = pd.read_excel("./modi_data/ngram_candidates.xlsx").drop(columns=['Unnamed: 0'])
uc = pd.read_excel("./modi_data/unigram_counter.xlsx").drop(columns=['Unnamed: 0'])

# Example single call:
#score1 = get_score(uc, nc, nc.loc[21, '후보명'], 0, topic_word, ngrams)

In [15]:
# Score every candidate label against every topic.
# score_full: one row per topic, one column per candidate (column k is row k of nc).
num_topics = len(topic_word)  # topic_word has one row per LDA topic

# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append no longer exists in pandas >= 2.0 and re-copied the whole
# frame on every call.
score_rows = []
for j in range(num_topics):
    score_rows.append([
        get_score(uc, nc, nc.loc[i, '후보명'], j, topic_word, ngrams)
        for i in range(len(nc))
    ])
    print('topic'+str(j))  # progress marker, one line per finished topic

score_full = pd.DataFrame(score_rows)

print(score_full)

topic0
topic1
topic2
topic3
topic4
topic5
topic6
topic7
topic8
topic9
topic10
topic11
topic12
         0        1        2        3        4        5        6        7   \
0   0.17263  0.22817  0.20887  0.08939  0.52270  0.61278  0.17654  0.50033   
1   0.03081  0.18426  0.21663  0.07855  0.98509  0.54752  0.19194  0.37957   
2   0.07492  0.42676  0.51182  0.42490  0.47892  0.31328  0.13022  0.85102   
3   0.13085  0.31231  0.18703  0.07584  0.43160  0.21659  0.24813  0.40945   
4   0.15167  0.20995  0.10571  0.24286  0.55487  1.19974  0.19807  0.46585   
5   0.04707  0.07205  0.11746  0.06334  0.54650  0.96316  0.11539  0.36678   
6   0.12145  0.25256  0.39088  0.07162  1.02844  0.32443  0.16169  0.55452   
7   0.11584  0.31606  0.32396  0.15760  0.46091  0.14183  0.08214  0.50978   
8   0.03920  0.28063  0.22725  0.47682  0.66137  0.54381  0.19198  0.47954   
9   0.11646  0.22093  0.18979  0.06557  0.47885  0.34552  0.15357  0.46641   
10  0.10963  0.21840  0.17108  0.08647  0.51755 

In [16]:
# Persist the raw topic-by-candidate score table.
score_full.to_excel('./final_data/topic_full_candidates.xlsx')

In [17]:
# score'(l, t) = score(l, t) - alpha * avg(score(l, t') for every other topic t')
# alpha is the discrimination coefficient: it penalises labels that also score
# well on the other topics.
alpha = 0.2
score_final = pd.DataFrame() # (t, l)

for i in range(len(nc)):
    candidate_scores = score_full[i]  # scores of candidate i on every topic
    adjusted = []
    for t in range(13): # 11 or 13
        rest = [s for j, s in enumerate(candidate_scores) if j != t]
        adjusted.append(score_full.loc[t, i] - alpha * (sum(rest) / len(rest)))
    # One column per candidate, labelled with the candidate string.
    score_final[nc.loc[i, '후보명']] = adjusted
print(score_final)

    ('사이버', '가', '정학습')  ('동', '영', '상')  ('애', '플리케이션')  ('로봇', '보조', '학습')  \
0              0.154630         0.168788        0.155721            0.053940   
1              0.010447         0.124146        0.163610            0.042919   
2              0.055292         0.370687        0.463720            0.395042   
3              0.112154         0.254330        0.133517            0.040164   
4              0.133321         0.150264        0.050842            0.209968   
5              0.026978         0.010066        0.062787            0.027456   
6              0.102597         0.193584        0.340765            0.035874   
7              0.096894         0.258142        0.272729            0.123287   
8              0.018976         0.222122        0.174407            0.447827   
9              0.097524         0.161427        0.136323            0.029723   
10             0.090580         0.158855        0.117301            0.050972   
11             0.050920         0.649061

In [18]:
# Write one Excel file per topic with the candidates ranked by adjusted score.
# The 13 here must match the number of rows of score_final.
for i in range(13):
    # Row i of score_final holds the scores of every candidate for topic i;
    # sort them from best to worst.
    s = score_final.loc[i].sort_values(axis=0, ascending=False)
    # Output path: ./final_data/topic<i>_candidates.xlsx
    s.to_excel('./final_data/topic' + str(i) + '_candidates.xlsx')


In [84]:
# Disabled draft of get_score(l, t), kept as an inert string literal (it is not executed).
# score = sum_w(p(w|t) * PMI(w,l|c))
# Differences from the active get_score: get_pmi also receives the candidate and
# counts documents of the full ngram_docs that contain both the word and the
# candidate, the candidate count is printed, and the score is returned
# without scaling or rounding.
"""
def get_score(unigram_counter, ngram_candidates, candidate, topic, topic_word_prob, ngram_docs):
    def get_pmi(w, l, l_freq, unigram_counter, docs):
        word = "('" + w +"',)"
        #print(word)
        count = 0

        for doc in docs:
            #print(doc)
            if word in doc and l in doc:#and word in doc:
                #print('yesss')
                count += 1

        pmi = count / (unigram_counter.loc[0, w] * l_freq)
        return pmi
    
    for i in range(len(ngram_candidates)):
        if ngram_candidates.loc[i, '후보명'] == candidate:
            candi_count = ngram_candidates.loc[i, 'count']
    print(candidate, candi_count)

    score = 0
    for word in unigram_counter.columns:
        tw_prob = topic_word_prob.loc[topic, word]
        pmi = get_pmi(word, candidate, candi_count, unigram_counter, ngram_docs)
        score += tw_prob * pmi

    return score
    """
    

In [60]:
# Disabled draft, kept as an inert string literal (it is not executed):
# get PMI(word, candidate label (l) | context (c)) for every word/candidate pair.
# PMI = (#(w,l)) / (#w * #l)
# NOTE(review): inside the draft the result of dfPMI.append(pmi_list) is never
# assigned, and the unigram key is built as '(' + word + ',)' without the quotes
# used by the active get_score - both would need fixing before re-enabling it.
"""
def get_pmi(unigram_counter, ngram_candidates, ngram_docs):
    dfPMI = pd.DataFrame()
    pmi_list = []

    def get_wl_count(word, candidate, docs):
        word = '(' + word + ',)'
        count = 0

        for doc in docs:
            if candidate in doc and word in doc:
                count += 1
        return count

    for i, l in enumerate(ngram_candidates['후보명']):
        for w in unigram_counter.columns:
            pmi = get_wl_count(w, l, ngram_docs) / (unigram_counter.loc[0, w] * ngram_candidates.loc[i, 'count'])
            pmi_list.append(pmi) 
            print('word')   
        dfPMI.append(pmi_list)
        pmi_list = []
        print('5000')

    dfPMI.columns = unigram_counter.columns

    return dfPMI
    """

Error: ModuleNotFoundError