In [1]:
import pandas as pd
import numpy as np
from khaiii import KhaiiiApi
import gensim
from pprint import pprint

In [2]:
# Load the corpus table; the `kor_full` column holds the Korean text that is
# tokenized and topic-modelled in the cells below.
data = pd.read_csv("./modi_data/kor_full.csv")
# Preview the first rows as a sanity check on the columns.
data.head()

Unnamed: 0,no,year,title,kor_full
0,1,1997,멀티미디어 원격교육에 관한 연구,멀티미디어 원격교육에 관한 연구
1,2,1997,교육용 하이퍼미디어 자료 편집기에 관한 연구,교육용 하이퍼미디어 자료 편집기에 관한 연구
2,3,1997,인터넷 기반의 코스웨어의 설계 및 구현,인터넷 기반의 코스웨어의 설계 및 구현
3,4,1997,Web에서의 협력 환경 구축 방안 연구,Web에서의 협력 환경 구축 방안 연구
4,5,1997,열린교육에서의 개별화수업과 CAI,열린교육에서의 개별화수업과 CAI


In [3]:
# Correction tables applied to the analyzer output. In both workbooks the
# "기존" column is the token as produced and "수정" is what to do with it.
dfWordList = pd.read_excel("./khaiii_word_cor.xlsx")
dfWordList2 = pd.read_excel("./khaiii_word_cor_etc.xlsx")

# Rows whose "수정" value is "삭제" list tokens to drop; every other row of the
# first workbook is a one-to-one replacement. The second workbook lists tokens
# that are split into several tokens.
dfWordDel = dfWordList[dfWordList["수정"] == "삭제"]
dfWordMod = dfWordList[dfWordList["수정"] != "삭제"]
dfWordDiv = dfWordList2

# Tokens to discard.
seriesDelete = dfWordDel["기존"]
stopword = list(seriesDelete.values)

# Parallel lists: modiword[k] is replaced by modiword2[k].
seriesModify = dfWordMod["기존"]
modiword = list(seriesModify.values)

seriesModify2 = dfWordMod["수정"]
modiword2 = list(seriesModify2.values)

# Parallel lists: divword[k] is replaced by the tokens in divword2[k].
seriesDivide = dfWordDiv["기존"]
divword = list(seriesDivide.values)

# The replacement cell is a ', '-separated string; split it into a token list.
seriesDivide2 = dfWordDiv["수정"]
divword2 = [words.split(', ') for words in seriesDivide2.values]

In [4]:
api = KhaiiiApi()


def khaiiiTokenizer(raw, stopword=stopword, pos=['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']):
    """Turn one raw text into a list of normalized content tokens.

    The text is run through the module-level khaiii analyzer and each morpheme
    is kept only if it is longer than one character, its tag is in ``pos`` and
    it is not in ``stopword``. Kept morphemes are then normalized:

    * foreign-word morphemes (tag ``SL``) are lower-cased;
    * a morpheme listed in ``divword`` is replaced by the several tokens in the
      matching ``divword2`` entry;
    * otherwise a morpheme listed in ``modiword`` is replaced by the matching
      ``modiword2`` entry.

    The adjacent morphemes '인공' + '지능' inside one word are always merged into
    the single token '인공지능', bypassing the filters above.

    Args:
        raw: Text to tokenize. Non-string or blank values (for example NaN from
            an empty CSV cell) produce an empty list instead of being handed to
            the analyzer.
        stopword: Collection of morphemes to discard.
        pos: Tags to keep. The defaults are common noun, proper noun, dependent
            noun, pronoun, numeral and foreign word.

    Returns:
        list: The tokens in document order.
    """
    tokens = []
    if not isinstance(raw, str) or not raw.strip():
        return tokens

    for word in api.analyze(raw):
        morphs = word.morphs
        skip_next = False

        for i, morph in enumerate(morphs):
            # The previous morpheme already consumed this one ('지능').
            if skip_next:
                skip_next = False
                continue

            # Work on a local copy so the analyzer's result objects are never mutated.
            lex = morph.lex

            if lex == '인공' and i + 1 < len(morphs) and morphs[i + 1].lex == "지능":
                tokens.append(lex + morphs[i + 1].lex)
                skip_next = True
                continue

            # The stopword check intentionally uses the original (not lower-cased) form.
            if len(lex) <= 1 or morph.tag not in pos or lex in stopword:
                continue

            if morph.tag == 'SL':
                lex = lex.lower()

            if lex in divword:
                tokens.extend(divword2[divword.index(lex)])
            elif lex in modiword:
                tokens.append(modiword2[modiword.index(lex)])
            else:
                tokens.append(lex)

    return tokens

In [5]:
# Tokenize the text of every row; the result is a Series of token lists.
tokenized = data["kor_full"].apply(khaiiiTokenizer)
print(tokenized)
print("========= tokenization completed =========")

0                                     [멀티미디어, 원격, 교육, 연구]
1                               [교육, 하이퍼미디어, 자료, 편집기, 연구]
2                                     [인터넷, 코스웨어, 설계, 구현]
3                               [web, 협력, 환경, 구축, 방안, 연구]
4                                       [교육, 개별, 수업, cai]
                              ...                        
1144    [개정, 교과서, 소프트웨어, 교육, 단원, 탐구, 비교, 분석, 교육, 과정, 교...
1145    [이러닝, 콘텐츠, 사용자, 경험, ux, 평가, 이러닝, 대리, 상호, 작용, 사...
1146    [초등, 데이터, 리터러시, 함양, ai, 데이터, 과학, 교육, 프로그램, 개발,...
1147    [초등, 예비, 교사, 소프트웨어, 교육, 온라인, 교육, 효과, 분석, 소프트웨어...
1148    [초등, 교과서, 소프트웨어교육, 영역, 컴퓨팅, 사고력, 요소, 분석, 소프트웨어...
Name: kor_full, Length: 1149, dtype: object


In [6]:
#lda
# Vocabulary: maps every distinct token to an integer id.
id2word = gensim.corpora.Dictionary(tokenized)

# Bag-of-words encoding of each document, in the same order as `tokenized`.
corpus = list(map(id2word.doc2bow, tokenized))
print("# words in total : ", len(id2word))
print("# documents : ", len(corpus))

# words in total :  5086
# documents :  1149


In [7]:
# The topic count is typed in at run time (13 or 11 were the values used).
# NOTE(review): later cells hard-code 13, and an interactive prompt prevents an
# unattended Restart & Run All; consider a constant in a config cell.
a = int(input("set the number of topics "))  # 13 or 11

# Fit the LDA model on the bag-of-words corpus. random_state pins the seed so
# repeated runs with the same topic count use the same initialization.
optimal_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=a,
    random_state=100,
    alpha='auto',
    eta='auto',
    passes=10,
    iterations=1000,
    chunksize=100,
    update_every=1,
    per_word_topics=True,
)
model_topics = optimal_model.show_topics(formatted=False)

pprint(optimal_model.print_topics())

[(0,
  '0.072*"수업" + 0.052*"활용" + 0.050*"문제" + 0.041*"학생" + 0.041*"대상" + 0.040*"효과" '
  '+ 0.036*"적용" + 0.029*"연구" + 0.027*"모형" + 0.026*"중심"'),
 (1,
  '0.120*"인공지능" + 0.094*"ai" + 0.078*"데이터" + 0.078*"창의" + 0.071*"알고리즘" + '
  '0.040*"융합" + 0.037*"프로그램" + 0.028*"모델" + 0.026*"탐색" + 0.020*"분야"'),
 (2,
  '0.138*"측정" + 0.072*"정도" + 0.067*"it" + 0.036*"문서" + 0.030*"타당도" + '
  '0.030*"산출" + 0.021*"범주" + 0.015*"xml" + 0.013*"상관관계" + 0.011*"밀접"'),
 (3,
  '0.318*"평가" + 0.078*"유형" + 0.060*"수행" + 0.042*"플랫폼" + 0.033*"단원" + '
  '0.029*"이러닝" + 0.027*"절차" + 0.021*"발표" + 0.021*"관찰" + 0.017*"iptv"'),
 (4,
  '0.117*"로봇" + 0.081*"영향" + 0.047*"분석" + 0.040*"교과서" + 0.033*"유의미" + '
  '0.029*"빅데이터" + 0.028*"긍정" + 0.027*"사용" + 0.026*"연수" + 0.025*"통계"'),
 (5,
  '0.106*"컴퓨팅" + 0.105*"프로그래밍" + 0.087*"사고력" + 0.046*"프로그램" + 0.043*"교육" + '
  '0.039*"초등" + 0.024*"사고" + 0.023*"게임" + 0.022*"소프트웨어교육" + 0.021*"개발"'),
 (6,
  '0.060*"시스템" + 0.046*"온라인" + 0.041*"집단" + 0.032*"확인" + 0.029*"기준" + '
  '0.028*"이용" + 0.025*"전문가" 

In [8]:
# get p(word|topic)
def get_topic_word_prob(lda_model):
    """Return p(word | topic) for every topic of a fitted LDA model.

    The model's lambda matrix (one row per topic, one column per word) is
    normalized so that every row sums to 1.

    Args:
        lda_model: Object exposing ``state.get_lambda()`` that returns a 2-D array.

    Returns:
        Array of the same shape as the lambda matrix with row-normalized values.
    """
    lam = lda_model.state.get_lambda()
    row_totals = lam.sum(axis=1, keepdims=True)
    return lam / row_totals

In [9]:
topic_word_prob = get_topic_word_prob(optimal_model)
print(topic_word_prob.shape) # (#topics, #words)

# Vocabulary listed in id order, so position k names the word with id k and
# lines up with column k of the probability matrix.
wordlist = [id2word[i] for i in range(len(id2word))]
seriesWordlist = pd.Series(wordlist)

# One row per topic, one column per word.
topic_word = pd.DataFrame(data=topic_word_prob, columns=seriesWordlist)

print(topic_word)

(13, 5086)
          교육     멀티미디어        연구        원격        자료       편집기    하이퍼미디어  \
0   0.002215  0.000195  0.028537  0.000005  0.000011  0.000005  0.000005   
1   0.001012  0.000014  0.000014  0.000014  0.000014  0.000014  0.000014   
2   0.000099  0.000099  0.000099  0.000099  0.000099  0.000099  0.000099   
3   0.000032  0.000032  0.000032  0.000032  0.000032  0.000032  0.000032   
4   0.000014  0.000014  0.012572  0.000014  0.000014  0.000014  0.000014   
5   0.043205  0.000007  0.011587  0.000007  0.000007  0.000097  0.000007   
6   0.000011  0.000011  0.002519  0.000011  0.003488  0.000011  0.000011   
7   0.000018  0.004570  0.001044  0.000018  0.000018  0.000018  0.000612   
8   0.000051  0.000051  0.000051  0.000051  0.000051  0.000051  0.000051   
9   0.000005  0.000005  0.020482  0.000005  0.009867  0.000005  0.000005   
10  0.156253  0.000002  0.039828  0.000002  0.005235  0.000002  0.000002   
11  0.000058  0.000058  0.000058  0.049458  0.000058  0.000058  0.000058   
1

In [10]:
# n-gram candidates

def get_ngrams(raw, n_range=(1,3)):  # 1-grams up to 3-grams by default
    """List every contiguous n-gram of a token sequence as a stringified tuple.

    Args:
        raw: Sequence of tokens.
        n_range: ``(shortest, longest)`` n-gram sizes, both inclusive.

    Returns:
        list of str: ``str(tuple(window))`` for each window, grouped by size in
        ascending order and, within a size, ordered by start position.
    """
    shortest, longest = n_range
    return [
        str(tuple(raw[start:start + size]))
        for size in range(shortest, longest + 1)
        for start in range(0, len(raw) - size + 1)
    ]


In [11]:
# Expand each token list into its 1- to 3-gram strings (get_ngrams defaults).
ngrams = tokenized.apply(get_ngrams)

# Show the result and confirm the nesting: Series -> list -> str.
print(ngrams)
print(type(ngrams))
print(type(ngrams.loc[0]))
print(type(ngrams.loc[0][0]))

0       [('멀티미디어',), ('원격',), ('교육',), ('연구',), ('멀티미디...
1       [('교육',), ('하이퍼미디어',), ('자료',), ('편집기',), ('연구...
2       [('인터넷',), ('코스웨어',), ('설계',), ('구현',), ('인터넷'...
3       [('web',), ('협력',), ('환경',), ('구축',), ('방안',),...
4       [('교육',), ('개별',), ('수업',), ('cai',), ('교육', '...
                              ...                        
1144    [('개정',), ('교과서',), ('소프트웨어',), ('교육',), ('단원'...
1145    [('이러닝',), ('콘텐츠',), ('사용자',), ('경험',), ('ux',...
1146    [('초등',), ('데이터',), ('리터러시',), ('함양',), ('ai',...
1147    [('초등',), ('예비',), ('교사',), ('소프트웨어',), ('교육',...
1148    [('초등',), ('교과서',), ('소프트웨어교육',), ('영역',), ('컴...
Name: kor_full, Length: 1149, dtype: object
<class 'pandas.core.series.Series'>
<class 'list'>
<class 'str'>


In [12]:
# get score(l, t)
# score = sum_w(p(w|t) * PMI(w,l|c))

def get_score(unigram_counter, ngram_candidates, candidate, topic, topic_word_prob, ngram_docs):
    """Score how well one candidate label describes one topic.

    Computes ``sum_w p(w|t) * PMI(w, l)`` where the PMI term is approximated by
    ``#(documents containing both w and l) / (#w * #l)``.

    Args:
        unigram_counter: DataFrame with one column per unigram; the row labelled
            0 holds that unigram's count.
        ngram_candidates: DataFrame with a '후보명' column (candidate label) and a
            'count' column (that label's count).
        candidate: Candidate label, in the same stringified-tuple form as the
            entries of ``ngram_docs``.
        topic: Row label of the topic in ``topic_word_prob``.
        topic_word_prob: DataFrame of p(w|t), indexed by topic with one column
            per unigram.
        ngram_docs: Iterable of documents, each a collection of stringified
            n-gram tuples.

    Returns:
        The score multiplied by 1000 and rounded to 5 decimals.

    Raises:
        ValueError: If ``candidate`` is not listed in ``ngram_candidates``.
    """
    # Look the candidate's count up in one vectorized pass. When a label is
    # listed more than once the last row wins, as a sequential scan would.
    matches = ngram_candidates.loc[ngram_candidates['후보명'] == candidate, 'count']
    if matches.empty:
        raise ValueError("candidate not found in ngram_candidates: " + repr(candidate))
    candi_count = matches.iloc[-1]

    # Only documents that contain the candidate can contribute co-occurrences.
    candi_docs = [doc for doc in ngram_docs if candidate in doc]

    score = 0
    for word in unigram_counter.columns:
        # Build the key with str(tuple) so it is formatted exactly like the
        # stored n-grams, including words that contain quote characters.
        key = str((word,))
        co_count = sum(1 for doc in candi_docs if key in doc)
        pmi = co_count / (unigram_counter.loc[0, word] * candi_count)
        score += topic_word_prob.loc[topic, word] * pmi

    return round(score * 1000, 5)
    

In [13]:
# Candidate labels with their counts (nc) and per-unigram counts (uc). Both
# workbooks carry a leftover 'Unnamed: 0' index column, which is discarded.
nc = pd.read_excel("./modi_data/ngram_candidates.xlsx").drop(columns=['Unnamed: 0'])
uc = pd.read_excel("./modi_data/unigram_counter.xlsx").drop(columns=['Unnamed: 0'])

#print(nc.columns[0])
#print(nc.loc[0, '후보명'])
#print(topic_word.loc[0, '교육'])
#score1 = get_score(uc, nc, nc.loc[21, '후보명'], 0, topic_word, ngrams)

#print(score1)

In [25]:
# Score every candidate label against every topic.
# score_full has one row per topic and one column per candidate (by position in nc).
# The topic count comes from topic_word instead of a hard-coded 13, and the
# frame is built once from a list of rows (DataFrame.append no longer exists in
# pandas 2.x and grew the frame quadratically).
score_rows = []
for j in range(len(topic_word)):
    score_rows.append(
        [get_score(uc, nc, candidate, j, topic_word, ngrams) for candidate in nc['후보명']]
    )
    print('topic'+str(j))  # progress marker: this loop is slow
score_full = pd.DataFrame(score_rows)  # (topic, l_index)

print(score_full)

topic0
topic1
topic2
topic3
topic4
topic5
topic6
topic7
topic8
topic9
topic10
topic11
topic12
         0        1        2        3        4        5        6        7   \
0   0.30640  0.11027  0.50814  0.19702  0.15329  0.23506  0.38023  0.61278   
1   0.34096  0.10340  0.95732  0.25306  0.05704  0.07944  0.77635  0.54752   
2   0.42192  0.30845  0.45792  0.20166  0.16740  0.19382  0.33303  0.31328   
3   0.41318  0.14740  0.41734  0.18153  0.14359  0.13960  0.35332  0.21659   
4   0.18608  0.16638  0.53876  0.21014  0.11834  0.17457  0.49776  1.19974   
5   0.66813  0.04966  0.53112  0.10937  0.04092  0.06924  0.37244  0.96316   
6   0.23481  0.15084  0.99973  0.22459  0.05712  0.13421  0.52474  0.32443   
7   0.31568  0.13461  0.44695  0.24495  0.14557  0.19537  0.40550  0.14183   
8   0.41779  0.21937  0.63932  0.46359  0.14865  0.18761  0.34122  0.54381   
9   0.23321  0.11936  0.46523  0.21118  0.12072  0.15959  0.43489  0.34552   
10  0.28796  0.14210  0.50275  0.18428  0.07453 

ValueError: No engine for filetype: ''

In [26]:
# Save the raw score matrix (rows: topics, columns: candidates) to an Excel workbook.
score_full.to_excel('./final_data/topic_full_candidates.xlsx')

In [57]:
# score'(l, t) = score(l, t) - alpha * mean(score(l, t') for every other topic t')
# alpha is the discriminability coefficient: the larger it is, the more a label
# is penalized for also scoring well on the other topics.
alpha = 0.2

# Collect each candidate's adjusted column first and build the frame once,
# instead of inserting columns one by one. The topic count is taken from
# score_full rather than hard-coded.
adjusted_by_candidate = {}
for i in range(len(nc)):
    topic_scores = score_full[i].tolist()
    adjusted = []
    for t, score in enumerate(topic_scores):
        others = topic_scores[:t] + topic_scores[t + 1:]
        # With a single topic there is nothing to average against.
        avg = sum(others) / len(others) if others else 0.0
        adjusted.append(score - alpha * avg)
    adjusted_by_candidate[nc.loc[i, '후보명']] = adjusted

score_final = pd.DataFrame(adjusted_by_candidate)  # (t, l)
print(score_final)

    ('computational', 'thinking')  ('중도', '탈락')  ('산업', '혁명')  ('의사', '결정')  \
0                        0.240066      0.079600      0.391148      0.150387   
1                        0.275202      0.072616      0.847814      0.207361   
2                        0.357512      0.281083      0.340091      0.155105   
3                        0.348626      0.117349      0.298835      0.134639   
4                        0.117741      0.136646      0.422278      0.163726   
5                        0.607825      0.017980      0.414511      0.061276   
6                        0.167283      0.120846      0.890931      0.178417   
7                        0.249501      0.104346      0.328938      0.199116   
8                        0.353313      0.190519      0.524514      0.421400   
9                        0.165657      0.088842      0.347523      0.164783   
10                       0.221319      0.111961      0.385668      0.137435   
11                       0.095781      0.130800     

In [61]:
# Write one workbook per topic with its candidates ranked from highest to
# lowest adjusted score. Iterating the frame's own index keeps this in step
# with the number of topics actually modelled.
for i in score_final.index:
    s = score_final.loc[i].sort_values(ascending=False)
    s.to_excel('./final_data/topic' + str(i) + '_candidates.xlsx')


In [84]:
# get score(l, t)
# score = sum_w(p(w|t) * PMI(w,l|c))
# NOTE: the definition below is wrapped in a string literal, so it is never
# executed; it is a disabled draft of get_score. This draft tests every
# document in ngram_docs for both the word and the candidate, prints each
# candidate with its count, and returns the score without scaling or rounding.
"""
def get_score(unigram_counter, ngram_candidates, candidate, topic, topic_word_prob, ngram_docs):
    def get_pmi(w, l, l_freq, unigram_counter, docs):
        word = "('" + w +"',)"
        #print(word)
        count = 0

        for doc in docs:
            #print(doc)
            if word in doc and l in doc:#and word in doc:
                #print('yesss')
                count += 1

        pmi = count / (unigram_counter.loc[0, w] * l_freq)
        return pmi
    
    for i in range(len(ngram_candidates)):
        if ngram_candidates.loc[i, '후보명'] == candidate:
            candi_count = ngram_candidates.loc[i, 'count']
    print(candidate, candi_count)

    score = 0
    for word in unigram_counter.columns:
        tw_prob = topic_word_prob.loc[topic, word]
        pmi = get_pmi(word, candidate, candi_count, unigram_counter, ngram_docs)
        score += tw_prob * pmi

    return score
    """
    

In [60]:
# get PMI(word, candidate label (l) | context (c))
# PMI = (#(w,l)) / (#w * #l)
# NOTE: the definition below is wrapped in a string literal, so it is never
# executed; it is a disabled draft that would build a full PMI table
# (one row per candidate, one column per unigram).
# NOTE(review): if this is ever revived, the lookup key is built as
# '(' + word + ',)' without the quotes that str(tuple) puts around the word, and
# the return value of dfPMI.append(...) is discarded, so dfPMI would stay empty.
"""
def get_pmi(unigram_counter, ngram_candidates, ngram_docs):
    dfPMI = pd.DataFrame()
    pmi_list = []

    def get_wl_count(word, candidate, docs):
        word = '(' + word + ',)'
        count = 0

        for doc in docs:
            if candidate in doc and word in doc:
                count += 1
        return count

    for i, l in enumerate(ngram_candidates['후보명']):
        for w in unigram_counter.columns:
            pmi = get_wl_count(w, l, ngram_docs) / (unigram_counter.loc[0, w] * ngram_candidates.loc[i, 'count'])
            pmi_list.append(pmi) 
            print('word')   
        dfPMI.append(pmi_list)
        pmi_list = []
        print('5000')

    dfPMI.columns = unigram_counter.columns

    return dfPMI
    """

Error: ModuleNotFoundError