In [2]:
import pandas as pd
import numpy as np
from khaiii import KhaiiiApi
from gensim import corpora, models
from pprint import pprint


In [3]:
year = [i for i in range(2015, 2022)]
sortedresult = pd.DataFrame()

In [None]:
def khaiiiTokenizer(raw, pos=['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']): # 일반명사 고유명사 의존명사 대명사 수사 외국어
    api = KhaiiiApi()
    list = []

    for word in api.analyze(raw): #raw data
        for _, morph in enumerate(word.morphs):
            if len(morph.lex) > 1 and morph.tag in pos:
                if morph.tag == 'SL':
                    morph.lex = morph.lex.lower()
                list.append(morph.lex)             
    return list

In [4]:
for i in range(5):
    print("==== "+str(year[i])+" ====")
    data = pd.read_csv("./modi_data/data_"+str(year[i])+".csv")

    tokenized = data['full_data'].apply(lambda row: khaiiiTokenizer(row))
    print("========= tokenization completed =========")

    id2word = corpora.Dictionary(tokenized)
    corpus=[id2word.doc2bow(text) for text in tokenized]
    print("# words in total : ", len(id2word))
    print("# documents : ", len(corpus))

    #tfidf
    print("==== calculating tfidf ====")
    tfidf = models.TfidfModel(corpus)

    #tfidf per doc
    tfidflist = []
    for doc in tfidf[corpus]:
        inner_list = [0]*len(id2word) 
        for id, freq in doc:
            inner_list[id] = np.around(freq, decimals=2) #put tfidf value in the place matching its index
        tfidflist.append(inner_list)
    #print(len(tfidflist))

    tfidf_df = pd.DataFrame(tfidflist)
    tfidf_df.columns = [id2word[i] for i in range(len(id2word))] #set columns' names as words

    total_df = pd.concat([data[["date", "id"]], tfidf_df], axis=1)
    total_df.to_csv("./final_data/tfidf"+str(year[i])+".csv") 

    #sum of tfidf for each word
    columnsum = pd.DataFrame(total_df.sum(axis=0)).T
    columnsum = columnsum.drop(['id'], axis=1)
    columnsum['date'] = year[i]
    columnsum.to_csv("./final_data/sum"+str(year[i])+".csv")

    #sort tfidf value in descending order
    columnsum = columnsum.sort_values(by=0, axis=1, ascending=False)
    print(columnsum)
    columnsum.to_csv("./final_data/sorted"+str(year[i])+".csv")

    print("==== completed ====")

==== 2017 ====
# words in total :  27404
# documents :  7275
==== calculating tfidf ====
   date  date    청소년     폐지     보호법     아이     소년법      교육    청소년법      학교  \
0  2017  2017  323.1  313.5  230.16  196.0  191.08  170.44  163.08  148.94   

   ... traffic touch topic today title  tire thrill thought those pencil  
0  ...    0.01  0.01  0.01  0.01  0.01  0.01   0.01    0.01  0.01   0.01  

[1 rows x 27404 columns]
==== completed ====
==== 2018 ====
# words in total :  66454
# documents :  17015
==== calculating tfidf ====
   date  date      아이      학생     교사      교육    어린이집      학교     유치원      시간  \
0  2018  2018  451.48  394.68  384.9  375.64  355.46  352.23  343.51  263.33   

   ...  관할청인  시청소년   해당함  회비횡령 경인여자대  횡령사건    인준 이사회대법원    역순   의결도  
0  ...  0.01  0.01  0.01  0.01  0.01  0.01  0.01   0.01  0.01  0.01  

[1 rows x 66454 columns]
==== completed ====
==== 2019 ====
# words in total :  24986
# documents :  3030
==== calculating tfidf ====
   date  date     아이     학생     

In [5]:
# tfidf for full data

data = pd.read_csv("./modi_data/full_data.csv")

tokenized = data['full_data'].apply(lambda row: khaiiiTokenizer(row))
print("========= tokenization completed =========")

id2word = corpora.Dictionary(tokenized)
corpus=[id2word.doc2bow(text) for text in tokenized]
print("# words in total : ", len(id2word))
print("# documents : ", len(corpus))

#tfidf
print("==== calculating tfidf ====")
tfidf = models.TfidfModel(corpus)

#tfidf per doc
tfidflist = []
for doc in tfidf[corpus]:
    inner_list = [0]*len(id2word) 
    for id, freq in doc:
        inner_list[id] = np.around(freq, decimals=2) #put tfidf value in the place matching its index
    tfidflist.append(inner_list)

tfidf_df = pd.DataFrame(tfidflist)
tfidf_df.columns = [id2word[i] for i in range(len(id2word))] #set columns' names as words
total_df = pd.concat([data[["date", "id"]], tfidf_df], axis=1)

#sum of tfidf for each word
columnsum = pd.DataFrame(total_df.sum(axis=0)).T
columnsum = columnsum.drop(['id'], axis=1)
columnsum['date'] = 2021

#sort tfidf value in descending order
columnsum = columnsum.sort_values(by=0, axis=1, ascending=False).transpose()
print(columnsum)
columnsum.to_csv("./final_data/sorted_full.csv")

print("==== completed ====")

# words in total :  91609
# documents :  28482
==== calculating tfidf ====
            0
date     2021
date     2021
아이     756.04
학생     635.46
교육     619.34
...       ...
front    0.01
gate     0.01
green    0.01
guest    0.01
find     0.01

[91609 rows x 1 columns]
==== completed ====
