In [34]:
'''
    Collect the question text for each field from the files in qnas_total.
'''

#QNA_TYPE="LAWYER"
# Field identifiers; each one is substituted into the input and output file names.
QNA_TYPES=["LAWYER", "DOCTOR", "LABOR", "ANIMALDOCTOR", "PHARMACIST", "TAXACC", "DIETITIAN"]

def find_file_name(qna_type):
    """Return the path of the aggregated Q&A file for the given field.

    The result has the form "./qnas_total/<qna_type>_total.txt".
    """
    return "".join(("./qnas_total/", qna_type, "_total.txt"))

def make_file_name(qna_type):
    """Return the path of the questions-only output file for the given field.

    The result has the form "./question_ma/<qna_type>_questions.txt".
    """
    return "".join(("./question_ma/", qna_type, "_questions.txt"))
    
def write_question(lines, path=None):
    """Write each item of *lines* on its own line of a UTF-8 text file.

    Args:
        lines: Iterable of questions to write.
        path: Destination file. When omitted, the module-level
            ``output_file`` assigned by the calling loop is used, so
            existing ``write_question(lines)`` calls keep working.
    """
    if path is None:
        # Legacy behaviour: fall back to the global set by the caller.
        path = output_file
    with open(path, "w", encoding="utf-8") as output:
        for line in lines:
            print(line, file=output)

# For every field, pull the question part out of each record of the field's
# input file and write the collected questions to the field's output file.
for qna_type in QNA_TYPES:
    input_file=find_file_name(qna_type)
    output_file=make_file_name(qna_type)    # also read as a global by write_question()
    #input_file="./qnas/LABOR_cc3729k_page1.txt"
    questions=[]
    with open(input_file, "r", encoding="utf-8") as input_texts:
        for text in input_texts:
            fields=text.strip().split("\t")
            # The Q&A text is read from the third tab-separated column, so a
            # record needs at least three fields. Blank lines and shorter
            # records are skipped instead of raising IndexError.
            if len(fields)<3: continue
            # Keep only the part before the question/answer separator.
            question=fields[2].split(" ###Q&A### ")[0]
            if len(question)<10: continue    # drop very short questions
            questions.append(question)


    write_question(questions)

        

In [1]:
'''
    Run morphological analysis (ma) on the questions.
'''
import re
import ujson
from konlpy.tag import Komoran

# Field identifiers; the script further down in this cell reads only the LABOR files.
QNA_TYPES=["LAWYER", "DOCTOR", "LABOR", "ANIMALDOCTOR", "PHARMACIST", "TAXACC", "DIETITIAN"]

def split_sentences(text):
    """Split *text* into sentences.

    Surrounding whitespace is stripped first; the text is then cut at every
    single space that directly follows '.', '?' or '!'. The punctuation mark
    stays attached to its sentence.

    Returns:
        A list of strings. Text without such a boundary (including the
        empty string) comes back as a one-element list.
    """
    # The look-behind keeps the terminator with its sentence; only the
    # separating space is consumed by the split.
    return re.split(r"(?<=[.?!]) ", text.strip())

def get_morph_anal(analyzer,text):
    """Analyse *text* sentence by sentence.

    The text is split with split_sentences() and each sentence is passed to
    ``analyzer.pos``. The per-sentence results are returned, in order, as
    the list stored under the "ma_res" key of a dict.
    """
    return {
        "ma_res": [analyzer.pos(sentence) for sentence in split_sentences(text)]
    }

def print_morph_anal(morph_anals, output_file):
    """Write every analysis result to *output_file* as one JSON text per line.

    Non-ASCII characters are written as-is (ensure_ascii=False). The type of
    *morph_anals* is also printed to standard output.
    """
    with open(output_file, "w", encoding="utf-8") as sink:
        for record in morph_anals:
            sink.write(ujson.dumps(record, ensure_ascii=False) + "\n")
        print(type(morph_anals))
        
# Analyse every question in the LABOR question file and store the results
# next to it, one JSON line per question.
input_file="./question_ma/LABOR_questions.txt"
output_file="./question_ma/LABOR_questions_ma.txt"
komoran=Komoran()
ma_res=[]
with open(input_file, "r", encoding="utf-8") as input_texts:
    # NOTE(review): the first two characters of every line are dropped before
    # analysis; the reason is not visible here -- confirm against the data.
    ma_res.extend(get_morph_anal(komoran, text[2:]) for text in input_texts)
    print(type(ma_res))
    print_morph_anal(ma_res,output_file)



<class 'list'>
<class 'list'>


In [8]:
'''
    Group the questions with K-means clustering.
'''

import ujson
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Input file; read line by line, each line parsed as JSON.
INPUT_FILE_NAME = "./question_ma/LABOR_questions_ma.txt"
# Only morphemes tagged with one of these POS categories are kept as features.
# NOTE(review): presumably common noun / proper noun / root in the Komoran tag set -- confirm.
FEATURE_POSES = ["NNG", "NNP", "XR"]
# Key of the per-sentence analysis list inside each JSON object.
MA_KEY = "ma_res"
# Number of clusters requested from K-means; also part of the output file name.
NUM_CLUSTERS = 40
QNA_TYPES=["LAWYER", "DOCTOR", "LABOR", "ANIMALDOCTOR", "PHARMACIST", "TAXACC", "DIETITIAN"]
# Field name used when building the output file name.
qna_type="LABOR"

def make_file_name():
    """Return the path of the clustering report.

    The name is built from the module-level ``qna_type`` and
    ``NUM_CLUSTERS`` values.
    """
    parts = (
        "./question_ma/",
        qna_type,
        "_questions_ma_analy_",
        str(NUM_CLUSTERS),
        "Groups.txt",
    )
    return "".join(parts)


def read_documents():
    """Load the analysed questions as space-joined feature strings.

    Each line of INPUT_FILE_NAME is parsed as JSON. From every sentence
    under MA_KEY, the morphemes whose tag is listed in FEATURE_POSES are
    kept, and those morphemes are joined with single spaces into one
    document string per line.

    Returns:
        A list with one document string per input line, in file order.
    """
    documents = []

    with open(INPUT_FILE_NAME, "r", encoding="utf-8") as input_file:
        for line in input_file:
            analysis = ujson.loads(line)
            kept_morphs = [
                lexeme
                for sentence in analysis[MA_KEY]
                for lexeme, category in sentence
                if category in FEATURE_POSES
            ]
            documents.append(" ".join(kept_morphs))

    return documents


def build_doc_term_mat(documents):
    """Build the TF-IDF document-term matrix for *documents*.

    Args:
        documents: Iterable of strings, one per document.

    Returns:
        A ``(doc_term_mat, words)`` tuple: the matrix produced by
        ``TfidfVectorizer.fit_transform`` and the list of feature names,
        where ``words[i]`` names column ``i`` of the matrix.
    """
    # NOTE: TfidfVectorizer's default token_pattern only keeps tokens of two
    # or more word characters, so one-character morphemes are not features.
    vectorizer = TfidfVectorizer()
    doc_term_mat = vectorizer.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back for older releases. tolist() keeps the
    # returned value a plain list, as before.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    return doc_term_mat, words


def get_flat_clusters(doc_term_mat):
    """Fit K-means with NUM_CLUSTERS clusters on *doc_term_mat* and return the fitted model."""
    # The number of clusters has to be given up front. The outcome depends on
    # where the initial centres are placed: "k-means++" is one seeding method,
    # "random" is another. verbose=1 shows the iterations on screen,
    # verbose=0 keeps it quiet.
    return KMeans(
        n_clusters=NUM_CLUSTERS,
        init="k-means++",
        verbose=1,
    ).fit(doc_term_mat)


def print_centroid_words(model, words, output_file, top_n=20):
    """Print and save the highest-weighted words of every cluster centroid.

    For each row of ``model.cluster_centers_`` the ``top_n`` columns with the
    largest values are looked up in *words*. One line per cluster is printed
    to standard output and written to *output_file*; ``model.labels_`` is
    written as the last line of the file.

    Args:
        model: Fitted clustering model exposing ``cluster_centers_`` and
            ``labels_``.
        words: Sequence mapping a column index to its word.
        output_file: Path of the UTF-8 report file (overwritten).
        top_n: How many words to list per cluster (default 20).
    """
    print("군집별 중심 어휘")
    print()

    # argsort() orders by value but yields column indices; reversing each row
    # puts the heaviest word first.
    ordered_centroids = model.cluster_centers_.argsort()[:, ::-1]

    with open(output_file,"w",encoding="utf-8") as output:
        # Iterate the centroid rows themselves rather than a global cluster
        # count, so the report always matches the model that was passed in.
        for cluster_num, ranked_word_nums in enumerate(ordered_centroids):
            center_words = [words[word_num] for word_num in ranked_word_nums[:top_n]]
            summary = "군집 {}: {}".format(cluster_num, ", ".join(center_words))
            print(summary)
            print(summary, file=output)
            print()
        print()
        print(model.labels_, file=output)
    print()
    

def main():
    """Run the pipeline: read documents, vectorise, cluster, then report."""
    corpus = read_documents()
    tfidf_matrix, vocabulary = build_doc_term_mat(corpus)

    # The result of learning is a "model", the usual machine-learning term.
    kmeans_model = get_flat_clusters(tfidf_matrix)

    # labels_ shows which cluster each document was assigned to.
    # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
    print(kmeans_model.labels_)

    print_centroid_words(kmeans_model, vocabulary, make_file_name())
    

# Run the clustering pipeline.
main()

Initialization complete
Iteration  0, inertia 675.757
Iteration  1, inertia 395.114
Iteration  2, inertia 391.966
Iteration  3, inertia 390.308
Iteration  4, inertia 389.679
Iteration  5, inertia 389.496
Iteration  6, inertia 389.094
Iteration  7, inertia 388.899
Iteration  8, inertia 388.799
Iteration  9, inertia 388.503
Converged at iteration 9
Initialization complete
Iteration  0, inertia 695.250
Iteration  1, inertia 400.610
Iteration  2, inertia 397.217
Iteration  3, inertia 396.128
Iteration  4, inertia 395.917
Iteration  5, inertia 395.737
Converged at iteration 5
Initialization complete
Iteration  0, inertia 450.253
Iteration  1, inertia 427.021
Iteration  2, inertia 423.067
Iteration  3, inertia 420.427
Iteration  4, inertia 418.435
Iteration  5, inertia 418.224
Iteration  6, inertia 418.052
Iteration  7, inertia 417.843
Iteration  8, inertia 417.772
Iteration  9, inertia 417.713
Iteration 10, inertia 417.686
Iteration 11, inertia 417.661
Converged at iteration 11
Initializati