# Training Korean Text
* from Korean Wikipedia

In [None]:
import time
import csv
import os
from pathlib import Path
import sys
sys.path.insert(0, '../')
import pycor.korlang as korlang
import pycor.utils as utils

# Shared stopwatch; later cells call start() on it and print
# secmilli() / millisecstr() to report how long each stage took.
stopwatch = utils.StopWatch()

# Number of entries taken from the front of the file list for training
# (files[:docsize]); it also names the output sub-directory.
docsize = 70

#for aux in lm.auxmap.values():
#    print(aux)

def listfiles(path):
    """Return every entry of directory *path*, each joined onto *path*.

    Entries are returned in ``os.listdir`` order (i.e. unsorted) and are not
    filtered, so sub-directories are included alongside regular files.
    """
    return [os.path.join(path, entry) for entry in os.listdir(path)]
    
# Build the training file list: samples first, then the wiki dump, then NP.
# Each directory listing is sorted on its own because os.listdir() returns
# entries in arbitrary order; without that, the files[:docsize] and files[:5]
# slices used later would pick different documents from run to run.
files = sorted(listfiles('../samples'))
files.extend(sorted(listfiles('../../data/wiki')))
files.extend(sorted(listfiles('../../data/NP')))

print (len(files), "files")

# Everything produced by this notebook goes under a directory named after
# docsize; the trailing "/" matters because later code appends file names
# to these strings directly.
outputpath = "../../output/" + str(docsize) + "/"
model_path = outputpath + "model/"

os.makedirs(outputpath, exist_ok=True)
os.makedirs(model_path, exist_ok=True)


# Trainer object exposed by pycor.korlang; every later cell uses it.
trainer = korlang._trainer


In [None]:

# Restart the stopwatch; the elapsed time is printed once the loading loop
# at the end of this cell has finished.
stopwatch.start()

def extractHeads(words) :
    """Return one string per word in *words*.

    A word with a truthy ``bestpair`` contributes ``bestpair.head.text``;
    any other word contributes its own ``text`` unchanged.
    """
    return [
        word.bestpair.head.text if word.bestpair else word.text
        for word in words
    ]


def extractTails(words) :
    """Return one string per word in *words*.

    A word with a truthy ``bestpair`` contributes ``bestpair.tail.text``;
    any other word contributes an empty string, so the result always has
    the same length as *words*.
    """
    return [
        word.bestpair.tail.text if word.bestpair else ""
        for word in words
    ]



def extractTags(words) :
    """Return one entry per word in *words*.

    A word with a truthy ``bestpair`` contributes ``bestpair.tags`` as-is
    (whatever object that attribute holds); any other word contributes the
    placeholder string ``"*"``.
    """
    return [
        word.bestpair.tags if word.bestpair else "*"
        for word in words
    ]



def writeTails(outputpath, index, sentence_array):
    """Score *sentence_array* and write the tails of each word list as a CSV row.

    The rows are written to ``<outputpath>samples/<index>_tails.txt``, one row
    per item returned by ``trainer.scoreDocument(sentence_array)``.

    Args:
        outputpath: Output directory prefix; it is concatenated as a string,
            so it must already end with a path separator.
        index: Value used (via ``str``) as the file-name prefix.
        sentence_array: Passed unchanged to ``trainer.scoreDocument``.

    NOTE(review): a later cell rebinds the name ``writeTails`` to a different
    two-argument callback, so this version is only reachable before that cell
    has run.
    """
    outputfile = outputpath + 'samples/' + str(index) + '_tails.txt'
    # The samples/ directory is not created anywhere else in this notebook.
    os.makedirs(os.path.dirname(outputfile), exist_ok=True)

    words_array = trainer.scoreDocument(sentence_array)

    # newline='' is what the csv module expects from its file object; without
    # it rows end in "\r\r\n" on platforms that translate "\n" on write.
    # The with-statement closes the file, so no explicit close() is needed.
    with open(outputfile, 'w', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        writer.writerows(extractTails(words) for words in words_array)
        
def writeAll(tailswriter, headswriter, tagswriter, sentence_array):
    """Score *sentence_array* and append matching rows to three CSV writers.

    For every word list returned by ``trainer.scoreDocument`` that contains
    more than two words, one row is written to each writer: the tails, the
    heads and the tags, in that order. Shorter word lists are skipped for all
    three writers, so the three outputs stay line-aligned.
    """
    for words in trainer.scoreDocument(sentence_array):
        tails = extractTails(words)

        # Word lists of two words or fewer are not recorded anywhere.
        if len(tails) <= 2:
            continue

        tailswriter.writerow(tails)
        headswriter.writerow(extractHeads(words))
        tagswriter.writerow(extractTags(words))



        
print("Loading Texts >>> ")
index = 0

# All three outputs are managed by a single with-statement so that each one
# is flushed and closed when the loop ends, including when loading or
# scoring raises. newline='' is what the csv module expects from its file
# objects (otherwise rows end in "\r\r\n" where "\n" is translated on write).
with open(outputpath + 'tails.txt', 'w', encoding='utf-8', newline='') as tailsfile, \
        open(outputpath + 'heads.txt', 'w', encoding='utf-8', newline='') as headsfile, \
        open(outputpath + 'tags.txt', 'w', encoding='utf-8', newline='') as tagsfile:
    tailswriter = csv.writer(tailsfile)
    headswriter = csv.writer(headsfile)
    tagswriter = csv.writer(tagsfile)

    # Only the first docsize entries of the file list are considered, and of
    # those only the .txt files are loaded.
    for file in files[:docsize]:
        if file.endswith(".txt") :
            sentence_array, words_array = trainer.loadfile(file)
            writeAll(tailswriter, headswriter, tagswriter, sentence_array)
            index += 1
            # Progress marker every 100 documents, line break every 1000.
            if index % 100 == 0:
                print(index, end=">")
                if index % 1000 == 0:
                    print()

print("Load Texts: " , stopwatch.secmilli() , "(", stopwatch.millisecstr(), "ms.)")



In [None]:

stopwatch.start()

# Build the vocabulary before reporting, so that the elapsed time printed
# below actually covers buildVocab() instead of the instant after start().
snglist, ylist, clist, ambilist = trainer.buildVocab()

# NOTE(review): the counts are now read after buildVocab(); confirm that is
# the intended point to report them if buildVocab() can change the maps.
print("Build Vocab: " , stopwatch.secmilli() , "(", stopwatch.millisecstr(), "ms.)")
print("Word Count : " , utils.comma(len(trainer.wordmap.words)) )
print("head Count : " , utils.comma(len(trainer.wordmap.heads)) )
print("tail Count : " , utils.comma(len(trainer.wordmap.tails)) )


trainer.savemodel(model_path)

# Dump two of the head lists returned by buildVocab() next to the saved
# model. Each row is: head text, its pos entries joined by "+", and the
# head's tails object as csv renders it.
for csvname, headgroup in (("ambiguous.csv", ambilist), ("single.csv", snglist)):
    # newline='' is what the csv module expects; the with-statement closes
    # the file, so no explicit close() is needed.
    with open(model_path + csvname, 'w', encoding='utf-8', newline='') as csvfile :
        writer = csv.writer(csvfile)
        for head in headgroup:
            writer.writerow([head.text, '+'.join(head.pos), head.tails])

In [None]:
def writeWord(writer, word):
    """Write one CSV row per particle of *word*.

    A word without particles gets a single ``[text, 'X']`` row. Otherwise
    each particle produces ``[text, head text, tail text, score, tags, pos]``,
    with an empty string standing in for a missing head or tail.
    """
    if len(word.particles) == 0:
        writer.writerow([word.text, 'X'])
        return

    for particle in word.particles:
        head_text = part_text(particle.head) if False else (particle.head.text if particle.head else '')
        tail_text = particle.tail.text if particle.tail else ''
        writer.writerow(
            [word.text, head_text, tail_text, particle.score, particle.tags, particle.pos]
        )

# Export every known word to np_words.csv using the writeWord callback
# defined just above (the per-particle version).
# NOTE(review): presumably utils.writecsv opens the path and calls the
# callback as callback(writer, item) for each item - confirm in pycor.utils.
utils.writecsv(outputpath + "np_words.csv", trainer.wordmap.words.values(), writeWord)

def writeWord(writer, word):
    """Write a single CSV row describing the ``bestpair`` of *word*.

    Without a bestpair the row is ``[text, 'X']``. With one, the row is
    ``[text, '   ', head text, tail text, '   ', score, tags, pos]``; a
    missing head is written as ``''`` and a missing tail as ``'X'``.
    """
    pair = word.bestpair
    if not pair:
        writer.writerow([ word.text, 'X' ])
        return

    head_text = pair.head.text if pair.head else ''
    tail_text = pair.tail.text if pair.tail else 'X'
    writer.writerow(
        [word.text, '   ', head_text, tail_text, '   ', pair.score, pair.tags, pair.pos]
    )
        
# Export every known word again, this time with the bestpair-only writeWord
# callback that was just redefined above.
utils.writecsv(outputpath + "np_words_scored.csv", trainer.wordmap.words.values(), writeWord)

In [None]:
def writeTails(writer, tail):
    """Write one CSV row for *tail*: its text and how many heads it has."""
    writer.writerow([tail.text, len(tail.heads)])

# Tails ordered by the last character of their text.
tailsorted = sorted(trainer.wordmap.tails.values(), key=lambda tail: tail.text[-1])
utils.writecsv(outputpath + "np_tails.csv", tailsorted, writeTails)


# Tails ordered by how many heads they have, largest first.
tailsorted2 = sorted(
    trainer.wordmap.tails.values(),
    key=lambda tail: len(tail.heads),
    reverse=True,
)
utils.writecsv(outputpath + "np_tails_occ.csv", tailsorted2, writeTails)



In [None]:
def writeHeads(writer, head):
    """Write one CSV row for *head*.

    The row is ``str(head)``, a two-space filler, the number of tails, a
    one-space filler, and then the text of every tail in iteration order.
    """
    writer.writerow(
        [str(head), '  ', len(head.tails), ' ', *(tail.text for tail in head.tails)]
    )

# Heads ordered by their text.
headsorted = sorted(trainer.wordmap.heads.values(), key=lambda head: head.text)
utils.writecsv(outputpath + "np_heads.csv", headsorted, writeHeads)


# Heads ordered by how many tails they have, largest first.
headsorted2 = sorted(
    trainer.wordmap.heads.values(),
    key=lambda head: len(head.tails),
    reverse=True,
)
utils.writecsv(outputpath + "np_heads_occ.csv", headsorted2, writeHeads)


# Collect the distinct heads that appear in some word's bestpair.
headset = set()

for word in trainer.wordmap.words.values():
    bestpair = word.bestpair
    # Words without a bestpair, or whose bestpair has no head, add nothing.
    if not bestpair or not bestpair.head:
        continue
    headset.add(bestpair.head)
        
def writeHead(writer, head):
    """Write one CSV row for *head*, using tab characters as filler cells.

    The row is ``str(head)``, a tab, the number of tails, another tab, and
    then the text of every tail in iteration order.
    """
    writer.writerow(
        [str(head), '\t', len(head.tails), '\t', *(tail.text for tail in head.tails)]
    )
        

        
# Export the heads gathered in headset, ordered by text, one row per head
# through the writeHead callback.
headlist = sorted(headset , key=lambda head:head.text )

utils.writecsv(outputpath + "np_head_scored.csv", headlist, writeHead)




In [None]:
import pycor.langmodel as lm

def trace(worm, indent):
    """Print *worm*, then recursively print its precedents one level deeper.

    Each recursion level appends one "." to *indent*. ``worm.precedents`` is
    read as a mapping whose values are iterables of further nodes; within
    each iterable, walking stops at the first node equal to *worm* itself.
    """
    print(indent, worm)
    deeper = indent + "."
    for chain in worm.precedents.values():
        for precedent in chain:
            if precedent == worm:
                # The node lists itself: abandon the rest of this chain.
                break
            trace(precedent, deeper)
        
# for euls in lm.auxmap.values():
#     for eul in euls:
#        trace(eul,"")


In [None]:

def extractWords(words) :
    """Return one display string per word in *words*.

    A word with a truthy ``bestpair`` becomes its head text followed by
    ``"(pos:tags)"``; a falsy pos or tags is rendered as an empty string, and
    the parenthesised part is dropped entirely when both are falsy. A word
    without a bestpair is represented by its own ``text``.
    """
    rendered = []
    for word in words:
        pair = word.bestpair
        if not pair:
            rendered.append(word.text)
            continue

        pos = pair.pos or ''
        tag = pair.tags or ''
        annotation = "(" + str(pos) + ":" + str(tag) + ")" if (pos or tag) else ""
        rendered.append(pair.head.text + annotation)
    return rendered

def writeExtracted(outputpath, suffix, index, sentence_array):
    """Score *sentence_array* and write the annotated words as CSV rows.

    The output file is ``<outputpath><index><suffix>.txt``; it receives one
    row per word list returned by ``trainer.scoreDocument(sentence_array)``,
    each row produced by ``extractWords``.

    Args:
        outputpath: Output directory prefix; it is concatenated as a string,
            so it must already end with a path separator.
        suffix: Text inserted between the index and ``.txt``.
        index: Value used (via ``str``) as the file-name prefix.
        sentence_array: Passed unchanged to ``trainer.scoreDocument``.

    Scoring always goes through the module-level ``trainer``, whichever
    loader produced *sentence_array*.
    """
    outputfile = outputpath + str(index) + suffix + '.txt'

    words_array = trainer.scoreDocument(sentence_array)

    # newline='' is what the csv module expects from its file object; without
    # it rows end in "\r\r\n" on platforms that translate "\n" on write.
    # The with-statement closes the file, so no explicit close() is needed.
    with open(outputfile, 'w', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        writer.writerows(extractWords(words) for words in words_array)
        
# First pass: load the .txt files among the first five entries with the
# trainer and write them out as <n>_train.txt, numbered from 1.
index = 0
for file in files[:5]:
    if not file.endswith(".txt") :
        continue
    sentence_array, words_array = trainer.loadfile(file)
    index += 1
    writeExtracted(outputpath, '_train', index, sentence_array)

import pycor
pycor.loadmodel(model_path)

# Second pass: same files, read through pycor after loading the saved model,
# written as <n>_pycor.txt.
# NOTE(review): writeExtracted still scores with the global trainer, so only
# the reading step differs between the two passes - confirm that is intended.
index = 0
for file in files[:5]:
    if not file.endswith(".txt") :
        continue
    sentence_array, words_array = pycor.readfile(file)
    index += 1
    writeExtracted(outputpath, '_pycor', index, sentence_array)


