In [65]:
from collections import defaultdict
from collections import namedtuple
from collections import Counter
import math
import re

In [66]:
# KMcorpus

class KMcorpus:
    
    PC = "[,\.!\?！＇，ㆍ．／：；？｀、。·‥…¨〃∼´～˝\(\)\{\}\[\]（）［］｛｝‘’“”〔〕〈〉《》「」『』【】]"
    
    def __init__( self, text, comments_header="#", doc_sep="\r?\n\r?\n" ):
        self.text = text
        self.doc_sep = doc_sep
        self.comments_header = comments_header
        self.docs = []
        
    def remove_comments( self ):
        pattern = "{}.*?$".format( self.comments_header )
        regex = re.compile( pattern, re.MULTILINE|re.DOTALL )
        self.text = re.sub( regex, "", self.text ).strip()
        return self
        
    def remove_punctuation( self ):  
        regex_PC = re.compile( self.PC )
        self.text = re.sub( regex_PC , " ", self.text ).strip()
        return self
    
    def remove_chrs( self, chr_types=["Korean", "Alphabet", "Numbers"] ):
        if "Korean" in chr_types:
            self.text = re.sub( re.compile("[가-힣]"), "", self.text )
        if "Alphabet" in chr_types:
            self.text = re.sub( re.compile("[a-zA-Z]"), "", self.text )
        if "Numbers" in chr_types:
            self.text = re.sub( re.compile("[\d]+?"), "", self.text )
        self.text = self.text.strip()
        return self
    
    def merge_spaces( self ):
        self.text = re.sub( re.compile("[ \t]+?"), " ", self.text )
        self.text = re.sub( re.compile("^[ \t]+?", re.MULTILINE|re.DOTALL), "", self.text ).strip()
        return self
        
    def text2docs(self):
        docs = re.split( re.compile( self.doc_sep ), self.text )
        self.docs = [ doc.strip().split() for doc in docs ]
        return self
    
    
    def merge_duplications(self, dict_path="dicts/duplications.dic" ):
        return self.merge_chrs( dict_path ) 
    
    def merge_variants(self, dict_path="dicts/variants.dic" ):
        return self.merge_chrs( dict_path ) 
    
    def merge_chrs(self, dict_path ):
        dic = open(dict_path, 'r', encoding='utf-8').readlines()
        for pair in dic:
            a, b = pair.split("\t")
            self.text = self.text.replace(a, b)
        return self
    


In [67]:
# Segment

class TokenExtractor:
    
    def __init__( self, corpus, min_freq = 5 ):
        self.corpus = corpus
        self.min_freq = min_freq
        self.token_counter = Counter()
        self.unigram_counter = Counter( self.corpus.text )
        self.bigram_counter = Counter()
        self.score = defaultdict()
        
    def train( self, method="allgram", min_window=2, max_window=8 ):
        
        self.min_window = min_window
        
        if method == "allgram" :
            for doc in self.corpus.docs:
                for phrase in doc:
                    particles = self.__class__.allgram( phrase, min_window, max_window )
                    self.token_counter.update( Counter( particles ) )
                    
                    bigrams = self.__class__.ngram( phrase, n=2 )
                    self.bigram_counter.update( Counter( bigrams ) )

        self.token_counter = Counter( {x : self.token_counter[x] for x in self.token_counter if self.token_counter[x] >= self.min_freq } )
        return self
        
    def cohesion_score( self, word ):
        word_len = len( word )
        if (not word) or ( word_len < self.min_window ):
            return 0
        
        first_chr_freq = self.unigram_counter[ word[0] ]
        last_chr_freq = self.unigram_counter[ word[-1] ]
        whole_word_freq = self.token_counter[ word ]
        
        cohesion_l = 0 if whole_word_freq == 0 else math.pow( ( whole_word_freq / first_chr_freq ), (1 / (word_len - 1)) )
        cohesion_r = 0 if whole_word_freq == 0 else math.pow( ( whole_word_freq / last_chr_freq ), (1 / (word_len - 1)) )
        return ( cohesion_l, cohesion_r,  cohesion_l * cohesion_r, (cohesion_l + cohesion_r)/2 )

    def extract( self ):
        for x in self.token_counter:
            _score = namedtuple('Score', ['freq', 'cohesion_l', 'cohesion_r', 'cohesion', 'cohesion_s'])
            _score.freq = self.token_counter[x]
            _score.cohesion_l, _score.cohesion_r, _score.cohesion, _score.cohesion_s  = self.cohesion_score( x )
            self.score[x] = _score
        return self.score
    
    @staticmethod
    def ngram( text, n):
        return [ text[i:i+n] for i in range( 0, len(text) - n + 1 )  ]

    def allgram( text, min_window=2, max_window=8 ):
        len_txt = len(text)
        mx_wd = len_txt if ( len_txt < max_window ) else max_window
        rst = []
        for i in range(min_window, mx_wd + 1):
            rst += TokenExtractor.ngram(text, i)
        return rst

In [68]:
class Segmenter:
    
    def __init__( self, token_with_score, target_score="cohesion_s", score_cutoff=0.02):
        self.score_list = [ ( tk, getattr( sc, target_score ) ) for tk, sc in token_with_score.items() if getattr( sc, target_score ) >= score_cutoff  ]
        self.score = dict( self.score_list )
        self.tokens = self.score.keys()
        self.target_text = ""
        
    def load( self, text, min_window=2, max_window=8 ):
        _token_candis = set( TokenExtractor.allgram(text, min_window, max_window) )
        token_candis_with_score = [ ( it, self.score[it] ) for it in _token_candis if it in self.tokens ]
        self.token_candis = sorted( token_candis_with_score, key=lambda x: (-x[1], -len(x[0] )  ) )
        self.target_text = text
        return self
    
    def segment( self, segment_marker="$" ):
        target_text = self.target_text + ""
        self.segment_marker = segment_marker
        for i, candi in enumerate( self.token_candis ):
            marker = "{0}{1}{0}".format( self.segment_marker, i )
            target_text = marker.join( target_text.split( candi[0] ) )
        
        self.text_segment_marked = target_text
        return self
    
    def show( self, verbose=False, sep="%"):
        target_text = self.text_segment_marked + ""
        for i, candi in enumerate( self.token_candis ):
            marker = "{0}{1}{0}".format( self.segment_marker, i )
            seg = "【{0}/{1:01.3f}】".format( candi[0], candi[1] ) if verbose else "【{}】".format( candi[0] )
            target_text = target_text.replace(marker, seg )
        self.text_segmented = target_text    
        return target_text
        

## TEST

In [69]:
import pprint

text = open("_tmp/_dummy_corpus.txt", 'r', encoding="utf-8").read()
corpus = KMcorpus(text, comments_header="//")
corpus.merge_duplications().merge_variants().remove_comments().remove_punctuation().remove_chrs().merge_spaces().text2docs()

tmp = open("_tmp/_dummy_corpus_clean.txt", 'w', encoding="utf-8")
pp = pprint.PrettyPrinter(indent=4, stream=tmp)
pp.pprint( corpus.docs )

te = TokenExtractor( corpus )
te.train().extract()
print("train ending")


train ending


In [70]:
sg = Segmenter( te.score, target_score='cohesion' )
sgl = Segmenter( te.score, target_score='cohesion_l' )
sgr = Segmenter( te.score, target_score='cohesion_r' )
sgs = Segmenter( te.score, target_score='cohesion_s' )


docs = [
    "治風證眩暈. 山茱萸肉 一兩, 山藥ㆍ甘菊ㆍ人參ㆍ川芎ㆍ茯神 各五錢. 右爲末, 每二錢, 酒調下. 《本事》",
    "眞人養生銘曰人欲勞於形百病不能成飮酒勿大醉諸疾自不生食了行百步數以手摩肚寅丑日剪甲頭髮梳百度飽卽立小便飢則坐漩尿行處勿當風居止無小隙常夜濯足臥飽食終無益思慮最傷神喜怒最傷氣每去鼻中毛常習不唾地平明欲起時下床先左脚一日無災殃去邪兼辟惡如能七星步令人長壽樂酸味傷於筋苦味傷於骨甘卽不益肉辛多敗正氣鹹多促人壽不得偏耽嗜春夏少施泄秋冬固陽事獨臥是守眞愼靜最爲貴錢財生有分知足將爲利强知是大患少慾終無累神靜自常安修道宜終始書之屋壁中將以傳君子",
    "久服明目輕身延年酒浸曝乾蒸之如此九次搗爲末每二錢空心溫酒調服一日二次本草",
    "治折傷後爲風寒濕所侵手足疼痛生蒼朮破古紙半生半炒骨碎補穿山甲桑灰炒爲珠生草烏各二兩茴香一兩半右將草烏剉如麥大同連皮生薑四兩擂爛淹兩宿焙乾同前藥爲末酒糊和丸梧子大溫酒下五十丸少麻無妨得效"
]

for sn in docs:
    #sg.load( sn ).segment()
    #sgl.load( sn ).segment()
    #sgr.load( sn ).segment()
    sgs.load( sn ).segment()
    print( sgs.token_candis )
    #print( sg.target_text )
    #print( sg.show() )
    #print( sgl.show() )
    #print( sgr.show() )
    print( sgs.show() )
    print("\n")


[('茱萸', 0.947980397980398), ('川芎', 0.7391782540875262), ('右爲末', 0.5921452228091995), ('爲末', 0.4845754162786111), ('人參', 0.4609669379041367), ('山茱萸', 0.43029153176491475), ('眩暈', 0.3594959749782085), ('甘菊', 0.32764004542013625), ('每二錢', 0.32498633756266126), ('右爲', 0.28247710679528576), ('各五錢', 0.2818964624190051), ('二錢', 0.2591218108911663), ('一兩', 0.25458261843574165), ('本事', 0.21540697948819604), ('山茱', 0.19401709401709402), ('調下', 0.1829016762105842), ('酒調下', 0.18200296579184727), ('各五', 0.17176678034247622), ('五錢', 0.15146704420881457), ('山藥', 0.1411460010735373), ('每二', 0.12820259481350757), ('酒調', 0.11455354381164418), ('茯神', 0.09468202147760157), ('治風', 0.040685718028607584)]
【治風】證【眩暈】. 山【茱萸】肉 【一兩】, 【山藥】ㆍ【甘菊】ㆍ【人參】ㆍ【川芎】ㆍ【茯神】 【各五錢】. 【右爲末】, 【每二錢】, 酒【調下】. 《【本事】》


[('思慮', 0.37985573423333996), ('小便', 0.34370002284669865), ('不能', 0.23154676774135768), ('以手摩', 0.20549052773867427), ('君子', 0.20006366756215352), ('令人', 0.15978098924150275), ('喜怒', 0.13790439401255433), ('眞人養', 0.1284422

## SANDBOX

In [64]:
np.power( 0.7, 1/2 )

0.83666002653407556

In [20]:
rst = sorted( te.score.items(), key=lambda it: -1* ( it[1].cohesion_r * it[1].cohesion_l ) )
tmp2 = open("_tmp/_dummy_corpus_score.txt", 'w', encoding="utf-8")
pp = pprint.PrettyPrinter(indent=4, stream=tmp2)
pp.pprint( [ ( r[0], r[1].freq , r[1].cohesion_r, r[1].cohesion_l ) for r in rst ] )

print("ending")

ending


In [13]:
te.unigram_counter.get("枸")

95

In [74]:
from collections import Counter
c = Counter("huihihu ihuihi") 
d = Counter("werwwqweqriiiiiiiiiiii")
c.update(d)
print(c)

Counter({'i': 17, 'h': 5, 'w': 4, 'u': 3, 'e': 2, 'r': 2, 'q': 2, ' ': 1})


In [11]:
97 / 8

12.125

In [24]:
"c" in {'a':1, 'b':2}.keys()


False

In [25]:
dict( [('a', 1), ('b', 2)] )

{'a': 1, 'b': 2}