In [65]:
from collections import defaultdict
from collections import namedtuple
from collections import Counter
import math
import re

In [66]:
import os
import psutil
import sys

class Tools:
    # Frome here : https://github.com/lovit/soynlp/tree/master/soynlp/utils
    @staticmethod
    def get_available_memory():
        """It returns remained memory as percentage"""

        mem = psutil.virtual_memory()
        return 100 * mem.available / (mem.total)

    def get_process_memory():
        """It returns the memory usage of current process"""

        process = psutil.Process(os.getpid())
        return process.memory_info().rss / (1024 ** 3)


In [67]:
# KMcorpus

class KMcorpus:
    
    PC = "[,\.!\?！＇，ㆍ．／：；？｀、。·‥…¨〃∼´～˝\(\)\{\}\[\]（）［］｛｝‘’“”〔〕〈〉《》「」『』【】]"
    
    def __init__( self, text, comments_header="#", doc_sep="\r?\n\r?\n" ):
        self.text = text
        self.doc_sep = doc_sep
        self.comments_header = comments_header
        self.docs = []
        
    def remove_comments( self ):
        pattern = "{}.*?$".format( self.comments_header )
        regex = re.compile( pattern, re.MULTILINE|re.DOTALL )
        self.text = re.sub( regex, "", self.text ).strip()
        print("# Comments were removed")
        return self
        
    def remove_punctuation( self ):  
        regex_PC = re.compile( self.PC )
        self.text = re.sub( regex_PC , " ", self.text ).strip()
        print("# Punctuations were removed")
        return self
    
    def remove_chrs( self, chr_types=["Korean", "Alphabet", "Numbers"] ):
        if "Korean" in chr_types:
            self.text = re.sub( re.compile("[가-힣]"), "", self.text )
        if "Alphabet" in chr_types:
            self.text = re.sub( re.compile("[a-zA-Z]"), "", self.text )
        if "Numbers" in chr_types:
            self.text = re.sub( re.compile("[\d]+?"), "", self.text )
        self.text = self.text.strip()
        print("# {} were removed".format( ", ".join( chr_types  ) ) )
        return self
    
    def merge_spaces( self ):
        self.text = re.sub( re.compile("[ \t]+?"), " ", self.text )
        self.text = re.sub( re.compile("^[ \t]+?", re.MULTILINE|re.DOTALL), "", self.text ).strip()
        print("# Spaces were merged")
        return self
        
    def text2docs(self):
        docs = re.split( re.compile( self.doc_sep ), self.text )
        self.docs = [ doc.strip().split() for doc in docs ]
        print("# Text was converted to List Data")
        return self
    
    
    def merge_duplications(self, dict_path="dicts/duplications.dic" ):
        print("# Duplicated Characters were merged")
        self.text = self.__class__.merge_chrs( self.text, dict_path ) 
        return self
    
    def merge_variants(self, dict_path="dicts/variants.dic" ):
        print("# Variants Characters were merged")
        self.text = self.__class__.merge_chrs( self.text, dict_path ) 
        return self
    

    
    @staticmethod
    def merge_chrs(text, dict_path ):
        dic = open(dict_path, 'r', encoding='utf-8').readlines()
        text_ = text + ""
        for pair in dic:
            a, b = pair.split("\t")
            text_ = text_.replace(a, b)
        return text_
    
    def ngram( text, n):
        return [ text[i:i+n] for i in range( 0, len(text) - n + 1 )  ]

    def allgram( text, min_window=2, max_window=8 ):
        len_txt = len(text)
        mx_wd = len_txt if ( len_txt < max_window ) else max_window
        rst = []
        for i in range(min_window, mx_wd + 1):
            rst += KMcorpus.ngram(text, i)
        return rst

In [86]:
# Segment

class TokenExtractor:
    
    def __init__( self, corpus ):
        self.corpus = corpus
        self.token_counter = Counter()
        self.unigram_counter = Counter( self.corpus.text )
        self.bigram_counter = Counter()
        
    def _cohesion_score( self, word ):
        word_len = len( word )
        if (not word) or ( word_len < self.min_window ):
            return 0
        
        first_chr_freq = self.unigram_counter[ word[0] ]
        last_chr_freq = self.unigram_counter[ word[-1] ]
        whole_word_freq = self.token_counter[ word ]
        
        cohesion_l = 0 if whole_word_freq == 0 else math.pow( ( whole_word_freq / first_chr_freq ), (1 / (word_len - 1)) )
        cohesion_r = 0 if whole_word_freq == 0 else math.pow( ( whole_word_freq / last_chr_freq ), (1 / (word_len - 1)) )
        cohesion = math.sqrt(cohesion_l * cohesion_r)
        return ( cohesion_l, cohesion_r, cohesion , (cohesion_l + cohesion_r)/2 )

    def _branch_entropy_score( self, word ):
        word_len = len( word )
        whole_word_freq = self.token_counter[ word ]
        token_l, token_r = word[:-1], word[1:]        
        branch_entropy_l = self.__class__.entropy( whole_word_freq / self.token_counter[token_l] ) if ( token_l in self.token_counter ) and (self.token_counter[token_l] != 0 ) else 0
        branch_entropy_r = self.__class__.entropy( whole_word_freq / self.token_counter[token_r] ) if ( token_r in self.token_counter ) and (self.token_counter[token_r] != 0 ) else 0
        
        # debuging ###
        if not( ( token_l in self.token_counter ) and (self.token_counter[token_l] != 0 ) ):
            print("token_l", word, token_l, self.token_counter[token_l] )
            
        if not ( ( token_r in self.token_counter ) and (self.token_counter[token_r] != 0 ) ):
            print("token_r", word, token_r, self.token_counter[token_r] )
        ### 
        
        return ( ( token_l, branch_entropy_l ), ( token_r, branch_entropy_r ) )

   # return self
        
    def train( self, min_freq = 5, min_window=2, max_window=8  ):
        self.min_freq = min_freq
        self.max_window = max_window
        if min_window < 2:
            self.min_window = 2
            print("!!! Min_window must be greater than 2. Automatically set 2")
        else:
            self.min_window = min_window
        
        corpus_size = len(self.corpus.docs)
        
        for i, doc in enumerate(self.corpus.docs):
            sys.stdout.write("\r# Training ... ({:06d} in {:06d} docs) System memory {:.3f} Gb used".format(i, corpus_size, Tools.get_process_memory() ) )
            for phrase in doc:
                particles = KMcorpus.allgram( phrase, min_window, max_window + 1 ) # branch entropy를 구히기 위해 window 범위를 1씩 늘림
                self.token_counter.update( Counter( particles ) )

                bigrams = KMcorpus.ngram( phrase, n=2 )
                self.bigram_counter.update( Counter( bigrams ) )

        # Token_Counter
        self.token_counter.update( self.unigram_counter ) # branch entropy를 구히기 위해 window 범위를 1씩 늘림
        # Branch Entropy
        self._total_branch_entropy_score()
        print( "\r# Training was done. Used memory {:.3f} Gb".format( Tools.get_process_memory() ) )
        return self
        
    def _total_branch_entropy_score( self ):
        branch_entropy_l = defaultdict(lambda: 0)
        branch_entropy_r = defaultdict(lambda: 0)
        for (w, f) in self.token_counter.items():
            if ( len(w) < self.min_window ): continue
            be_l, be_r = self._branch_entropy_score( w )
            branch_entropy_l[ be_l[0] ] += be_l[1]
            branch_entropy_r[ be_r[0] ] += be_r[1]
        self.total_branch_entropy_l = branch_entropy_l
        self.total_branch_entropy_r = branch_entropy_r
        return self
    
    def extract( self ):
        self._score = defaultdict()
        self._score_header = ['freq', 'cohesion_l', 'cohesion_r', 'cohesion', 'cohesion_s', 'branch_entropy_l', 'branch_entropy_r', 'branch_entropy' ]
        i = 0
        for (w, f) in self.token_counter.items():
            if len(w) > self.max_window: continue
            if len(w) < self.min_window: continue
            if f < self.min_freq: continue

            _score = namedtuple('Score', self._score_header )
            _score.freq = f
            # Cohesion Score
            _score.cohesion_l, _score.cohesion_r, _score.cohesion, _score.cohesion_s  = self._cohesion_score( w )
            # Branch Entropy Score
            _score.branch_entropy_l = self.total_branch_entropy_l[w]
            _score.branch_entropy_r = self.total_branch_entropy_r[w]
            _score.branch_entropy = ( _score.branch_entropy_l + _score.branch_entropy_r ) / 2
            self._score[ w ] = _score
            
            # Report progress
            i += 1
            sys.stdout.write("\r# Extracting ... ({:08d} ) System memory {:.3f} Gb used".format(i, Tools.get_process_memory() ) )

        print("# Extrating was done. System memory {:.3f} Gb used".format( Tools.get_process_memory()) )
        return self
    
    # get score
    def score(self):
        return self._score
    
    def report(self, output_filename, sep="\t", order="cohesion"):
        handler = open(output_filename, 'w', encoding="utf-8")
        header = "token" + sep + sep.join( self._score_header ) + "\n"
        handler.write(header)

        _score_list = self.score().items()
        score_list = sorted( _score_list, key=lambda x: getattr( x[1], order ), reverse=True )
        for word, score in score_list:
            handler.write( word + sep + sep.join( [ "{:01.3f}".format( getattr( score, s ) ) for s in self._score_header ]) + "\n" )
        handler.close()
        print("# {:d} of tokens were reported in {}".format( len( score_list) , output_filename  ) )
        return self
                          
    @staticmethod
    def entropy( p ):
        return -1 * p * math.log2( p )
    

In [70]:
class Segmenter:
    
    def __init__( self, token_with_score, target_score="cohesion_s", score_cutoff=0):
        self.score_list = [ ( tk, getattr( sc, target_score ) ) for tk, sc in token_with_score.items() if getattr( sc, target_score ) >= score_cutoff  ]
        self.score = dict( self.score_list )
        self.tokens = self.score.keys()
        self.target_text = ""
        
    def load( self, text, min_window=2, max_window=8 ):
        _token_candis = set( KMcorpus.allgram(text, min_window, max_window) )
        token_candis_with_score = [ ( it, self.score[it] ) for it in _token_candis if it in self.tokens ]
        self.token_candis = sorted( token_candis_with_score, key=lambda x: (-x[1], -len(x[0] )  ) )
        self.target_text = text
        return self
    
    def segment( self, segment_marker="$" ):
        target_text = self.target_text + ""
        self.segment_marker = segment_marker
        for i, candi in enumerate( self.token_candis ):
            marker = "{0}{1}{0}".format( self.segment_marker, i )
            target_text = marker.join( target_text.split( candi[0] ) )
        
        self.text_segment_marked = target_text
        return self
    
    def show( self, verbose=False, sep="%"):
        target_text = self.text_segment_marked + ""
        for i, candi in enumerate( self.token_candis ):
            marker = "{0}{1}{0}".format( self.segment_marker, i )
            seg = "【{0}/{1:01.3f}】".format( candi[0], candi[1] ) if verbose else "【{}】".format( candi[0] )
            target_text = target_text.replace(marker, seg )
        self.text_segmented = target_text    
        return target_text
        

## TEST

In [71]:
import pprint

text = open("_tmp/_dummy_corpus.txt", 'r', encoding="utf-8").read()
corpus = KMcorpus(text, comments_header="//")
corpus.merge_duplications().merge_variants().remove_comments().remove_punctuation().remove_chrs().merge_spaces().text2docs()

tmp = open("_tmp/_dummy_corpus_clean.txt", 'w', encoding="utf-8")
pp = pprint.PrettyPrinter(indent=4, stream=tmp)
pp.pprint( corpus.docs )


# Duplicated Characters were merged
# Variants Characters were merged
# Comments were removed
# Punctuations were removed
# Korean, Alphabet, Numbers were removed
# Spaces were merged
# Text was converted to List Data


In [87]:
te = TokenExtractor( corpus )
te.train().extract().report("_tmp/_dummy_corpus_tokens.txt")
print("train ending")

# Training was done. Used memory 1.660 Gbstem memory 1.512 Gb used
# Extracting ... (00036585 ) System memory 1.099 Gb used# Extrating was done. System memory 1.099 Gb used
# 36585 of tokens were reported in _tmp/_dummy_corpus_tokens.txt
train ending


In [None]:
sg = Segmenter( te.score(), target_score='cohesion' )
# sgl = Segmenter( te.score, target_score='cohesion_l' )
# sgr = Segmenter( te.score, target_score='cohesion_r' )
sgE = Segmenter( te.score(), target_score='branch_entropy' )


docs = [
    "治風證眩暈. 山茱萸肉 一兩, 山藥ㆍ甘菊ㆍ人參ㆍ川芎ㆍ茯神 各五錢. 右爲末, 每二錢, 酒調下. 《本事》",
    "眞人養生銘曰人欲勞於形百病不能成飮酒勿大醉諸疾自不生食了行百步數以手摩肚寅丑日剪甲頭髮梳百度飽卽立小便飢則坐漩尿行處勿當風居止無小隙常夜濯足臥飽食終無益思慮最傷神喜怒最傷氣每去鼻中毛常習不唾地平明欲起時下床先左脚一日無災殃去邪兼辟惡如能七星步令人長壽樂酸味傷於筋苦味傷於骨甘卽不益肉辛多敗正氣鹹多促人壽不得偏耽嗜春夏少施泄秋冬固陽事獨臥是守眞愼靜最爲貴錢財生有分知足將爲利强知是大患少慾終無累神靜自常安修道宜終始書之屋壁中將以傳君子",
    "久服明目輕身延年酒浸曝乾蒸之如此九次搗爲末每二錢空心溫酒調服一日二次本草",
    "治折傷後爲風寒濕所侵手足疼痛生蒼朮破古紙半生半炒骨碎補穿山甲桑灰炒爲珠生草烏各二兩茴香一兩半右將草烏剉如麥大同連皮生薑四兩擂爛淹兩宿焙乾同前藥爲末酒糊和丸梧子大溫酒下五十丸少麻無妨得效"
]

for sn in docs:
    sg.load( sn ).segment()
    #sgl.load( sn ).segment()
    #sgr.load( sn ).segment()
    sgE.load( sn ).segment()
#     print( sgs.token_candis )
    #print( sg.target_text )
    print( sg.show() )
    #print( sgl.show() )
    #print( sgr.show() )
    print( sgE.show() )
    print("\n")


## SANDBOX

In [64]:
np.power( 0.7, 1/2 )

0.83666002653407556

In [20]:
rst = sorted( te.score.items(), key=lambda it: -1* ( it[1].cohesion_r * it[1].cohesion_l ) )
tmp2 = open("_tmp/_dummy_corpus_score.txt", 'w', encoding="utf-8")
pp = pprint.PrettyPrinter(indent=4, stream=tmp2)
pp.pprint( [ ( r[0], r[1].freq , r[1].cohesion_r, r[1].cohesion_l ) for r in rst ] )

print("ending")

ending


In [13]:
te.unigram_counter.get("枸")

95

In [74]:
from collections import Counter
c = Counter("huihihu ihuihi") 
d = Counter("werwwqweqriiiiiiiiiiii")
c.update(d)
print(c)

Counter({'i': 17, 'h': 5, 'w': 4, 'u': 3, 'e': 2, 'r': 2, 'q': 2, ' ': 1})


In [11]:
97 / 8

12.125

In [24]:
"c" in {'a':1, 'b':2}.keys()


False

In [25]:
dict( [('a', 1), ('b', 2)] )

{'a': 1, 'b': 2}

In [35]:
import math
def entropy( p ):
    return -1 * p * math.log2( p )
entropy( 100 )

-664.3856189774724

In [5]:
"123456789"[1:]

'23456789'

In [64]:


import os
import psutil
import sys
from collections import defaultdict

def get_available_memory():
    """It returns remained memory as percentage"""

    mem = psutil.virtual_memory()
    return 100 * mem.available / (mem.total)

def get_process_memory():
    """It returns the memory usage of current process"""
    
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 ** 3)


sys.stdout.write('\rtraining ... (%d in %d sents) use memory %.3f Gb' % (100, 100, get_process_memory()))
print('\rtraining ... (%d in %d sents) use memory %.3f Gb' % (100, 100, get_process_memory()))
print('\rtraining was done. used memory %.3f Gb' % (get_process_memory()))

training ... (100 in 100 sents) use memory 1.143 Gbtraining ... (100 in 100 sents) use memory 1.143 Gb
training was done. used memory 1.143 Gb
