In [68]:
from collections import defaultdict
from collections import namedtuple
from collections import Counter
import math
import numpy as np
import re

# KMcorpus

class KMcorpus:
    """Plain-text corpus with chainable, regex-based cleaning steps.

    Every cleaning method rewrites ``self.text`` in place and returns ``self``
    so the calls can be chained; ``text2docs`` finally fills ``self.docs``.
    """

    # Character class of the punctuation marks deleted by remove_punctuation().
    # Raw string: same pattern text as before, without invalid-escape warnings.
    PC = r"[,\.!\?！＇，ㆍ．／：；？｀、。·‥…¨〃∼´～˝\(\)\{\}\[\]（）［］｛｝‘’“”〔〕〈〉《》「」『』【】]"

    def __init__( self, text, comments_header="#", doc_sep="\r?\n\r?\n" ):
        """Store the raw text and the settings used by the cleaning steps.

        Args:
            text: the whole corpus as a single string.
            comments_header: literal marker that starts a comment; used by
                remove_comments().
            doc_sep: regular expression that separates documents; used by
                text2docs(). The default matches one blank line.
        """
        self.text = text
        self.doc_sep = doc_sep
        self.comments_header = comments_header
        self.docs = []
        # Preview of the first 100 characters of the input.
        print(self.text[0:100])

    def remove_comments( self ):
        """Delete everything from each ``comments_header`` to the end of its line."""
        # The header is escaped so markers such as "*" or "(" are matched
        # literally instead of being parsed as regex syntax.
        pattern = re.escape( self.comments_header ) + r".*$"
        # MULTILINE makes `$` match at every line end, so each comment stops
        # at the end of its own line.
        self.text = re.sub( pattern, "", self.text, flags=re.MULTILINE ).strip()
        return self

    def remove_punctuation( self ):
        """Delete every character listed in the ``PC`` character class."""
        self.text = re.sub( self.PC, "", self.text ).strip()
        return self

    def remove_chrs( self, chr_types=("Korean", "Alphabet", "Numbers") ):
        """Delete whole character classes from the text.

        Args:
            chr_types: collection of class names to delete. Recognised names
                are "Korean" (characters in the range 가-힣), "Alphabet"
                (a-z, A-Z) and "Numbers" (anything matched by ``\\d``).
                Other names are ignored.
        """
        if "Korean" in chr_types:
            self.text = re.sub( "[가-힣]", "", self.text )
        if "Alphabet" in chr_types:
            self.text = re.sub( "[a-zA-Z]", "", self.text )
        if "Numbers" in chr_types:
            self.text = re.sub( r"\d", "", self.text )
        self.text = self.text.strip()
        return self

    def merge_spaces( self ):
        """Collapse runs of spaces/tabs to one space and drop line-leading blanks."""
        # Greedy `+` is required here: the lazy `+?` form matches a single
        # character at a time, which replaces every blank by a blank and
        # therefore never merges anything.
        self.text = re.sub( r"[ \t]+", " ", self.text )
        self.text = re.sub( r"^[ \t]+", "", self.text, flags=re.MULTILINE ).strip()
        return self

    def text2docs(self):
        """Split the text on ``doc_sep`` and store each document as a list of
        whitespace-separated phrases in ``self.docs``."""
        self.docs = [ doc.strip().split() for doc in re.split( self.doc_sep, self.text ) ]
        return self

    def merge_dupCodepages( self ):
        """Placeholder: not implemented yet, leaves the text unchanged."""
        return self

    def merge_variants( self ):
        """Placeholder: not implemented yet, leaves the text unchanged."""
        return self
    

# Segment

class TokenExtractor:
    """Counts character n-grams of a corpus and scores them by cohesion.

    ``corpus`` must provide ``text`` (a string) and ``docs`` (an iterable of
    documents, each an iterable of phrase strings).
    """

    # Per-token record stored by get_score(). The field name 'cohension_score'
    # (sic) is kept as-is because existing callers read it by that name.
    Score = namedtuple('Score', ['freq', 'cohension_score'])

    def __init__( self, corpus, min_freq = 5 ):
        """
        Args:
            corpus: object exposing ``text`` and ``docs``.
            min_freq: tokens seen fewer times than this are dropped by train().
        """
        self.corpus = corpus
        self.min_freq = min_freq
        # Default until train() sets it, so cohesion_score() can be called
        # on an untrained extractor without an AttributeError.
        self.min_window = 2
        self.token_counter = Counter()
        # Frequency of every single character of the corpus text.
        self.unigram_counter = Counter( self.corpus.text )
        self.score = {}

    def train( self, method="allgram", min_window=2, max_window=8, ngram_size=2 ):
        """Count the n-grams of every phrase and keep the frequent ones.

        Args:
            method: "allgram" counts every n-gram whose length lies between
                min_window and max_window; any other value counts n-grams of
                the fixed length ngram_size.
            min_window: shortest n-gram length for "allgram"; also the minimum
                word length scored by cohesion_score().
            max_window: longest n-gram length for "allgram".
            ngram_size: n-gram length for the fixed-size method.

        Returns:
            self. Counts are added to any counts already in token_counter,
            then entries below min_freq are removed.
        """
        self.min_window = min_window
        use_allgram = ( method == "allgram" )
        for doc in self.corpus.docs:
            for phrase in doc:
                if use_allgram:
                    particles = self.allgram( phrase, min_window, max_window )
                else:
                    particles = self.ngram( phrase, ngram_size )
                self.token_counter.update( particles )

        self.token_counter = Counter(
            { token: freq for token, freq in self.token_counter.items() if freq >= self.min_freq }
        )
        return self

    def ngram( self, text, n):
        """Return every contiguous substring of ``text`` that has length ``n``."""
        return [ text[i:i+n] for i in range( 0, len(text) - n + 1 )  ]

    def allgram( self, text, min_window=2, max_window=8 ):
        """Return all n-grams of ``text`` with min_window <= n <= max_window,
        shortest first; n never exceeds the length of ``text``."""
        longest = min( len(text), max_window )
        rst = []
        for size in range(min_window, longest + 1):
            # Must go through self: ngram is a method, not a module function.
            rst += self.ngram(text, size)
        return rst

    def cohesion_score( self, word ):
        """Return (freq(word) / freq(first character)) ** (1 / (len(word) - 1)).

        Returns 0.0 when the word is empty, shorter than min_window, a single
        character (the exponent would be undefined), or when either frequency
        is zero.
        """
        # Guard before indexing word[0]; also covers the one-character case
        # that would otherwise divide by zero in the exponent.
        if (not word) or ( len( word ) < max( self.min_window, 2 ) ):
            return 0.0

        first_chr_freq = self.unigram_counter[ word[0] ]
        whole_word_freq = self.token_counter[ word ]
        if whole_word_freq == 0 or first_chr_freq == 0:
            return 0.0
        return np.power( ( whole_word_freq / first_chr_freq ), (1 / (len( word ) - 1)) )

    def get_score( self ):
        """Fill ``self.score`` with one Score(freq, cohension_score) per token."""
        for token, freq in self.token_counter.items():
            # A fresh record per token; sharing one object would make every
            # entry show the values of the last token processed.
            self.score[token] = self.Score( freq=freq, cohension_score=self.cohesion_score( token ) )

        return self
            
        



"""
corpus_path = "./CORPUS/DYBG.txt"
# corpus_path = "./CORPUS/DYBG_tn.txt"

corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
print(len(corpus)) # 223,357

word_extractor = WordExtractor(min_count=5)
word_extractor.train( corpus )
word_scores = word_extractor.extract()
"""

'\ncorpus_path = "./CORPUS/DYBG.txt"\n# corpus_path = "./CORPUS/DYBG_tn.txt"\n\ncorpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)\nprint(len(corpus)) # 223,357\n\nword_extractor = WordExtractor(min_count=5)\nword_extractor.train( corpus )\nword_scores = word_extractor.extract()\n'

In [72]:
import pprint

# Read the raw corpus; the context manager closes the handle after reading.
with open("_dummy_corpus.txt", 'r', encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Clean the text step by step and split it into documents.
corpus = KMcorpus(text, comments_header="//")
corpus.remove_comments().remove_punctuation().remove_chrs().merge_spaces().text2docs()

# Dump the cleaned documents. Closing the file via `with` guarantees the
# output is flushed to disk before the next step runs.
with open("_dummy_corpus_clean.txt", 'w', encoding="utf-8") as clean_file:
    pprint.PrettyPrinter(indent=4, stream=clean_file).pprint( corpus.docs )


# Count n-grams and score them.
te = TokenExtractor( corpus )
te.train().get_score()

# Dump (token, frequency, cohesion score) triples.
with open("_dummy_corpus_score.txt", 'w', encoding="utf-8") as score_file:
    pprint.PrettyPrinter(indent=4, stream=score_file).pprint(
        [ (x, te.score[x].freq, te.score[x].cohension_score) for x in te.score ]
    )

print("ending")


// 이 파일은 한국한의학연구원에서 동의보감 원문을 디지타이즈하여 배포한 것입니다. 
// File Info: { encoding: 'UTF-8', end_of_line: 'LF'
ending


In [18]:
"""
text = "1234567890"
text2 = "1234"
text3 = "12"

print( ngram(text, 5) )
print( allgram(text))
print( allgram(text2))
print( allgram(text2, 1))
print( allgram(text3, min_windows=3 ))
"""



'\ntext = "1234567890"\ntext2 = "1234"\ntext3 = "12"\n\nprint( ngram(text, 5) )\nprint( allgram(text))\nprint( allgram(text2))\nprint( allgram(text2, 1))\nprint( allgram(text3, min_windows=3 ))\n'

In [25]:
# Scratch check of the comment-stripping regex: delete everything from "//"
# up to (not including) the end of each line.
pattern = "{}[^\r\n]*$".format( "//" )
# re.MULTILINE is required: without it `$` only matches at the very end of the
# string, so comments on any line but the last are never removed.
regex = re.compile( pattern, re.MULTILINE )
re.sub( regex, "", "self.text//cooment\n//jiojoijoijio\nhiuojiojio" )

'self.text'

In [74]:
from collections import Counter
# Build two character counters, then fold the second one into the first:
# Counter.update() adds counts for shared keys instead of overwriting them.
c = Counter()
c.update("huihihu ihuihi")

d = Counter()
d.update("werwwqweqriiiiiiiiiiii")

c.update(d)
print(c)

Counter({'i': 17, 'h': 5, 'w': 4, 'u': 3, 'e': 2, 'r': 2, 'q': 2, ' ': 1})
