In [1]:
import os
import psutil
import sys

class Tools:
    # Frome here : https://github.com/lovit/soynlp/tree/master/soynlp/utils
    @staticmethod
    def get_available_memory():
        """It returns remained memory as percentage"""

        mem = psutil.virtual_memory()
        return 100 * mem.available / (mem.total)

    def get_process_memory():
        """It returns the memory usage of current process"""

        process = psutil.Process(os.getpid())
        return process.memory_info().rss / (1024 ** 3)

    def progress_symbol():
        return ['\\', '|', '/', '―']
    
    # Print iterations progress
    def print_progress(iteration, total, prefix='Progress', suffix='Complete', decimals=1, bar_length=100):
        """
        Call in a loop to create terminal progress bar
        @params:
            iteration   - Required  : current iteration (Int)
            total       - Required  : total iterations (Int)
            prefix      - Optional  : prefix string (Str)
            suffix      - Optional  : suffix string (Str)
            decimals    - Optional  : positive number of decimals in percent complete (Int)
            bar_length  - Optional  : character length of bar (Int)
        """
        str_format = "{0:." + str(decimals) + "f}"
        percents = str_format.format(100 * (iteration / float(total)))
        filled_length = int(round(bar_length * iteration / float(total)))
        bar = '█' * filled_length + '-' * (bar_length - filled_length)

        sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix)),

        if iteration == total:
            sys.stdout.write('\n')
        sys.stdout.flush()
        # ref : https://gist.github.com/aubricus/f91fb55dc6ba5557fbab06119420dd6a

In [2]:
# KMcorpus

from collections import defaultdict
from collections import namedtuple
from collections import Counter
import math
import re
import yaml

class KMcorpus:
    
    def __init__( self, text, doc_sep="(\r?\n){2,}" ):
        self.text = text
        self.doc_sep = doc_sep
        self.docs = []
        
    def remove_comments( self, comments_header="#" ):
        _comments_pattern = "{}.*?$".format( comments_header )
        self.comments_pattern = re.compile( _comments_pattern, re.MULTILINE|re.DOTALL )
        self.text = re.sub( self.comments_pattern, " ", self.text ).strip()
        print("# Comments were removed")
        return self
        
    def remove_punctuation( self, punctuations="[,\.!\?！\:;＇，ㆍ．／：；？｀、。·‥…¨〃∼´～˝\-\\\(\)\{\}\[\]（）［］｛｝‘’“”〔〕〈〉《》「」『』【】%\$]" ): 
        self.punctuations_pattern = re.compile( punctuations )
        self.text = re.sub( self.punctuations_pattern , " ", self.text ).strip()
        print("# Punctuations were removed")
        return self
    
    def remove_chrs( self, chr_types=["Korean", "Alphabet", "Numbers"] ):
        if "Korean" in chr_types:
            self.text = re.sub( re.compile("[가-힣]"), " ", self.text )
        if "Alphabet" in chr_types:
            self.text = re.sub( re.compile("[a-zA-Z]"), " ", self.text )
        if "Numbers" in chr_types:
            self.text = re.sub( re.compile("[\d]+?"), " ", self.text )
        self.text = self.text.strip()
        print("# {} were removed".format( ", ".join( chr_types  ) ) )
        return self
    
    def merge_spaces( self ):
        self.text = re.sub( re.compile("[ \t]+"), " ", self.text )
        self.text = re.sub( re.compile("^[ \t]+", re.MULTILINE|re.DOTALL), "", self.text ).strip()
        print("# Spaces were merged")
        return self
        
    def text2docs(self):
        docs = re.split( re.compile( self.doc_sep ), self.text )
        self.docs = [ doc.strip().split() for doc in docs ]
        print("# Text was converted to List Data")
        return self
    
    def merge_duplications(self, dict_path="dicts/duplications.dic" ):
        print("# Duplicated Characters were merged")
        self.text = self.__class__.merge_chrs( self.text, dict_path ) 
        return self
    
    def merge_variants(self, dict_path="dicts/variants.dic" ):
        print("# Variants Characters were merged")
        self.text = self.__class__.merge_chrs( self.text, dict_path ) 
        return self
    
    def export(self, output_filename, plain_text=True ):
        stream = open(output_filename, 'w', encoding="utf-8")
        if plain_text:
            stream.write( self.text )
        else:
            yaml.dump( self.docs, stream, default_flow_style=False,  allow_unicode=True )
        stream.close()
        print( "# File {} was Created".format( output_filename ) )
        
    
    @staticmethod
    def merge_chrs(text, dict_path ):
        dic = open(dict_path, 'r', encoding='utf-8').readlines()
        text_ = text + ""
        for pair in dic:
            a, b = pair.split("\t")
            text_ = text_.replace(a, b)
        return text_
    
    def ngram( text, n):
        return [ text[i:i+n] for i in range( 0, len(text) - n + 1 )  ]

    def allgram( text, min_window=2, max_window=8 ):
        len_txt = len(text)
        mx_wd = len_txt if ( len_txt < max_window ) else max_window
        rst = []
        for i in range(min_window, mx_wd + 1):
            rst += KMcorpus.ngram(text, i)
        return rst

In [3]:
# Segment

from collections import defaultdict
from collections import namedtuple
from collections import Counter
import math


class TokenExtractor:
    
    def __init__( self, corpus ):
        self.corpus = corpus
        self.token_counter = Counter()
        self.unigram_counter = Counter( self.corpus.text )
        self.bigram_counter = Counter()
        
    def _cohesion_score( self, word ):
        word_len = len( word )
        if (not word) or ( word_len < self.min_window ):
            return 0
        
        first_chr_freq = self.unigram_counter[ word[0] ]
        last_chr_freq = self.unigram_counter[ word[-1] ]
        whole_word_freq = self.token_counter[ word ]
        
        cohesion_l = 0 if whole_word_freq == 0 else math.pow( ( whole_word_freq / first_chr_freq ), (1 / (word_len - 1)) )
        cohesion_r = 0 if whole_word_freq == 0 else math.pow( ( whole_word_freq / last_chr_freq ), (1 / (word_len - 1)) )
        cohesion = math.sqrt(cohesion_l * cohesion_r)
        return ( cohesion_l, cohesion_r, cohesion , (cohesion_l + cohesion_r)/2 )

    def _branch_entropy_score( self, word ):
        word_len = len( word )
        whole_word_freq = self.token_counter[ word ]
        token_l, token_r = word[:-1], word[1:]        
        branch_entropy_l = self.__class__.entropy( whole_word_freq / self.token_counter[token_l] ) if ( token_l in self.token_counter ) and (self.token_counter[token_l] != 0 ) else 0
        branch_entropy_r = self.__class__.entropy( whole_word_freq / self.token_counter[token_r] ) if ( token_r in self.token_counter ) and (self.token_counter[token_r] != 0 ) else 0
        
        # debuging ###
        if not( ( token_l in self.token_counter ) and (self.token_counter[token_l] != 0 ) ):
            print("token_l", word, token_l, self.token_counter[token_l] )
            
        if not ( ( token_r in self.token_counter ) and (self.token_counter[token_r] != 0 ) ):
            print("token_r", word, token_r, self.token_counter[token_r] )
        ### 
        
        return ( ( token_l, branch_entropy_l ), ( token_r, branch_entropy_r ) )

   # return self
        
    def train( self, min_freq = 5, min_window=2, max_window=8  ):
        self.min_freq = min_freq
        self.max_window = max_window
        if min_window < 2:
            self.min_window = 2
            print("!!! Min_window must be greater than 2. Automatically set 2")
        else:
            self.min_window = min_window
        
        corpus_size = len(self.corpus.docs)
        
        print("# Training ... ")
        for i, doc in enumerate(self.corpus.docs):
            sys.stdout.flush()
            Tools.print_progress(i, corpus_size, prefix='Progress', suffix='Complete')
            for phrase in doc:
                particles = KMcorpus.allgram( phrase, min_window- 1 , max_window + 1 ) # branch entropy를 구히기 위해 window 범위를 1씩 늘림
                self.token_counter.update( Counter( particles ) )

                bigrams = KMcorpus.ngram( phrase, n=2 )
                self.bigram_counter.update( Counter( bigrams ) )

        # Branch Entropy
        self._total_branch_entropy_score()
        print( "# Training was done. Used memory {:.3f} Gb".format( Tools.get_process_memory() ) )
        return self
        
    def _total_branch_entropy_score( self ):
        branch_entropy_l = defaultdict(lambda: 0)
        branch_entropy_r = defaultdict(lambda: 0)
        for (w, f) in self.token_counter.items():
            if ( len(w) < self.min_window ): continue
            be_l, be_r = self._branch_entropy_score( w )
            branch_entropy_l[ be_l[0] ] += be_l[1]
            branch_entropy_r[ be_r[0] ] += be_r[1]
        self.total_branch_entropy_l = branch_entropy_l
        self.total_branch_entropy_r = branch_entropy_r
        return self
    
    def extract( self ):
        self._score = defaultdict()
        self._score_header = ['freq', 'cohesion_l', 'cohesion_r', 'cohesion', 'cohesion_s', 'branch_entropy_l', 'branch_entropy_r', 'branch_entropy' ]
        _score = namedtuple('Score', self._score_header )
        i = 0
        total_len = len( self.token_counter )
        print("\r# Extracting ..." )
        for (w, f) in self.token_counter.items():
            if len(w) > self.max_window: continue
            if len(w) < self.min_window: continue
            if f < self.min_freq: continue

            _freq = f
            # Cohesion Score
            _cohesion_l, _cohesion_r, _cohesion, _cohesion_s  = self._cohesion_score( w )
            # Branch Entropy Score
            _branch_entropy_l = self.total_branch_entropy_l[w]
            _branch_entropy_r = self.total_branch_entropy_r[w]
            _branch_entropy = ( _branch_entropy_l + _branch_entropy_r ) / 2
            self._score[ w ] = _score( _freq, _cohesion_l, _cohesion_r, _cohesion, _cohesion_s, _branch_entropy_l, _branch_entropy_r, _branch_entropy )
            
            # Report progress
            Tools.print_progress(i, total_len, prefix='Progress', suffix='Complete')
            i += 1

        print("# Extrating was done. System memory {:.3f} Gb used".format( Tools.get_process_memory()) )
        return self
    
    # get score
    def score(self):
        return self._score
    
    def report(self, output_filename, sep="\t", order="cohesion"):
        handler = open(output_filename, 'w', encoding="utf-8")
        header = "token" + sep + sep.join( self._score_header ) + "\n"
        handler.write(header)

        _score_list = self.score().items()
        score_list = sorted( _score_list, key=lambda x: getattr( x[1], order ), reverse=True )
        for word, score in score_list:
            handler.write( word + sep + sep.join( [ "{:01.3f}".format( getattr( score, s ) ) for s in self._score_header ]) + "\n" )
        handler.close()
        print("# {:d} of tokens were reported in {}".format( len( score_list) , output_filename  ) )
                          
    @staticmethod
    def entropy( p ):
        return -1 * p * math.log2( p )
    

In [4]:
from collections import defaultdict
from collections import namedtuple
from collections import Counter
import re

class Segmenter:
    
    def __init__( self, token_with_score, target_score="cohesion", score_cutoff=0):
        self.score_list = [ ( tk, getattr( sc, target_score ) ) for tk, sc in token_with_score.items() if getattr( sc, target_score ) >= score_cutoff  ]
        self.score = dict( self.score_list )
        self.tokens = self.score.keys()
        self.target_text = ""
        
    def load( self, text, min_window=2, max_window=8 ):
        _token_candis = set( KMcorpus.allgram(text, min_window, max_window) )
        token_candis_with_score = [ ( it, self.score[it] ) for it in _token_candis if it in self.tokens ]
        self.token_candis = sorted( token_candis_with_score, key=lambda x: (-x[1], -len(x[0] )  ) )
        self.target_text = text
        return self
    
    def segment( self, segment_marker="%" ):
        target_text = self.target_text + ""
        self.segment_marker = segment_marker
        for i, candi in enumerate( self.token_candis ):
            marker = "{0}{0}{1}{0}{0}".format( self.segment_marker, i )
            target_text = marker.join( target_text.split( candi[0] ) )
        
        self.text_segment_marked = target_text
        return self
    
    def to_string( self, verbose=False, keyword_only=False, sep=" " ):
        target_text = self.text_segment_marked + ""
        if keyword_only:
            self.text_segmented = sep.join( self.to_list(verbose=False, keyword_only=True) )
        else:
            for i, candi in enumerate( self.token_candis ):
                marker = "{0}{0}{1}{0}{0}".format( self.segment_marker, i )
                seg = "【{0}/{1:01.3f}】".format( candi[0], candi[1] ) if verbose else "【{}】".format( candi[0] )
                target_text = target_text.replace(marker, seg )
            self.text_segmented = target_text    
        return self.text_segmented

    def to_list( self, verbose=False, keyword_only=False ):
        target_text = self.text_segment_marked + ""
        
        if keyword_only:
            rg = re.compile( "\{0}\{0}\d+?\{0}\{0}".format( self.segment_marker ) )
            _segment_list = re.findall( rg, target_text )
            segment_list = [ self.token_candis[ int(it[2:-2]) ] for it in _segment_list ] if verbose else [ self.token_candis[ int(it[2:-2]) ][0] for it in _segment_list ]
        else:
            segment_list= re.split(r"[【】]", self.to_string( verbose=False, keyword_only=False ) )
        self.list_segmented = list( filter( None, segment_list ) )
        return self.list_segmented
    


In [5]:
from collections import defaultdict
from collections import namedtuple
from collections import Counter
import re
from itertools import chain, combinations

class COQuantifier:  # co-occurrence 
    
    score_header = ['total_freq', 'observed_cooccurrence', 'expected_cooccurrence', 't_score']
    
    def __init__( self ):
        """"""

    def load( self, text_segmented, text_pairing=False, doc_sep="(\r?\n){2,}", token_sep="[\s]+"  ):
        self.text = text_segmented
        self.text_pairing = text_pairing
        self.doc_sep = doc_sep
        self.docs = list( map( lambda x: x.strip(), re.split( re.compile( self.doc_sep ), self.text ) ) )
        self.token_sep = token_sep
        return self
    
    def count( self ):
        self._count_freq()._count_cooccurrence()
        print("# All Tokens were Counted")
        return self
    
    def _count_freq( self ):
        """ 전체 단어 빈도 조사 """
        self.tokens = self.__class__.tokenize( self.text, self.token_sep )
        self.token_size = len( self.tokens )
        self.token_freq = Counter( self.tokens )
        return self
    
    def _count_cooccurrence( self, allow_duplicate_counts=True ): #doc 안에 같이 나오면 같이 등장하는 것으로.
        """ 문단 별 공기어 빈도 누적 (한 문단 안에 중복해 나와도 거듭 카운트) """
        _cooccurrence = defaultdict(lambda: 0)
        doc_size = len( self.docs )
        print("\r# Co-occurrence Counting ... ")
        for i, doc in enumerate(self.docs):        
            _lines = re.split(r"\r?\n", doc)
            _count = Counter()
            for line in _lines:
                _count.update( Counter( self.__class__.tokenize( doc, self.token_sep ) ) )
            _keys = _count.keys()
            for pair in combinations( _keys, 2 ) :
                _cooccurrence[ pair ] += _count[ pair[1] ]
            Tools.print_progress(i, doc_size, prefix='Progress', suffix='Complete')
        self.cooccurrence = _cooccurrence
        sys.stdout.flush()
        print("\n# Co-occurrence Counting was done. System memory {:.3f} Gb used".format( Tools.get_process_memory()) )
        return self
    
    def scoring( self ):
        # t-score
        self.scores = defaultdict( lambda: 0)
        # template
        _tpl = namedtuple("Scores", self.score_header )
        total_len = len( self.cooccurrence )
        print("\r# Co-occurrence Scores Generating ... ") 
        i = 0
        for a, b in self.cooccurrence:
            _freq_a, _freq_b = self.token_freq[a], self.token_freq[b]
            _expected = ( _freq_a * _freq_b ) / self.token_size
            _observed = self.cooccurrence[ (a,b) ]
            _t_score = self.__class__.get_Tscore( _observed, _expected )
            self.scores[ (a,b) ] = _tpl( ( _freq_a, _freq_b ),  _observed, _expected,  _t_score ) 
            Tools.print_progress(i, total_len, prefix='Progress', suffix='Complete')
            i += 1
        sys.stdout.flush()
        print("# Co-occurrence Scores were generated. System memory {:.3f} Gb used".format( Tools.get_process_memory() ) )
        return self
    
    def export( self, file_name, target_token, cutoff=1.5  ):
        stream = open(file_name, 'w', encoding="utf-8")
        stream.write( "\t".join( [ "tokens" ] + self.score_header ) + "\n" )
        _rst = self.report( target_token, cutoff )
        for token, score in _rst:
            _tmp = list( score )
            _total_freq = [ str( _tmp[0][1] ) ]
            _etc = list( map( lambda x: "{:.3f}".format(x), _tmp[1:] ) )
            stream.write( "\t".join(  [ token ] + _total_freq + _etc )  + "\n" )
        print("# Report File was exported ")
            
            
    def report( self, target_token, cutoff=1.5 ):
        _rst = [ ( b, self.scores[(a,b)] ) for (a, b) in self.scores if a == target_token ]
        _rst_gt = [ (b, sc) for b, sc in _rst if sc.t_score >= cutoff ]
        return sorted( _rst_gt, key=lambda x: x[1].t_score, reverse=True )

        
    @staticmethod
    def tokenize( line, sep ):
        _tokens = list( map( lambda x: x.strip(), re.split( re.compile( sep ), line ) ) )
        tokens = list( filter( None, _tokens ) )    # remove empty string
        return tokens
        
    def get_Tscore( observed , expected ): 
        if observed == 0:
            return 0
        else:
            return float( observed - expected ) / math.sqrt( observed )  

    

## TEST

In [14]:
text = open("_tmp/_dummy_formulas.yml", 'r', encoding="utf-8").read()
corpus = KMcorpus(text[0:10000])
corpus.merge_duplications().remove_comments( comments_header="//" ).remove_punctuation().remove_chrs().merge_spaces().text2docs()
corpus.export("_tmp/_dummy_formulas_clean.txt", plain_text=True )
# corpus.export("_tmp/_dummy_formulas_docs.txt", plain_text=False )


# Duplicated Characters were merged
# Comments were removed
# Punctuations were removed
# Korean, Alphabet, Numbers were removed
# Spaces were merged
# Text was converted to List Data
# File _tmp/_dummy_formulas_clean.txt was Created


In [15]:
te = TokenExtractor( corpus )
te.train().extract().report("_tmp/_dummy_formulas_tokens.txt")

# Training ... 
Progress |███████████████████████████████████████████████████████████████████████████████████████████████████-| 99.4% Complete# Training was done. Used memory 0.493 Gb
# Extracting ...
Progress |███-------------------------------------------------------------------------------------------------| 3.1% Complete# Extrating was done. System memory 0.493 Gb used
# 137 of tokens were reported in _tmp/_dummy_formulas_tokens.txt


In [17]:
sg = Segmenter( te.score(), target_score='cohesion' )
input_doc = open("_tmp/_dummy_formulas_clean.txt", 'r', encoding="utf-8")
output_doc = open("_tmp/_dummy_formulas_tokenized.txt", 'w', encoding="utf-8")
for line in input_doc.readlines():
    if line.strip() == "":
        output_doc.write("\n")
    else:
        output_doc.write( sg.load( line ).segment().to_string( verbose=False, keyword_only=False ) )
        output_doc.write("\n")
print("Completed")    

Completed


In [12]:
txt = open("_tmp/_dummy_formulas_tokenized.txt", 'r', encoding="utf-8").read()
cq = COQuantifier().load( txt[0:10000], text_pairing=False ).count().scoring()

# Co-occurrence Counting ... 
Progress |----------------------------------------------------------------------------------------------------| 0.0% Complete
# Co-occurrence Counting was done. System memory 0.493 Gb used
# All Tokens were Counted
# Co-occurrence Scores Generating ... 
# Co-occurrence Scores were generated. System memory 0.493 Gb used


In [13]:
cq.export( "_tmp/_dummy_formulas_대황.txt", "大黄", cutoff=0.1 )

# Report File was exported 


In [None]:
sg = Segmenter( te.score(), target_score='cohesion' )
# sgl = Segmenter( te.score, target_score='cohesion_l' )
# sgr = Segmenter( te.score, target_score='cohesion_r' )
sgE = Segmenter( te.score(), target_score='branch_entropy' )


docs = [
    "治風證眩暈. 山茱萸肉 一兩, 山藥ㆍ甘菊ㆍ人參ㆍ川芎ㆍ茯神 各五錢. 右爲末, 每二錢, 酒調下. 《本事》",
    "眞人養生銘曰人欲勞於形百病不能成飮酒勿大醉諸疾自不生食了行百步數以手摩肚寅丑日剪甲頭髮梳百度飽卽立小便飢則坐漩尿行處勿當風居止無小隙常夜濯足臥飽食終無益思慮最傷神喜怒最傷氣每去鼻中毛常習不唾地平明欲起時下床先左脚一日無災殃去邪兼辟惡如能七星步令人長壽樂酸味傷於筋苦味傷於骨甘卽不益肉辛多敗正氣鹹多促人壽不得偏耽嗜春夏少施泄秋冬固陽事獨臥是守眞愼靜最爲貴錢財生有分知足將爲利强知是大患少慾終無累神靜自常安修道宜終始書之屋壁中將以傳君子",
    "久服明目輕身延年酒浸曝乾蒸之如此九次搗爲末每二錢空心溫酒調服一日二次本草",
    "治折傷後爲風寒濕所侵手足疼痛生蒼朮破古紙半生半炒骨碎補穿山甲桑灰炒爲珠生草烏各二兩茴香一兩半右將草烏剉如麥大同連皮生薑四兩擂爛淹兩宿焙乾同前藥爲末酒糊和丸梧子大溫酒下五十丸少麻無妨得效"
]

for sn in docs:
    sg.load( sn ).segment()
    #sgl.load( sn ).segment()
    #sgr.load( sn ).segment()
    sgE.load( sn ).segment()
#     print( sgs.token_candis )
    #print( sg.target_text )
    print( sg.to_string( verbose=True, keyword_only=True ) ) # not working
    print( sg.to_string( verbose=True, keyword_only=False ) )
    print( sg.to_string( verbose=False, keyword_only=True ) ) 
    print( sg.to_string( verbose=False, keyword_only=False ) )
    
    print( sg.to_list( verbose=True, keyword_only=True ) )
    print( sg.to_list( verbose=True, keyword_only=False ) ) # not working
    print( sg.to_list( verbose=False, keyword_only=True ) )
    print( sg.to_list( verbose=False, keyword_only=False ) )
    
    #print( sgl.show() )
    #print( sgr.show() )
#     print( sgE.show() )
    print("\n")


## SANDBOX

In [None]:
np.power( 0.7, 1/2 )

In [None]:
rst = sorted( te.score.items(), key=lambda it: -1* ( it[1].cohesion_r * it[1].cohesion_l ) )
tmp2 = open("_tmp/_dummy_corpus_score.txt", 'w', encoding="utf-8")
pp = pprint.PrettyPrinter(indent=4, stream=tmp2)
pp.pprint( [ ( r[0], r[1].freq , r[1].cohesion_r, r[1].cohesion_l ) for r in rst ] )

print("ending")

In [None]:
te.unigram_counter.get("枸")

In [None]:
from collections import Counter
c = Counter("huihihu ihuihi") 
d = Counter("werwwqweqriiiiiiiiiiii")
c.update(d)
print(c)

In [None]:
97 / 8

In [None]:
"c" in {'a':1, 'b':2}.keys()


In [None]:
dict( [('a', 1), ('b', 2)] )

In [None]:
import math
def entropy( p ):
    return -1 * p * math.log2( p )
entropy( 100 )

In [None]:
"123456789"[1:]

In [None]:


import os
import psutil
import sys
from collections import defaultdict

def get_available_memory():
    """It returns remained memory as percentage"""

    mem = psutil.virtual_memory()
    return 100 * mem.available / (mem.total)

def get_process_memory():
    """It returns the memory usage of current process"""
    
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 ** 3)


sys.stdout.write('\rtraining ... (%d in %d sents) use memory %.3f Gb' % (100, 100, get_process_memory()))
print('\rtraining ... (%d in %d sents) use memory %.3f Gb' % (100, 100, get_process_memory()))
print('\rtraining was done. used memory %.3f Gb' % (get_process_memory()))

In [None]:
import re
rg = re.compile("{0}\d+?{0}".format("%") )
# rg = re.compile("\d+?" )
txt = "朮破古紙半%1%生半炒骨碎%12%補穿山甲桑%8%灰炒爲珠生"
print( re.findall( rg, txt) )

In [None]:
for x in Counter(['a', 'a', 'b']):
    print(x)

from itertools import chain, combinations
k = Counter(['a', 'a', 'b', 'c'])
combinations( k.keys(), 2) 


In [None]:
k = ['\\', '|', '/', '-']
for i in range(0, 1000):
    sys.stdout.write("\r# {}".format( k[i % 4]) )
    
    