In [2]:
import re
import string
from collections import Counter

# Stemming is done with the Porter stemmer: https://tartarus.org/martin/PorterStemmer/
%run stemming.ipynb

class Tokenization():
    """Lower-case, clean, stopword-filter and stem whitespace-separated text.

    Attributes:
        stopwords: Words dropped by ``remove_stopwords``. It is a ``set``
            while words are being loaded; ``load_stopword_dict`` turns it
            into a sorted ``list``. Every method accepts either form.
        text: Working copy of the text most recently passed to ``cut``.
        porter_stem: ``PorterStemmer`` instance used by ``stemming``.
        pattern: Compiled regex used by ``word_segmentation`` when
            ``version == 'en'``.
    """

    # Maps every character of string.punctuation to a single space.
    _PUNCT_TABLE = str.maketrans(string.punctuation,
                                 ' ' * len(string.punctuation))

    def __init__(self, version='en'):
        """Create a tokenizer.

        Args:
            version: ``'en'`` selects an ASCII-letter token pattern.
                ``'zh-tw'`` only stores an empty-string placeholder.
        """
        self.stopwords = set()
        # Defined up front so the stopword loaders are usable before cut().
        self.text = ''
        self.porter_stem = PorterStemmer()
        if version == 'en':
            self.pattern = re.compile("[a-zA-Z]+", re.U)
            # self.pattern = re.compile("[a-zA-Z0-9]+", re.U)
        elif version == 'zh-tw':
            # NOTE(review): placeholder only; word_segmentation() calls
            # .match() on the pattern, so it cannot work for this version
            # (nor for any other value, where no pattern is set at all).
            self.pattern = ''

    def remove_punctuation(self, text):
        """Return ``text`` with each ``string.punctuation`` char replaced by a space."""
        return text.translate(self._PUNCT_TABLE)

    def calc_word_frequency(self, text):
        """Count whitespace-separated tokens of ``text``.

        Returns:
            A list of ``(word, count)`` tuples ordered by descending count;
            words with equal counts keep their order of first appearance.
        """
        return Counter(text.split()).most_common()

    def _add_stopwords(self, words):
        """Merge ``words`` into ``self.stopwords``, whether it is a set or a list."""
        merged = set(self.stopwords)
        merged.update(words)
        self.stopwords = merged

    def load_stopword_defaultdict(self, top_freq=3):
        """Add the ``top_freq`` most frequent words of ``self.text`` as stopwords.

        If the text has fewer distinct words than ``top_freq``, all of them
        are added; a non-positive ``top_freq`` adds nothing.
        """
        words_freq = self.calc_word_frequency(self.text)
        # Slicing (rather than indexing) tolerates short or empty texts.
        top_words = words_freq[:max(top_freq, 0)]
        self._add_stopwords(word for word, _ in top_words)

    def load_stopword_userdict(self, file_name):
        """Add stopwords from ``file_name``, one word per line.

        Surrounding whitespace is stripped and blank lines are ignored.
        """
        with open(file_name, 'r') as f:
            words = [line.strip() for line in f]
        self._add_stopwords(word for word in words if word)

    def load_stopword_dict(self):
        """Finalize the stopword collection as a sorted list.

        If no stopwords have been loaded yet, they are first derived from
        ``self.text`` via ``load_stopword_defaultdict``.
        """
        if not self.stopwords:
            self.load_stopword_defaultdict()
        self.stopwords = sorted(self.stopwords)

    def remove_stopwords(self, text):
        """Return ``text`` without the tokens found in ``self.stopwords``."""
        # Build the lookup set once: self.stopwords may be a list here.
        blocked = set(self.stopwords)
        return ' '.join(word for word in text.split() if word not in blocked)

    def word_segmentation(self, word_sequence):
        """Keep only the tokens whose beginning matches ``self.pattern``.

        ``match`` anchors at the start of the token only, so a token such
        as ``abc123`` passes the ``'en'`` pattern while ``123abc`` does not.
        """
        return ' '.join(word for word in word_sequence.split()
                        if self.pattern.match(word) is not None)

    def stemming(self, text):
        """Stem every whitespace-separated token of ``text``.

        Returns:
            A list of stemmed tokens.
        """
        stems = (self.porter_stem.stem(word, 0, len(word) - 1)
                 for word in text.split())
        # Re-split so that an empty stem produces no token.
        return ' '.join(stems).split()

    def cut(self, text, stopword=True, stemming=True):
        """Tokenize ``text`` and return the tokens joined by single spaces.

        The text is lower-cased and stripped of punctuation. On the first
        call with no stopwords loaded, the most frequent words of the
        cleaned text become the stopwords; they are kept for later calls.

        Args:
            text: Input string.
            stopword: Remove stopwords when true.
            stemming: When true, filter tokens with ``word_segmentation``
                and stem them; when false, both steps are skipped.
        """
        self.text = self.remove_punctuation(text.lower())
        # Stopwords are derived after punctuation removal: a stopword that
        # still carries punctuation could never match a cleaned token.
        self.load_stopword_dict()
        if stopword:
            self.text = self.remove_stopwords(self.text)
        if stemming:
            word_sequence = self.stemming(self.word_segmentation(self.text))
        else:
            word_sequence = self.text.split()
        return ' '.join(word_sequence)
    