In [3]:
import re
import string
from itertools import groupby

# Stemming uses the Porter stemmer (https://tartarus.org/martin/PorterStemmer/);
# stemming.ipynb is expected to define the PorterStemmer class used below.
%run stemming.ipynb

class Tokenization:
    """Whitespace-token text normaliser.

    The pipeline implemented by ``cut`` is: lower-case, replace punctuation
    with spaces, separate letter runs from non-letter runs, drop stop words,
    filter tokens with ``pattern`` and stem them with the Porter stemmer.

    Attributes:
        stopwords: Set of words removed by ``remove_stopwords``. It starts
            empty; fill it with ``load_stopword_userdict``.
        porter_stem: ``PorterStemmer`` instance used by ``stemming``.
        pattern: Compiled regex a token must match (at its start) to survive
            ``word_segmentation``, or ``None`` when no filter is configured.
        text: Result of the most recent ``cut`` call before the final
            whitespace normalisation (only set once ``cut`` has run).
    """

    # Translation table mapping every ASCII punctuation character to a space.
    _PUNCT_TO_SPACE = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )

    def __init__(self, version='en'):
        """Create a tokenizer.

        Args:
            version: Language variant. ``'en'`` installs an ASCII
                alphanumeric token filter. ``'zh-tw'`` and any other value
                install no filter, so ``word_segmentation`` keeps every
                token instead of failing.
        """
        self.stopwords = set()
        self.porter_stem = PorterStemmer()
        # Default to "no filter" so word_segmentation() never hits a missing
        # or non-regex attribute for versions without a defined pattern.
        self.pattern = None
        if version == 'en':
            self.pattern = re.compile("[a-zA-Z0-9]+", re.U)

    def remove_punctuation(self, text):
        """Return ``text`` with each ``string.punctuation`` char replaced by a space."""
        return text.translate(self._PUNCT_TO_SPACE)

    def load_stopword_userdict(self, file_name):
        """Add the stop words listed in ``file_name`` (one per line).

        Surrounding whitespace is stripped and blank lines are ignored.

        Args:
            file_name: Path of the stop-word file to read.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(file_name, 'r') as f:
            for line in f:
                word = line.strip()
                if word:
                    self.stopwords.add(word)

    def load_stopword_dict(self, file_name=None):
        """Make sure the stop-word set is ready for use.

        Args:
            file_name: Optional stop-word file to merge into ``stopwords``.
                When omitted, the words already loaded are left untouched.
        """
        # The stop words stay in a set: membership tests remain O(1) and
        # further load_stopword_userdict() calls keep working.
        if file_name is not None:
            self.load_stopword_userdict(file_name)

    def remove_stopwords(self, text):
        """Return ``text`` with stop words removed, tokens joined by one space."""
        stopwords = self.stopwords
        return ' '.join(word for word in text.split() if word not in stopwords)

    def split_num(self, text):
        """Separate letter runs from non-letter runs inside each token.

        For example ``'abc123def'`` becomes ``'abc 123 def'``. A non-letter
        run is padded with a space on both sides and every input token is
        followed by one space, so the result may contain repeated or trailing
        spaces; callers are expected to ``split()`` it.

        Args:
            text: Whitespace-separated tokens.

        Returns:
            The re-spaced text.
        """
        pieces = []
        for word in text.split():
            if word.isalpha():
                pieces.append(word)
            else:
                # Group consecutive characters by "is a letter" rather than
                # using an in-band marker, so no input character is special.
                for is_alpha, run in groupby(word, key=str.isalpha):
                    chunk = ''.join(run)
                    pieces.append(chunk if is_alpha else ' ' + chunk + ' ')
            pieces.append(' ')
        return ''.join(pieces)

    def word_segmentation(self, word_sequence):
        """Keep only the tokens accepted by ``pattern``.

        ``pattern.match`` anchors at the start of the token only, so a token
        is kept when it *begins* with a matching run. When no pattern is
        configured every token is kept.

        Args:
            word_sequence: Whitespace-separated tokens.

        Returns:
            The surviving tokens joined by single spaces.
        """
        words = word_sequence.split()
        if not self.pattern:
            return ' '.join(words)
        matches = self.pattern.match
        return ' '.join(word for word in words if matches(word) is not None)

    def stemming(self, text):
        """Stem every token of ``text`` with the Porter stemmer.

        Returns:
            The stemmed tokens, each followed by a single space.
        """
        stem = self.porter_stem.stem
        return ''.join(
            stem(word, 0, len(word) - 1) + ' ' for word in text.split()
        )

    def cut(self, text, splitnum=True, stopword=True, stemming=True):
        """Run the normalisation pipeline on ``text``.

        Args:
            text: Raw input text.
            splitnum: Separate letter runs from non-letter runs.
            stopword: Remove the words held in ``stopwords``.
            stemming: Filter tokens with ``pattern`` and stem them.

        Returns:
            The processed tokens joined by single spaces.
        """
        text = self.remove_punctuation(text.lower())
        if splitnum:
            text = self.split_num(text)
        if stopword:
            self.load_stopword_dict()
            text = self.remove_stopwords(text)
        if stemming:
            text = self.stemming(self.word_segmentation(text))
        # Kept as an attribute for callers that inspect the last result.
        self.text = text
        return ' '.join(text.split())