In [None]:
import pickle
import sys

# Make modules in the parent directory importable; the entry is appended
# only once so re-running the cell does not grow sys.path.
if sys.path.count('..') == 0:
    sys.path.append('..')
    
from main_fit_vectorizer import TFIDF_Vectorizer

In [None]:
# Load a pickled object from ../data into `vectorizer`
# (presumably a fitted TFIDF_Vectorizer, given the import above - confirm).
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
# NOTE(review): in the cells visible here `vectorizer` is referenced only from
# commented-out lines - confirm this cell is still needed.
with open('../data/vectorizer_1234.pck', 'rb') as pf:
    vectorizer = pickle.load(pf)

In [1]:
## Tokenizer for mixed Thai/English text.
## Tokenizer.tokenizer(text) returns the tokens of `text` as a set.

class Tokenizer:
    """Tokenizer for mixed Thai/English free text.

    The pipeline strips e-mail addresses, URLs, phone numbers and Thai
    honorific names, separates Thai from non-Thai runs, removes characters
    outside the configured charset and Thai stop words, stems English words,
    segments Thai chunks with ``deepcut`` and finally adds Thai n-grams.

    Parameters
    ----------
    n_gram : int
        Largest n-gram size to generate from Thai tokens (``1`` disables
        n-gram generation).
    stop_en, stop_th, keyword : str, optional
        File names (inside the resource directory) holding one entry per
        line: English stop words, Thai stop words and protected keywords
        that are neither stemmed nor split.
    resource_dir : str, optional
        Directory containing ``charset`` and the optional word lists.
        Defaults to ``<cwd>/../Resource``.
    """

    def __init__(self, n_gram, stop_en=None, stop_th=None, keyword=None, resource_dir=None):

        # Heavy / third-party imports stay local so that merely defining the
        # class does not require them.
        import os
        import deepcut
        from nltk.tokenize import TreebankWordTokenizer
        from nltk.stem.snowball import EnglishStemmer

        if resource_dir is None:
            resource_dir = os.path.join(os.getcwd(), '..', 'Resource')

        self.test_text = 'ตัวอย่างความต้องการใช้ตัวอย่างความต้องการลีนุ๊กซ์การใช้ยากลำบาก'
        self._compile_patterns()
        self.charset = self._load_charset(os.path.join(resource_dir, 'charset'))
        self.eng_tokenizer = TreebankWordTokenizer()
        self.stemming = EnglishStemmer()
        self.n_gram = n_gram
        self.dp = deepcut
        self.stop_en = self._load_word_set(resource_dir, stop_en)
        self.stop_th = self._load_word_set(resource_dir, stop_th)
        self.keyword = self._load_word_set(resource_dir, keyword)

    def _compile_patterns(self):
        """Compile every regular expression used by the pipeline."""
        import re

        self.pattern_thai_char = re.compile('[\u0e00-\u0e7f]')
        self.pattern_new_sentence = re.compile(r'\.[0-9]+(\)|\.) ')
        self.pattern_th_out = re.compile('[\u0e00-\u0e7f][^\u0e00-\u0e7f]')
        self.pattern_th_in = re.compile('[^\u0e00-\u0e7f][\u0e00-\u0e7f]')
        self.pattern_num_bullet = re.compile(r'^[0-9]+(\)|\.)*$')
        self.pattern_end_token = re.compile('^[a-zA-Z]+$')
        self.pattern_number = re.compile(r'\+*[0-9]+')
        self.pattern_phone_number = re.compile('[0-9]+-[0-9]+-[0-9]+')
        self.pattern_email = re.compile(r'[a-zA-Z._\-0-9]+@[a-zA-Z._\-0-9]+')
        # Accept http:// as well as https://, and match literal dots; hyphens
        # are allowed in the host label so hyphenated hosts are still matched.
        self.pattern_url = re.compile(r'(https?://|www\.)[a-zA-Z0-9-]+\.[a-z]+\S*')
        # A lower-case letter immediately followed by an upper-case letter,
        # i.e. two sentences/words glued together without a space.
        self.pattern_sentence_collide = re.compile('[a-z][A-Z]')
        self.pattern_thai_name = re.compile(r'\u0e04\u0e38\u0e13\s*[\u0e00-\u0e7f]+\s+')

    @staticmethod
    def _load_charset(path):
        """Read the allowed-character table: one literal char or hex code point per line."""
        charset = {}
        # UTF-8 is used for consistency with the stop-word and keyword files.
        with open(path, 'rt', encoding='utf-8') as charfile:
            for item in charfile.read().split('\n'):
                if not item:
                    # blank / trailing line: nothing to register
                    continue
                if len(item) < 4:
                    charset[item] = ord(item)
                else:
                    charset[chr(int(item, 16))] = int(item, 16)
        return charset

    @staticmethod
    def _load_word_set(resource_dir, file_name):
        """Return the non-empty lines of ``resource_dir/file_name`` as a set (empty set if no file is given)."""
        import os

        if not file_name:
            return set()
        with open(os.path.join(resource_dir, file_name), 'rt', encoding='utf-8') as word_file:
            return {line for line in word_file.read().split('\n') if line}

    def _n_gram_compile(self, tokens, n):
        """Return the n-grams (concatenated) made only of Thai tokens longer than one character."""
        if n <= 1:
            return tokens[:]
        n_tokens = []
        for start in range(len(tokens) - n + 1):
            window = tokens[start:start + n]
            if all(self.pattern_thai_char.search(word) and len(word) > 1 for word in window):
                n_tokens.append(''.join(window))
        return n_tokens

    def _n_grams_compile(self, tokens, n):
        """Return ``tokens`` followed by all their 2..n-grams."""
        if n < 2:
            return tokens
        n_tokens = []
        for size in range(2, n + 1):
            n_tokens.extend(self._n_gram_compile(tokens, size))
        return tokens + n_tokens

    def _validate_char(self, val_text):
        """Replace characters outside the charset by spaces and collapse repeated spaces."""
        val_text = val_text.replace('&amp;', ' ').replace('&nbsp;', ' ')
        cleaned = ''.join(cha if cha in self.charset else ' ' for cha in val_text)
        while '  ' in cleaned:
            cleaned = cleaned.replace('  ', ' ')
        return cleaned

    def _split_th_en(self, splt_text):
        """Insert a space at every boundary between a Thai and a non-Thai character."""
        pieces = []
        for pos, char in enumerate(splt_text):
            pieces.append(char)
            pair = splt_text[pos:pos + 2]
            # every adjacent pair is inspected, including the last one
            if len(pair) == 2 and (self.pattern_th_in.search(pair) or self.pattern_th_out.search(pair)):
                pieces.append(' ')
        return ''.join(pieces)

    def _remove_thai_stop(self, th_text):
        """Cut Thai stop words out of ``th_text`` (longest match at each position).

        Overlapping or touching matches are merged. The text before, between
        and after the removed spans is kept and joined with single spaces.
        """
        if not self.stop_th:
            return th_text
        max_len = max(len(word) for word in self.stop_th)
        spans = []
        text_len = len(th_text)
        for start in range(text_len):
            # try the longest candidate first so the longest stop word wins
            for end in range(min(text_len, start + max_len), start, -1):
                if th_text[start:end] in self.stop_th:
                    if spans and start <= spans[-1][1]:
                        # never let a shorter inner match shrink the span
                        spans[-1][1] = max(spans[-1][1], end)
                    else:
                        spans.append([start, end])
                    break
        if not spans:
            return th_text
        kept = []
        cursor = 0
        for start, end in spans:
            kept.append(th_text[cursor:start])
            cursor = end
        kept.append(th_text[cursor:])  # text after the last stop word
        return ' '.join(kept)

    def _split_collisions(self, token):
        """Split ``token`` at every lower-case -> upper-case boundary."""
        pieces = []
        cursor = 0
        for match in self.pattern_sentence_collide.finditer(token):
            cut = match.start() + 1
            pieces.append(token[cursor:cut])
            cursor = cut
        pieces.append(token[cursor:])
        return pieces

    def tokenizer(self, text=None):
        """Tokenize ``text`` and return the distinct tokens as a set.

        Passing ``'-test'`` tokenizes the built-in sample sentence.
        ``None`` or an empty string yields an empty set.
        """
        if not text:
            return set()
        if text == '-test':
            text = self.test_text

        # --- character-level clean-up -------------------------------------
        text = text.replace('\u0e46', ' ')
        text = self.pattern_email.sub(' ', text)
        text = self.pattern_url.sub(' ', text)
        text = self.pattern_phone_number.sub(' ', text)
        text = self.pattern_thai_name.sub(' ', text)
        text = self._split_th_en(text)
        text = self.pattern_new_sentence.sub(' . ', text)
        text = text.replace('.', ' . ')
        text = self._validate_char(text)
        text = self._remove_thai_stop(text)

        # --- word-level processing ----------------------------------------
        # Collided words are split BEFORE stemming: the split relies on the
        # original letter case, which is assumed to be lost once a word has
        # been stemmed (NLTK's Snowball stemmer lowercases its input).
        words = []
        for item in text.split(' '):
            if not item:
                continue  # skip empty fragments produced by repeated spaces
            if item in self.keyword:
                words.append(item)
            else:
                words.extend(self._split_collisions(item))

        words = [item for item in words
                 if item not in self.stop_en and not self.pattern_num_bullet.search(item)]
        words = [self.stemming.stem(item)
                 if self.pattern_end_token.search(item) and item not in self.keyword
                 else item
                 for item in words]

        # --- Thai segmentation and n-grams ----------------------------------
        tokens = []
        for chunk in words:
            if self.pattern_thai_char.search(chunk) and len(chunk) > 1:
                tokens.extend(self.dp.tokenize(chunk))
            else:
                tokens.append(chunk.lower())

        tokens = self._n_grams_compile(tokens, self.n_gram)

        return set(tokens)

In [2]:
# Unigram, up-to-bigram and up-to-4-gram tokenizers, all created without
# stop-word or keyword files.
tkn1, tkn2, tkn4 = (Tokenizer(n_gram=size) for size in (1, 2, 4))
## tkn3 = Tokenizer(3, 'en_stop_word.txt', 'th_stop_word.txt')

Using TensorFlow backend.


In [3]:
## Construct Dataframe

import pandas as pd

class DataController():
    """Load a JSON-lines file into a DataFrame and build balanced training sets.

    Attributes
    ----------
    dataMatrix : pandas.DataFrame
        One row per JSON line, with at least the columns
        ``title``, ``desc`` and ``tag`` (in that order).
    """

    # Empty default kept at class level for backward compatibility;
    # every instance replaces it with its own frame in __init__.
    dataMatrix = pd.DataFrame(columns=["title","desc","tag"])

    def __init__(self, pathToFile):
        """Read ``pathToFile`` (UTF-8, one JSON object per line) into ``self.dataMatrix``.

        Blank lines are ignored. Malformed JSON raises ``json.JSONDecodeError``.
        """
        import json

        records = []
        with open(pathToFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                if not line.strip():
                    continue
                # json.loads no longer accepts an `encoding` argument
                # (removed in Python 3.9); the file is already decoded.
                records.append(json.loads(line))

        # Build the frame once instead of appending row by row, which is
        # quadratic and relies on DataFrame.append (removed in pandas 2.0).
        base_columns = ["title", "desc", "tag"]
        frame = pd.DataFrame(records)
        extra_columns = [col for col in frame.columns if col not in base_columns]
        self.dataMatrix = frame.reindex(columns=base_columns + extra_columns)

    def getTrainingSet(self, label_class, random_state=None):
        """Return a shuffled, class-balanced subset for a one-vs-rest task.

        Rows whose ``tag`` equals ``label_class`` form the target group, all
        other rows the rest group. The larger group is down-sampled to the
        size of the smaller one.

        Parameters
        ----------
        label_class
            Tag value of the target class.
        random_state : optional
            Forwarded to ``DataFrame.sample`` to make the sampling and the
            final shuffle reproducible. ``None`` keeps them random.
        """
        is_target = self.dataMatrix['tag'] == label_class
        targetSet = self.dataMatrix[is_target]
        restSet = self.dataMatrix[~is_target]

        # size of the smaller group; the larger one is sampled down to it
        group_size = min(targetSet.shape[0], restSet.shape[0])
        trainingSet = pd.concat([
            targetSet.sample(n=group_size, random_state=random_state),
            restSet.sample(n=group_size, random_state=random_state),
        ])
        # shuffle data using sample fraction = 1
        return trainingSet.sample(frac=1, random_state=random_state)

In [None]:
## Create data
import os
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

file_name = "block1234.json"
file_path = os.path.join(os.getcwd(), "..", "data", file_name)

data = DataController(file_path)

## Create training data (balanced: tag "0" versus everything else)
trainingData = data.getTrainingSet("0")

training_Desc = trainingData['desc']
training_Title = trainingData['title']
training_Label = trainingData['tag']

## vectorize data
## Each vectorizer is fitted on the column it is named after.

# desc_vectorizer = vectorizer.vectorize_desc
desc_vectorizer = TfidfVectorizer(tokenizer=tkn2.tokenizer, max_df=1.0, min_df=1)
desc_vec = desc_vectorizer.fit_transform(training_Desc)

# title_vectorizer = vectorizer.vectorize_title
title_vectorizer = TfidfVectorizer(tokenizer=tkn4.tokenizer, max_df=1.0, min_df=1)
title_vec = title_vectorizer.fit_transform(training_Title)

## stack title features and desc features side by side (title columns first)
data_vec = hstack([title_vec, desc_vec])

## create label_vec
label_vec = training_Label

In [None]:
## Train using Multinomial NaiveBayes
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## see crossvalidation score
NBclf = MultinomialNB()
scores = cross_val_score(NBclf, data_vec, label_vec, cv=3, scoring='f1_macro')
print('Cross validation score: ', scores)

## split sample into train_set and test_set
## NOTE: desc_train / desc_test hold the stacked title+desc features, not
## only the description features. The split is seeded so the hold-out
## report below is reproducible for a given data_vec.
desc_train, desc_test, label_train, label_test = train_test_split(
    data_vec, label_vec, test_size=0.3, random_state=42)

## In sample accuracy (fit and evaluate on the same full data set)
in_NBclf = MultinomialNB()
in_NBclf = in_NBclf.fit(data_vec, label_vec)
label_predict = in_NBclf.predict(data_vec)
print(classification_report(label_vec, label_predict))

## Test set accuracy (fit on the 70% split, evaluate on the held-out 30%)
NBclf = NBclf.fit(desc_train, label_train)
label_predict = NBclf.predict(desc_test)
print(classification_report(label_test, label_predict))