In [36]:
## do tokenize
## return tokenized list

class Tokenizer:

    def __init__(self, n_gram, stop=None, keyword=None):

        import re
        import deepcut
        import os
        from nltk.tokenize import TreebankWordTokenizer
        from nltk.stem.snowball import EnglishStemmer

        self.test_text = 'This is a test text. นี่เป็นตัวอย่าง ข้อความtesting ใน Python 3.6. ทดสอบtest.2) ทดสอบ3. test.4. '
        self.thai_pattern = re.compile(u'[\u0e00-\u0e7f]')
        self.new_sentence = re.compile('\.[0-9]+(\)|\.) ')
        self.pattern_th_out = re.compile(u'[\u0e00-\u0e7f][^\u0e00-\u0e7f]')
        self.pattern_th_in = re.compile(u'[^\u0e00-\u0e7f][\u0e00-\u0e7f]')
        self.num_bullet = re.compile('[0-9]+(\)|\.)*')
        self.end_token = re.compile('^[a-zA-Z]+$')
        self.charset = {}
        with open(os.path.join(os.getcwd(), '..', 'dict', 'charset'), 'rt') as charfile:
            for item in charfile.read().split('\n'):
                if len(item) < 4:
                    self.charset[item] = ord(item)
                else:
                    self.charset[chr(int(item, 16))] = int(item, 16)
        self.eng_tokenizer = TreebankWordTokenizer()
        self.stemming = EnglishStemmer()
        self.n_gram = n_gram
        self.dp = deepcut
        if stop:
            with open(os.path.join(os.getcwd(), '..', 'dict', stop), 'rt', encoding='utf-8') as stop_file:
                self.stop = set([item for item in stop_file.read().split('\n')])
        else:
            self.stop = set([])
        if keyword:
            with open(os.path.join(os.getcwd(), '..', 'dict', keyword), 'rt', encoding='utf-8') as keyword_file:
                self.keyword = set([item for item in keyword_file.read().split('\n')])
        else:
            self.keyword = set([])

    def tokenizer(self, text=None):

        def n_gram_compile(tokens, n):

            tokens = tokens[:]
            n_tokens = []
            if n <= 1:
                return tokens
            for j, token in enumerate(tokens[:-(n - 1)]):
                new_token = ''
                for word in tokens[j:j + n]:
                    if self.thai_pattern.search(word) and len(word) > 1:
                        new_token += word
                    else:
                        new_token = ''
                        break
                if new_token:
                    n_tokens.extend([new_token])
            return n_tokens

        def n_grams_compile(tokens, n):

            if n < 2:
                return tokens
            n_tokens = []
            for j in range(2, n + 1):
                n_tokens.extend(n_gram_compile(tokens, j))
            n_tokens = tokens + n_tokens
            return n_tokens

        def validate_char(val_text):
            val_text = val_text.replace('&amp;', ' ')
            ret_text = ''
            for cha in val_text:
                try:
                    self.charset[cha]
                except KeyError:
                    ret_text += ' '
                else:
                    ret_text += cha
            while ret_text.find('  ') != -1:
                ret_text = ret_text.replace('  ', ' ')
            return ret_text

        def split_th_en(splt_text):
            insert_pos = []
            splt_text = splt_text[:]
            for pos, item in enumerate(splt_text[:-2]):
                if self.pattern_th_in.search(splt_text[pos:pos+2]) or self.pattern_th_out.search(splt_text[pos:pos+2]):
                    insert_pos.append(pos + 1)
            for pos in reversed(insert_pos):
                splt_text = splt_text[:pos] + ' ' + splt_text[pos:]
            return splt_text

        if text == '-test':
            text = self.test_text

        text = split_th_en(text)
        text = self.new_sentence.sub(' . ', text)
        text = text.replace('. ', ' . ')
        text = validate_char(text)
        first_pass = text.split(' ')
        first_pass = [item for item in first_pass[:] if item not in self.stop and not self.num_bullet.search(item)]
        first_pass = [self.stemming.stem(item) if self.end_token.search(item) and
                      item not in self.keyword else item for item in first_pass[:]]
        second_pass = []
        for i, chunk in enumerate(first_pass):
            if self.thai_pattern.search(chunk) and len(chunk) > 1:
                new_chunk = self.dp.tokenize(chunk)
                second_pass.extend(new_chunk)
            else:
                second_pass.append(chunk.lower())

        second_pass = n_grams_compile(second_pass, self.n_gram)

        token_list = list(set(second_pass))

        return token_list

In [37]:
## Construct Dataframe

import pandas as pd

class DataController():
    dataMatrix = pd.DataFrame(columns=["title","desc","tag"])
    
    ## init will create dataMatrix
    def __init__(self, pathToFile):
        import os
        import json
        count = 0
        
        with open(pathToFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                ## for each line, add into dataMatrix, using ["title", "desc", "tag"] structure
                line_dict = json.loads(line, encoding='utf-8')
                self.dataMatrix = self.dataMatrix.append(line_dict, ignore_index=True)
                #count+=1
                #if(count==100): break
    
    def getTrainingSet(self, label_class):
        ## classSet is set of data that has tag = label_class
        targetSet = self.dataMatrix[self.dataMatrix['tag']==label_class]
        restSet = self.dataMatrix[self.dataMatrix['tag']!=label_class]

        if(targetSet.shape[0] < restSet.shape[0]):
            # target has less population than the rest
            trainingSet = pd.concat([targetSet, restSet.sample(n=targetSet.shape[0])])
        else:
            # target has more population than the rest
            trainingSet = pd.concat([targetSet.sample(n=restSet.shape[0]), restSet])
        # shuffle data using sample fraction = 1
        trainingSet = trainingSet.sample(frac=1)
        return trainingSet

In [38]:
## Create data
import os

file_name = "block123.json"
file_path = os.getcwd()+"/../data/"+file_name

data = DataController(file_path)

In [39]:
## Create training data
trainingData = data.getTrainingSet("0")

training_Desc = trainingData['desc'] 
training_Title = trainingData['title']
training_Label = trainingData['tag']

In [40]:
data.dataMatrix[(data.dataMatrix['tag'] == "1")].shape

(272, 3)

In [41]:
## create tokenizer
tkn1 = Tokenizer(1)
tkn2 = Tokenizer(2)
tkn3 = Tokenizer(3)
tkn4 = Tokenizer(4)

## open vocab file
#import os
#with open(os.path.abspath(os.path.join(os.getcwd(), '..', 'dict', 'desc_newdict_90p.txt'))  , 'rt', encoding='utf-8') as f_tv:
#    desc_vocab = f_tv.read().split('\n')
#with open(os.path.abspath(os.path.join(os.getcwd(), '..', 'dict', 'title_newdict_90p.txt'))  , 'rt', encoding='utf-8') as f_tv:
#    title_vocab = f_tv.read().split('\n')

## create tfidf term-doc matrix
from sklearn.feature_extraction.text import TfidfVectorizer

desc_vectorizer = TfidfVectorizer(tokenizer=tkn2.tokenizer)
desc_vec = desc_vectorizer.fit_transform(training_Title)

title_vectorizer = TfidfVectorizer(tokenizer=tkn4.tokenizer)
title_vec = title_vectorizer.fit_transform(training_Desc)

## stack title onto desc
from scipy.sparse import hstack
data_vec = hstack([title_vec, desc_vec])

## create label_vec
label_vec = training_Label

In [42]:
## Train using Bernoulli NaiveBayes 
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

## see crossvalidation score
bclf = BernoulliNB()
scores = cross_val_score(bclf, data_vec, label_vec, cv=3, scoring='f1_macro')
print(scores)

## see train_test_split
from sklearn.model_selection import train_test_split
desc_train, desc_test, label_train, label_test = train_test_split(data_vec, label_vec, test_size=0.3)
bclf = bclf.fit(desc_train, label_train)
label_predict = bclf.predict(desc_test)


from sklearn.metrics import classification_report
print(classification_report(label_test, label_predict))
in_bclf = BernoulliNB()
in_bclf = in_bclf.fit(data_vec, label_vec)
label_predict = bclf.predict(data_vec)

from sklearn.metrics import classification_report
print(classification_report(label_vec, label_predict))

[0.71786376 0.73594691 0.71275697]
             precision    recall  f1-score   support

          0       0.76      0.59      0.66        80
          1       0.68      0.82      0.74        84

avg / total       0.72      0.71      0.70       164

             precision    recall  f1-score   support

          0       0.92      0.72      0.81       272
          1       0.77      0.94      0.85       272

avg / total       0.85      0.83      0.83       544

