In [1]:
## do tokenize
## return tokenized list

class Tokenizer:
    def __init__(self, n_gram, en_stop=None, th_stop=None):
        import re
        import deepcut
        from nltk.tokenize import TreebankWordTokenizer
        self.pattern = re.compile(u'[\u0e01-\u0e2e]')
        self.eng_tokenizer = TreebankWordTokenizer()
        self.n_gram = n_gram
        self.dp = deepcut
        if en_stop:
            with open('\\dict\\' + en_stop, 'rt', encoding='utf-8') as stop_file:
                self.en_stop = set([item for item in stop_file.read().split('\n')])
        else:
            self.en_stop = set([])
            
    def tokenizer(self, text=None):
        def n_gram_compile(tokens, n):
            tokens = tokens[:]
            n_tokens = []
            if n <= 1:
                return tokens
            for j, token in enumerate(tokens[:-(n - 1)]):
                new_token = ''
                for word in tokens[j:j + n]:
                    if self.pattern.search(word) and len(word) > 1:
                        new_token += word
                    else:
                        new_token = ''
                        break
                if new_token:
                    n_tokens.extend([new_token])
            return n_tokens
        
        def n_grams_compile(tokens, n):
            if n < 2:
                return tokens
            n_tokens = []
            for j in range(2, n + 1):
                n_tokens.extend(n_gram_compile(tokens, j))
            n_tokens = tokens + n_tokens
            return n_tokens
        
        in_text = text.replace('.', ' . ').replace(u'\xa0', ' ').replace('  ', ' ')
        first_pass = self.eng_tokenizer.tokenize(in_text)
        first_pass = [item for item in first_pass[:] if item not in self.en_stop]
        second_pass = []
        for i, chunk in enumerate(first_pass):
            if self.pattern.search(chunk) and len(chunk) > 1:
                new_chunk = self.dp.tokenize(chunk)
                second_pass.extend(new_chunk)
            else:
                second_pass.append(chunk.lower())
        second_pass = n_grams_compile(second_pass, self.n_gram)
        return second_pass

In [109]:
## Construct Dataframe

import pandas as pd

class DataController():
    dataMatrix = pd.DataFrame(columns=["title","desc","tag"])
    
    ## init will create dataMatrix
    def __init__(self, pathToFile):
        import os
        import json
        count = 0
        
        with open(pathToFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                ## for each line, add into dataMatrix, using ["title", "desc", "tag"] structure
                line_dict = json.loads(line, encoding='utf-8')
                self.dataMatrix = self.dataMatrix.append(line_dict, ignore_index=True)
                #count+=1
                #if(count==100): break
    
    def getTrainingSet(self, label_class):
        ## classSet is set of data that has tag = label_class
        targetSet = self.dataMatrix[self.dataMatrix['tag']==label_class]
        restSet = self.dataMatrix[self.dataMatrix['tag']!=label_class]

        if(targetSet.shape[0] < restSet.shape[0]):
            # target has less population than the rest
            trainingSet = pd.concat([targetSet, restSet.sample(n=targetSet.shape[0])])
        else:
            # target has more population than the rest
            trainingSet = pd.concat([targetSet.sample(n=restSet.shape[0]), restSet])
        # shuffle data using sample fraction = 1
        trainingSet = trainingSet.sample(frac=1)
        return trainingSet

In [110]:
## Create data
import os

file_name = "data1_tonnytag.json"
file_path = os.getcwd()+"/../data/"+file_name

data = DataController(file_path)

In [127]:
## Create training data
trainingData = data.getTrainingSet("1")

training_Desc = trainingData['desc'] 
training_Title = trainingData['title']
training_Label = trainingData['tag']

In [139]:
data.dataMatrix[(data.dataMatrix['tag'] == "1")].shape

(149, 3)

In [129]:
## create tokenizer
tkn1 = Tokenizer(1)
tkn2 = Tokenizer(2)
tkn3 = Tokenizer(3)
tkn4 = Tokenizer(4)

## open vocab file
import os
with open(os.path.abspath(os.path.join(os.getcwd(), '..', 'dict', 'desc_newdict_90p.txt'))  , 'rt', encoding='utf-8') as f_tv:
    desc_vocab = f_tv.read().split('\n')
with open(os.path.abspath(os.path.join(os.getcwd(), '..', 'dict', 'title_newdict_90p.txt'))  , 'rt', encoding='utf-8') as f_tv:
    title_vocab = f_tv.read().split('\n')

## create tfidf term-doc matrix
from sklearn.feature_extraction.text import TfidfVectorizer

desc_vectorizer = TfidfVectorizer(tokenizer=tkn2.tokenizer, vocabulary=desc_vocab)
desc_vec = desc_vectorizer.fit_transform(training_Title)

title_vectorizer = TfidfVectorizer(tokenizer=tkn4.tokenizer, vocabulary=title_vocab)
title_vec = title_vectorizer.fit_transform(training_Desc)

## stack title onto desc
from scipy.sparse import hstack
data_vec = hstack([title_vec, desc_vec])

## create label_vec
label_vec = training_Label

In [130]:
## Train using Bernoulli NaiveBayes 
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

## see crossvalidation score
bclf = BernoulliNB()
scores = cross_val_score(bclf, data_vec, label_vec, cv=3, scoring='f1_macro')
print(scores)

## see train_test_split
from sklearn.model_selection import train_test_split
desc_train, desc_test, label_train, label_test = train_test_split(data_vec, label_vec, test_size=0.3)
bclf = bclf.fit(desc_train, label_train)
label_predict = bclf.predict(desc_test)


from sklearn.metrics import classification_report
print(classification_report(label_test, label_predict))
in_bclf = BernoulliNB()
in_bclf = in_bclf.fit(data_vec, label_vec)
label_predict = bclf.predict(data_vec)

from sklearn.metrics import classification_report
print(classification_report(label_vec, label_predict))

[0.66372425 0.71440228 0.65614035]
             precision    recall  f1-score   support

          0       1.00      0.04      0.08        49
          1       0.47      1.00      0.64        41

avg / total       0.76      0.48      0.33        90

             precision    recall  f1-score   support

          0       1.00      0.05      0.09       149
          1       0.51      1.00      0.68       149

avg / total       0.76      0.52      0.38       298

