In [None]:
## clean text
## Reads every JSON line of `file_name`, cleans its 'title' and 'desc' fields
## with CleanText, and writes the result as JSON lines to `new_name`.
file_name = 'data20180620.json'
new_name = 'data20180620_cleaned.json'

import sys
if not '..' in sys.path:
    sys.path.append('..')
import os
import json
from clean_text import CleanText
Clean_Text = CleanText('en_stop_word.txt', 'th_stop_word.txt')

data_dir = os.path.join(os.getcwd(), '..', 'data')
source_path = os.path.join(data_dir, file_name)
cleaned_path = os.path.join(data_dir, new_name)

# The output is opened once and in write mode: reopening it in append mode for
# every record was slow, and re-running the cell appended a second copy of
# every record to the cleaned file. The source is opened first, so a missing
# input raises before the previous output is truncated.
with open(source_path, 'rt', encoding='utf-8') as f, \
        open(cleaned_path, 'wt', encoding='utf-8') as cleaned:
    # Stream the input line by line instead of loading it all with readlines().
    for line in f:
        if not line.strip():
            # A blank line (e.g. a trailing newline) is not a JSON document.
            continue
        job_ad = json.loads(line)
        job_ad['title'] = Clean_Text.clean_text(job_ad['title'])
        job_ad['desc'] = Clean_Text.clean_text(job_ad['desc'])
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
        cleaned.write(json.dumps(job_ad, ensure_ascii=False) + '\n')

In [None]:
## Construct Dataframe

import pandas as pd

class DataController():
    """Load a JSON-lines file into a DataFrame and derive balanced training sets.

    Each non-blank line of the input file is parsed as one JSON object and
    becomes one row of ``dataMatrix``. The columns "title", "desc" and "tag"
    always come first; any other keys found in the file follow them.
    """

    # Class-level default: an empty frame with the expected columns. Every
    # instance replaces it with its own frame in __init__.
    dataMatrix = pd.DataFrame(columns=["title","desc","tag"])

    ## init will create dataMatrix
    def __init__(self, pathToFile, max_rows=None):
        """Read ``pathToFile`` and build ``self.dataMatrix``.

        Args:
            pathToFile: Path to a UTF-8 file holding one JSON object per line.
            max_rows: Optional cap on the number of records to load. ``None``
                (the default) loads the whole file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        import json

        print('Begin loading')
        records = []
        with open(pathToFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                if max_rows is not None and len(records) >= max_rows:
                    break
                if not line.strip():
                    # Blank lines carry no record; json.loads would raise on them.
                    continue
                # json.loads no longer accepts an ``encoding`` keyword; the file
                # is already decoded as UTF-8 by open().
                records.append(json.loads(line))

        # Build the frame once from all records rather than appending row by
        # row (DataFrame.append is gone from pandas 2.x and was quadratic).
        loaded = pd.DataFrame(records)
        base_columns = list(type(self).dataMatrix.columns)
        extra_columns = [col for col in loaded.columns if col not in base_columns]
        # reindex guarantees the base columns exist (filled with NaN when a key
        # is absent from the file) and keeps them in front.
        self.dataMatrix = loaded.reindex(columns=base_columns + extra_columns)

    def getTrainingSet(self, label_class, random_state=None):
        """Return a shuffled, class-balanced subset of ``dataMatrix``.

        Rows whose 'tag' equals ``label_class`` form the target group and all
        other rows form the rest group. The larger group is randomly
        down-sampled to the size of the smaller one, then both are combined
        and shuffled. If either group is empty, the result is empty.

        Args:
            label_class: Value of the 'tag' column that identifies the target group.
            random_state: Optional seed passed to every ``sample`` call so the
                result is repeatable. ``None`` keeps the sampling unseeded.

        Returns:
            A DataFrame that keeps the original row index labels.
        """
        ## targetSet is set of data that has tag = label_class
        targetSet = self.dataMatrix[self.dataMatrix['tag']==label_class]
        restSet = self.dataMatrix[self.dataMatrix['tag']!=label_class]

        if(targetSet.shape[0] < restSet.shape[0]):
            # target has less population than the rest
            restSet = restSet.sample(n=targetSet.shape[0], random_state=random_state)
        else:
            # target has at least as much population as the rest
            targetSet = targetSet.sample(n=restSet.shape[0], random_state=random_state)
        trainingSet = pd.concat([targetSet, restSet])
        # shuffle data using sample fraction = 1
        return trainingSet.sample(frac=1, random_state=random_state)

    def getData(self):
        """Return the full DataFrame loaded by ``__init__``."""
        return self.dataMatrix

In [None]:
## Create data
import os

# Alternative input file: 'data20180620.json'
file_name = "masterDB_JPA Data - 20180406_flatten.json"
file_path = "".join([os.getcwd(), "/../data/", file_name])

## Load the file and keep the two text columns that the vectorizers are fitted on
data = DataController(file_path).getData()
vec_Title = data['title']
vec_Desc = data['desc']

In [None]:
## create tokenizer
import sys
if '..' not in sys.path:
    # Make the parent directory importable so the local `tokenizer` module resolves.
    sys.path.append('..')
from tokenizer import Tokenizer
import deepcut as dp


def tokenizer(text):
    """Tokenize ``text`` by delegating to ``deepcut.tokenize``."""
    tokens = dp.tokenize(text)
    return tokens


# Only the Tokenizer(2, ...) and Tokenizer(4, ...) variants are active;
# the 1 and 3 variants below are kept disabled.
#tkn1 = Tokenizer(1, dp.deepcut)
#tkn3 = Tokenizer(3, dp.deepcut)
tkn2 = Tokenizer(2, tokenizer)
tkn4 = Tokenizer(4, tokenizer)

## open vocab file
#import os
#with open(os.path.abspath(os.path.join(os.getcwd(), '..', 'dict', 'desc_newdict_90p.txt'))  , 'rt', encoding='utf-8') as f_tv:
#    desc_vocab = f_tv.read().split('\n')
#with open(os.path.abspath(os.path.join(os.getcwd(), '..', 'dict', 'title_newdict_90p.txt'))  , 'rt', encoding='utf-8') as f_tv:
#    title_vocab = f_tv.read().split('\n')

## create tfidf term-doc matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Each vectorizer is fitted only (no transform here): descriptions use tkn2,
# titles use tkn4.
# NOTE(review): scikit-learn treats a float min_df as a proportion of documents,
# so 0.85 would keep only terms found in at least 85% of them - confirm intent.
desc_vectorizer = TfidfVectorizer(min_df=0.85, tokenizer=tkn2.tokenizer).fit(vec_Desc)
title_vectorizer = TfidfVectorizer(min_df=0.85, tokenizer=tkn4.tokenizer).fit(vec_Title)

In [None]:
## Create train data
## Loads the labelled file, balances it around tag "0", and vectorizes the
## title/description text with the vectorizers fitted earlier.
import os
from scipy.sparse import hstack

# NOTE(review): this reads the raw 'data20180620.json', not the
# 'data20180620_cleaned.json' written by the cleaning cell - confirm which is intended.
file_name = "data20180620.json"
file_path = os.getcwd()+"/../data/"+file_name

train_data = DataController(file_path)

## Create training data
trainingData = train_data.getTrainingSet("0")

training_Desc = trainingData['desc']
training_Title = trainingData['title']
training_Label = trainingData['tag']

# Each vectorizer must transform the column it was fitted on. The inputs were
# previously swapped (descriptions went through the title vectorizer and
# vice versa), which produced features from mismatched vocabularies.
desc_vec = desc_vectorizer.transform(training_Desc)
title_vec = title_vectorizer.transform(training_Title)

# Stack side by side: description features first, then title features.
data_vec = hstack([desc_vec, title_vec])
label_vec = training_Label
data_vec

In [None]:
# Re-display the stacked feature matrix `data_vec` (bare expression; nothing is computed here).
data_vec

In [None]:
## Train using Multinomial NaiveBayes 
## Cross-validates a MultinomialNB on the stacked features, then fits it on a
## 70/30 split and prints classification reports.

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Fixed seed so the hold-out split is the same on every run for a given
# data_vec/label_vec. Without it the reports below and the rows exported by
# the misclassification cell change each time.
SPLIT_SEED = 42

## see crossvalidation score
cv = 3
mclf = MultinomialNB(alpha=1)
scores = cross_val_score(mclf, data_vec, label_vec, cv=cv, scoring='precision_macro')
print('========== cv=' + str(cv) + ' cross-validation scores ==========')
print(scores)
print('================================================================')

# desc_train/desc_test hold rows of data_vec, i.e. the stacked description and
# title features, not description features alone.
desc_train, desc_test, label_train, label_test = train_test_split(
    data_vec, label_vec, test_size=0.3, random_state=SPLIT_SEED)
mclf = mclf.fit(desc_train, label_train)
# label_predict covers every row of data_vec: the 70% used for fitting plus
# the 30% held out, so the first report is not a pure training-fit measure.
label_predict = mclf.predict(data_vec)
test_predict = mclf.predict(desc_test)

print('===== Classification report from the whole TRAINING SET =====')
print(classification_report(label_vec, label_predict))
print('=============================================================')
print('======= Classification report from the whole TEST set =======')
print(classification_report(label_test, test_predict))
print('=============================================================')

In [None]:
## Collect the misclassified test rows and export them for manual inspection
miss_mask = test_predict != label_test
miss_index = label_test[miss_mask].index.values
label_predict_prob = mclf.predict_proba(desc_test)
miss_prob = label_predict_prob[miss_mask]

# prob_0 / prob_1 are the first two columns of predict_proba.
# NOTE(review): presumably these line up with mclf.classes_[0] and [1] - confirm.
frame = pd.DataFrame(
    {
        'Label': list(label_test[miss_mask]),
        'prob_0': list(miss_prob[:, 0]),
        'prob_1': list(miss_prob[:, 1]),
        # Look the original text up by the row labels of the misclassified samples.
        'title': [training_Title[row] for row in miss_index],
        'desc': [training_Desc[row] for row in miss_index],
    },
    index=miss_index,
)
frame.to_csv('classification_error.csv', encoding='utf-8', sep='\t')
frame