In [1]:
class Classifier():
    
    import sys
    if '..' not in sys.path:
        sys.path.append('..')
    
    def __init__(self, vectorizer, classifier):
        
        self._load_classifier(classifier)
        self._load_vectorizer(vectorizer)
        
    def _load_classifier(self, classifier):
        # load pre-fitted classifier
        
        from src.multinomialNB import MultiNB
        import pickle
        
        with open(classifier, 'rb') as pf:
            self.classifier = pickle.load(pf)
    
    def _load_vectorizer(self, vectorizer):
        # load pre-fitted vectorizer
        
        #from src.Vectorizer import TFIDF_Vectorizer
        #import pickle
        #
        #with open(vectorizer, 'rb') as pf:
        #    self.vectorizer = pickle.load(pf)
        self.vectorizer = vectorizer
    
class DataController():

    import sys
    if '..' not in sys.path:
        sys.path.append('..')
    
    ## init will create dataMatrix
    def __init__(self, pathToFile, num_pool=30, chunk=50):
        import os
        import json
        import pandas as pd
        from multiprocessing import Pool
        from tqdm import tqdm

        print('loading data')
        loaded_data = []
        with open(pathToFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                loaded_data.append(json.loads(line, encoding='utf-8'))
        
        docs_tokens = []
        print('pre-processing')
        pbar = tqdm(total=int(len(loaded_data)/chunk) + (len(loaded_data) % chunk > 0))
        with Pool(num_pool) as pool:
            pool_result = pool.imap(self._preprocess, loaded_data, chunk)
            for item in pool_result:
                docs_tokens.append(item)
                pbar.update()
        pbar.close()
        
        self.dataSet = pd.DataFrame(docs_tokens, columns=['job_id', 'title','desc','tag'])
        
    def sample(self, size=1.0):
        
        if type(size) is float:
            size = int(self.dataSet.shape[0]*size)
        ret_sample = self.dataSet.sample(n=size)
        self.dataSet = self.dataSet.drop(ret_sample.index)
        return ret_sample
    
    def _preprocess(self, doc_dict):
        from src import tokenizer
        import tltk
        import hashlib
        import time
        def tltk_tokenize(text):
            ret = tltk.segment(text).replace('<u/>', '').replace('<s/>', '').split('|')
            return ret
        cleaner = tokenizer.cleanerFactory("../Resource/charset")
        title = tokenizer.tokenize(doc_dict['title'], tltk_tokenize, 5, cleaner)
        desc = tokenizer.tokenize(doc_dict['desc'], tltk_tokenize, 5, cleaner)
        tag = doc_dict['tag']
        in_str = str(time.time()) + title + desc
        job_id = hashlib.md5(bytes(in_str, 'utf-8')).hexdigest()
        return [job_id, title, desc, tag]
    
    def add_data(self, data):
        self.dataSet.add(data)
        
    def predict(self, classifier, vectorizer, sample=1.0):
        
        from scipy.sparse import hstack
        import pandas
        
        prediction = training_sample
        training_sample = self.sample(sample)
        training_Desc = training_sample['desc']
        training_Title = training_sample['title']
        training_Label = training_sample['tag']
        
        #define vectorizer for title and description
        desc_vectorizer = vectorizer.vectorize_desc
        title_vectorizer = vectorizer.vectorize_title
        #transform-vectorize
        desc_vec = vectorizer.transform(training_Title)
        title_vec = title_vectorizer.transform(training_Desc)
        #stack title onto desc
        data_vec = hstack([title_vec, desc_vec])
        #predict class
        label_predict = classifier.predict(data_vec)
        #add prediction to pandas dataframe
        prediction['predict'] = pandas.Series(label_predict, index=prediction.index)
        
        return prediction

In [3]:
#no warning
import warnings
warnings.filterwarnings('ignore')
#init classifier
#beg-addhoc-fix
import sys
if '..' not in sys.path:
    sys.path.append('..')
from main_fit_vectorizer import TFIDF_Vectorizer
with open('../Resource/vectorizer_v02.pck', 'rb') as pf:
    vectorizer = pickle.load(pf)
cClassifier = Classifier(vectorizer, '../Resource/MultiNB_STEMvsNONSTEM_0030vs0070.pck')
#end-addhoc-fix
#DataSet = DataController('../data/masterDB_JPA_Data_-_20180406_flatten.json', 31, 100)
DataSet = DataController('../data/block1234.json', 31, 30)
predicted = DataSet.predict(cClassifier.classifier, cClassifier.vectorizer)

NameError: name 'pickle' is not defined