In [1]:
import pickle
import sys
import warnings
warnings.filterwarnings('ignore')

if '..' not in sys.path:
    sys.path.append('..')
    
from main_fit_vectorizer import TFIDF_Vectorizer

In [2]:
with open('../data/vectorizer_v02.pck', 'rb') as pf:
    vectorizer = pickle.load(pf)

In [3]:
## Construct Dataframe

class DataController():
    import sys
    if '..' not in sys.path:
        sys.path.append('..')
    
    ## init will create dataMatrix
    def __init__(self, pathToFile):
        import os
        import json
        import pandas as pd
        from multiprocessing import Pool

        loaded_data = []
        with open(pathToFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                ## for each line, add into dataMatrix, using ["title", "desc", "tag"] structure
                loaded_data.append(json.loads(line, encoding='utf-8'))
                #self.dataMatrix = self.dataMatrix.append(line_dict, ignore_index=True)
                #count+=1
                #if(count==100): break
        
        docs_tokens = []
        with Pool(30) as pool:
            pool_result = pool.imap(self.wrapper_tokenize, loaded_data, chunksize=30)
            for item in pool_result:
                docs_tokens.append(item)
        
        self.dataMatrix = pd.DataFrame(docs_tokens, columns=["title","desc","tag"])
    
    def getTrainingSet(self, label_class):
        import pandas as pd
        ## classSet is set of data that has tag = label_class
        targetSet = self.dataMatrix[self.dataMatrix['tag']==label_class]
        restSet = self.dataMatrix[self.dataMatrix['tag']!=label_class]

        if(targetSet.shape[0] < restSet.shape[0]):
            # target has less population than the rest
            trainingSet = pd.concat([targetSet, restSet.sample(n=targetSet.shape[0])])
        else:
            # target has more population than the rest
            trainingSet = pd.concat([targetSet.sample(n=restSet.shape[0]), restSet])
        # shuffle data using sample fraction = 1
        trainingSet = trainingSet.sample(frac=1)
        return trainingSet

    def wrapper_tokenize(self, doc_dict):
        from src import tokenizer
        import tltk
        def tltk_tokenize(text):
            ret = tltk.segment(text).replace('<u/>', '').replace('<s/>', '').split('|')
            return ret
        cleaner = tokenizer.cleanerFactory("../Resource/charset")
        title = tokenizer.tokenize(doc_dict['title'], tltk_tokenize, 5, cleaner)
        desc = tokenizer.tokenize(doc_dict['desc'], tltk_tokenize, 5, cleaner)
        tag = doc_dict['tag']
        return [title, desc, tag]

In [4]:
## Create data
import os
from sklearn.feature_extraction.text import TfidfVectorizer

file_name = "block1234.json"
file_path = os.getcwd()+"/../data/"+file_name

data = DataController(file_path)

## Create training data
trainingData = data.getTrainingSet("0")

training_Desc = trainingData['desc']
training_Title = trainingData['title']
training_Label = trainingData['tag']

In [11]:
## vectorize data

desc_vectorizer = vectorizer.vectorize_desc
# desc_vectorizer = TfidfVectorizer(tokenizer=tkn2.tokenizer, max_df=1.0, min_df=1)
desc_vec = desc_vectorizer.transform(training_Title)

title_vectorizer = vectorizer.vectorize_title
# title_vectorizer = TfidfVectorizer(tokenizer=tkn4.tokenizer, max_df=1.0, min_df=1)
title_vec = title_vectorizer.transform(training_Desc)

## stack title onto desc
from scipy.sparse import hstack
data_vec = hstack([title_vec, desc_vec])

## create label_vec
label_vec = training_Label

In [12]:
def predict_cutoff(datavec, cutoff, predict_proba):
    result = predict_proba(datavec)
    answer = ['1' if item[1] > cutoff else '0' for item in result]
    return answer

In [7]:
#from sklearn.naive_bayes import MultinomialNB
#class MultiNB(MultinomialNB):
#    def __init__(self, cutoff):
#        super(MultiNB, self).__init__()
#        self.cutoff = cutoff
#    def predict(self, datavec):
#        result = self.predict_proba(datavec)
#        answer = ['1' if item[1] > self.cutoff else '0' for item in result]
#        return answer

In [13]:
from sklearn.model_selection import cross_val_score
from src.multinomialNB import MultiNB
NBclfProb = MultiNB({'0':0.7,'1':0.3})
scores = cross_val_score(NBclfProb, data_vec, label_vec, cv=6, scoring='f1_macro')
print('Cross validation score F1: ', scores)
print('Average:                   ', scores.mean())

Cross validation score F1:  [0.81463415 0.8157753  0.85031281 0.83270256 0.86816745 0.79423277]
Average:                    0.8293041719251583


In [14]:
## Train using Multinomial NaiveBayes 
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## see crossvalidation score
NBclf = MultinomialNB()
scores = cross_val_score(NBclf, data_vec, label_vec, cv=3, scoring='f1_macro')
print('Cross validation score: ', scores)

## split sample into train_set and test_set
desc_train, desc_test, label_train, label_test = train_test_split(data_vec, label_vec, test_size=0.3)

## In sample accuracy
in_NBclf = MultinomialNB()
in_NBclf = in_NBclf.fit(data_vec, label_vec)
label_predict = predict_cutoff(data_vec, 0.5, in_NBclf.predict_proba)
print(classification_report(label_vec, label_predict))

## Test set accuracy
NBclf = NBclf.fit(desc_train, label_train)
label_predict = predict_cutoff(desc_test, 0.3, NBclf.predict_proba)
print(classification_report(label_test, label_predict))

Cross validation score:  [0.81578947 0.83764122 0.84951038]
             precision    recall  f1-score   support

          0       0.87      0.87      0.87       341
          1       0.87      0.87      0.87       341

avg / total       0.87      0.87      0.87       682

             precision    recall  f1-score   support

          0       0.89      0.74      0.81       108
          1       0.76      0.90      0.82        97

avg / total       0.83      0.81      0.81       205



In [10]:
# fit classifier for later use
MNBclf = MultiNB({'0':0.7,'1':0.3})
MNBclf = MNBclf.fit(data_vec, label_vec)
with open('../Resource/MultiNB_STEMvsNONSTEM_0030vs0070.pck', 'wb') as f:
    pickle.dump(MNBclf, f)

In [15]:
label_predict = MNBclf.predict(desc_test)
print(classification_report(label_test, label_predict))

             precision    recall  f1-score   support

          0       0.92      0.80      0.86       108
          1       0.80      0.93      0.86        97

avg / total       0.87      0.86      0.86       205



In [19]:
data_vec.shape

(682, 4259)