In [1]:
import pickle
import sys
import warnings
# Suppress every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Add the parent directory to the import search path once.
# NOTE(review): presumably this is where `src` and `main_fit_vectorizer` live — confirm.
if '..' not in sys.path:
    sys.path.append('..')


In [2]:
# NOTE(review): TFIDF_Vectorizer is never referenced by name in this notebook;
# presumably the import exists so pickle can resolve the class while loading — confirm.
from main_fit_vectorizer import TFIDF_Vectorizer
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted source.
with open('../Resource/vectorizer_v03.pck', 'rb') as pf:
    # The loaded object's .vectorize_desc / .vectorize_title attributes are used further down.
    vectorizer = pickle.load(pf)

In [3]:
## Construct Dataframe

class DataController():
    """Hold labelled documents as a DataFrame and draw balanced training samples.

    The frame is exposed as ``self.dataMatrix``. It is either built from a
    JSON-lines file (one object per line with "title", "desc" and "tag" keys,
    tokenized in a process pool) or taken unchanged from the constructor
    argument.
    """

    # Kept from the original class body: defining the class makes the parent
    # directory importable, which `from src import tokenizer` below relies on.
    import sys
    if '..' not in sys.path:
        sys.path.append('..')

    def __init__(self, pathToFile, min_desc_len=60, processes=30, chunksize=30):
        """Build ``self.dataMatrix``.

        Parameters
        ----------
        pathToFile : str or object
            If a string, the path of a JSON-lines file to load and tokenize.
            Anything else is stored as ``self.dataMatrix`` without inspection
            (the notebook passes an already-built DataFrame).
        min_desc_len : int, optional
            Records whose ``desc`` is not longer than this are dropped
            (default 60, the previously hard-coded value).
        processes : int, optional
            Number of worker processes used for tokenizing (default 30).
        chunksize : int, optional
            Chunk size handed to ``Pool.imap`` (default 30).
        """
        import pandas as pd
        from multiprocessing import Pool

        if isinstance(pathToFile, str):
            loaded_data = self._load_records(pathToFile, min_desc_len)

            # imap preserves input order, so rows line up with loaded_data.
            with Pool(processes) as pool:
                docs_tokens = list(
                    pool.imap(self.wrapper_tokenize, loaded_data, chunksize=chunksize))

            self.dataMatrix = pd.DataFrame(docs_tokens, columns=["title", "desc", "tag"])
        else:
            self.dataMatrix = pathToFile

    @staticmethod
    def _load_records(pathToFile, min_desc_len=60):
        """Read a JSON-lines file and keep records with ``len(desc) > min_desc_len``.

        Each line is parsed exactly once. ``json.loads`` is called without the
        ``encoding`` keyword, which newer Python versions reject with a
        TypeError; the file is already decoded as UTF-8 by ``open``.
        Whitespace-only lines are skipped instead of raising a decode error.
        """
        import json

        records = []
        with open(pathToFile, 'r', encoding='utf-8') as fin:
            for line in fin:
                if not line.strip():
                    continue
                record = json.loads(line)
                if len(record['desc']) > min_desc_len:
                    records.append(record)
        return records

    def getTrainingSet(self, label_class, random_state=None):
        """Return a shuffled, class-balanced sample for one-vs-rest training.

        Rows whose ``tag`` equals ``label_class`` form the target set and all
        other rows form the rest set. The larger of the two is down-sampled to
        the size of the smaller, the two are concatenated, and the result is
        shuffled.

        Parameters
        ----------
        label_class : object
            Value compared against the ``tag`` column.
        random_state : optional
            Forwarded to ``DataFrame.sample`` for reproducible sampling and
            shuffling. The default ``None`` keeps the original unseeded
            behaviour.
        """
        import pandas as pd

        targetSet = self.dataMatrix[self.dataMatrix['tag'] == label_class]
        restSet = self.dataMatrix[self.dataMatrix['tag'] != label_class]

        if targetSet.shape[0] < restSet.shape[0]:
            # target has less population than the rest
            restSet = restSet.sample(n=targetSet.shape[0], random_state=random_state)
        else:
            # target has at least as much population as the rest
            targetSet = targetSet.sample(n=restSet.shape[0], random_state=random_state)
        trainingSet = pd.concat([targetSet, restSet])

        # shuffle data using sample fraction = 1
        return trainingSet.sample(frac=1, random_state=random_state)

    def wrapper_tokenize(self, doc_dict):
        """Tokenize one record and return ``[title_tokens, desc_tokens, tag]``.

        Imports happen inside the method, so they run in the process that
        executes the call (the pool worker when invoked through ``Pool.imap``).
        """
        from src import tokenizer
        import tltk

        def tltk_tokenize(text):
            # Strip the <u/> and <s/> markers from tltk's output, then split on '|'.
            return tltk.segment(text).replace('<u/>', '').replace('<s/>', '').split('|')

        cleaner = tokenizer.cleanerFactory("../Resource/charset")
        # NOTE(review): the meaning of the literal 5 is defined by src.tokenizer.tokenize — confirm.
        title = tokenizer.tokenize(doc_dict['title'], tltk_tokenize, 5, cleaner)
        desc = tokenizer.tokenize(doc_dict['desc'], tltk_tokenize, 5, cleaner)
        tag = doc_dict['tag']
        return [title, desc, tag]

In [4]:
## Create data
import os
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer

file_name = 'validated_0822.pck'
file_path = '{}/../Resource/{}'.format(os.getcwd(), file_name)

# Wrap the pickled frame directly; a non-string argument is stored as dataMatrix as-is.
data = DataController(pandas.read_pickle(file_path))

## Create training data: balanced sample of tag '0' versus everything else
trainingData = data.getTrainingSet('0')

# The '*_seg' columns are used here; the plain 'desc' / 'title' columns are the alternative.
# NOTE(review): presumably '*_seg' holds pre-segmented text — confirm against the pickle.
training_Desc, training_Title, training_Label = (
    trainingData[column] for column in ('desc_seg', 'title_seg', 'tag')
)

FileNotFoundError: [Errno 2] No such file or directory: '/home/nuthasid/Dropbox/Python/JobPostAnalysis/Notebook/../Resource/validated_0822.pck'

In [None]:
## vectorize data

# Each fitted vectorizer transforms the text it is named for. The inputs were
# previously crossed (descriptions went through the title vectorizer and vice
# versa), so the printed shapes and stacked features did not match their labels.
# NOTE(review): any classifier pickled from the crossed features must be refit.
desc_vectorizer = vectorizer.vectorize_desc
# desc_vectorizer = TfidfVectorizer(tokenizer=tkn2.tokenizer, max_df=1.0, min_df=1)
desc_vec = desc_vectorizer.transform(training_Desc)
print('desc',desc_vec.shape)

title_vectorizer = vectorizer.vectorize_title
# title_vectorizer = TfidfVectorizer(tokenizer=tkn4.tokenizer, max_df=1.0, min_df=1)
title_vec = title_vectorizer.transform(training_Title)
print('title',title_vec.shape)

## stack title features and desc features side by side (title columns first)
from scipy.sparse import hstack
data_vec = hstack([title_vec, desc_vec])
print('data',data_vec.shape)

## create label_vec
label_vec = training_Label

In [None]:
def predict_cutoff(datavec, cutoff, predict_proba):
    """Label each row '1' or '0' by comparing its second probability to a cutoff.

    ``predict_proba`` is called once on ``datavec``; for every returned row the
    element at index 1 is compared with ``cutoff`` (strictly greater -> '1',
    otherwise '0'). The labels are returned as a list of strings.
    """
    labels = []
    for row in predict_proba(datavec):
        if row[1] > cutoff:
            labels.append('1')
        else:
            labels.append('0')
    return labels

In [None]:
#from sklearn.naive_bayes import MultinomialNB
#class MultiNB(MultinomialNB):
#    def __init__(self, cutoff):
#        super(MultiNB, self).__init__()
#        self.cutoff = cutoff
#    def predict(self, datavec):
#        result = self.predict_proba(datavec)
#        answer = ['1' if item[1] > self.cutoff else '0' for item in result]
#        return answer

In [None]:
from sklearn.model_selection import cross_val_score
from src.multinomialNB import MultiNB
# NOTE(review): the meaning of the {'0': 0.7, '1': 0.3} argument is defined by
# src.multinomialNB.MultiNB — confirm there.
NBclfProb = MultiNB({'0':0.7,'1':0.3})
# 6-fold cross-validation scored with macro-averaged F1.
scores = cross_val_score(
    estimator=NBclfProb,
    X=data_vec,
    y=label_vec,
    scoring='f1_macro',
    cv=6,
)
print('Cross validation score F1: ', scores)
print('Average:                   ', scores.mean())

In [None]:
## Train using Multinomial NaiveBayes 
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## 3-fold cross-validated macro-F1 for a plain Multinomial Naive Bayes
NBclf = MultinomialNB()
scores = cross_val_score(estimator=NBclf, X=data_vec, y=label_vec, scoring='f1_macro', cv=3)
print('Cross validation score: ', scores)

## hold out 30% of the rows as a test set (unseeded split)
desc_train, desc_test, label_train, label_test = train_test_split(
    data_vec, label_vec, test_size=0.3)

## In-sample report: a second classifier is fit on ALL rows and scored on the same rows
in_NBclf = MultinomialNB().fit(data_vec, label_vec)
label_predict = predict_cutoff(data_vec, 0.5, in_NBclf.predict_proba)
print(classification_report(label_vec, label_predict))

## Test-set report: NBclf is fit on the training split only and scored on the held-out split
NBclf = NBclf.fit(desc_train, label_train)
label_predict = predict_cutoff(desc_test, 0.5, NBclf.predict_proba)
print(classification_report(label_test, label_predict))

In [None]:
# Fit the MultiNB classifier on every row and pickle it for later use.
MNBclf = MultiNB({'0':0.7,'1':0.3}).fit(data_vec, label_vec)
with open('../Resource/MultiNB_STEMvsNONSTEM_0030vs0070_v02.pck', 'wb') as f:
    pickle.dump(MNBclf, f)

In [None]:
# NOTE(review): MNBclf was fit on all of data_vec, and desc_test is a subset of
# data_vec produced by train_test_split, so this report covers rows the model
# has already seen — it is not an out-of-sample estimate.
label_predict = MNBclf.predict(desc_test)
print(classification_report(label_test, label_predict))

In [None]:
import pandas
# data_vec was built from trainingData (a shuffled, balanced subsample of
# data.dataMatrix), so the predictions are row-aligned with trainingData.
# Column assignment aligns a Series by index label: a default 0..n-1 index
# would attach predictions to unrelated rows. Indexing by trainingData.index
# puts each prediction on its own row; rows outside the sample become NaN.
# NOTE(review): assumes the dataMatrix index labels are unique — confirm.
data.dataMatrix['predict'] = pandas.Series(
    predict_cutoff(data_vec, 0.3, in_NBclf.predict_proba),
    index=trainingData.index,
)

In [None]:
# Display the frame held by the controller, including any prediction columns added above.
data.dataMatrix

In [None]:
from sklearn.linear_model import LinearRegression as LR
# Ordinary least-squares fit on the full feature matrix; its continuous output
# is thresholded at 0.5 by later cells to obtain class labels.
LRclf = LR()
LRclf.fit(data_vec, label_vec)

In [None]:
# LRclf.predict(data_vec) yields one score per row of trainingData (data_vec is
# built from it). Column assignment aligns a Series by index label, so the
# Series must carry trainingData.index; a default 0..n-1 index would attach
# predictions to unrelated rows. Rows outside the sample become NaN.
# NOTE(review): assumes the dataMatrix index labels are unique — confirm.
data.dataMatrix['predict linear'] = pandas.Series(
    [1 if item > 0.5 else 0 for item in LRclf.predict(data_vec)],
    index=trainingData.index,
)

In [None]:
# Side-by-side frame of true tags and thresholded linear predictions ('1' when score > 0.5).
data_pred = pandas.DataFrame({
    'tag': list(label_vec),
    'pred': ['1' if item > 0.5 else '0' for item in LRclf.predict(data_vec)],
})

In [None]:
def pred_linear(data, predict):
    """Threshold continuous predictions at 0.5 into integer 0/1 labels.

    Parameters
    ----------
    data : object
        Input passed to ``predict`` (e.g. a feature matrix).
    predict : callable
        Function returning one numeric score per row of ``data``
        (e.g. ``LRclf.predict``).

    Returns
    -------
    list of int
        1 where the score is strictly greater than 0.5, otherwise 0.

    The previous body ignored both arguments and always used the notebook
    globals ``LRclf`` and ``data_vec``.
    """
    return [1 if item > 0.5 else 0 for item in predict(data)]

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## see crossvalidation score

## hold out 30% of the rows as a test set (unseeded split)
desc_train, desc_test, label_train, label_test = train_test_split(
    data_vec, label_vec, test_size=0.3)

## Fit on the training split, then report on ALL rows (training and held-out together)
LRclf = LR()
LRclf.fit(X=desc_train, y=label_train)
label_predict = list(map(lambda score: '1' if score > 0.5 else '0', LRclf.predict(data_vec)))
print(classification_report(label_vec, label_predict))

## Test set accuracy: report on the held-out rows only
label_predict = list(map(lambda score: '1' if score > 0.5 else '0', LRclf.predict(desc_test)))
print(classification_report(label_test, label_predict))

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve for Multinomial Naive Bayes at 20%..100% of the training folds, 5-fold CV.
NBclf = MultinomialNB()
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=NBclf,
    X=data_vec,
    y=label_vec,
    train_sizes=[0.2, 0.4, 0.6, 0.8, 1.0],
    cv=5,
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation score curves for an estimator.

    A new pyplot figure is created, ``learning_curve`` is run once, and for
    both the training and the validation scores the per-size mean is drawn as
    a line with a shaded band of one standard deviation around it.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Feature data, forwarded to ``learning_curve``.

    y : array-like
        Targets, forwarded to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Y-axis limits; left untouched when None.

    cv : optional
        Cross-validation strategy, forwarded unchanged to ``learning_curve``.

    n_jobs : integer, optional
        Number of parallel jobs, forwarded to ``learning_curve`` (default 1).

    train_sizes : array-like, optional
        Training-set sizes, forwarded to ``learning_curve``. Defaults to five
        evenly spaced fractions from 0.1 to 1.0.

    Returns
    -------
    module
        The ``plt`` (pyplot) module, so callers can chain ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each curve; statistics are taken across CV folds.
    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
         "r", "Training score"),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1),
         "g", "Cross-validation score"),
    ]
    plt.grid()

    # Bands first, then lines, so the drawing order matches the original.
    for centre, spread, colour, _ in curves:
        plt.fill_between(train_sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, legend_label in curves:
        plt.plot(train_sizes, centre, 'o-', color=colour, label=legend_label)

    plt.legend(loc="best")
    return plt


#digits = load_digits()
X, y = data_vec, label_vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## split sample into train_set and test_set
desc_train, desc_test, label_train, label_test = train_test_split(data_vec, label_vec, test_size=0.3)

## Linear-regression check: fit on the training split, report on ALL rows.
## This evaluation is independent of the Naive Bayes curve plotted below.
LRclf = LR()
LRclf.fit(X=desc_train, y=label_train)
label_predict = ['1' if item > 0.5 else '0' for item in LRclf.predict(data_vec)]
print(classification_report(label_vec, label_predict))

## Test set accuracy: report on the held-out rows only
label_predict = ['1' if item > 0.5 else '0' for item in LRclf.predict(desc_test)]
print(classification_report(label_test, label_predict))

title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

estimator = MultinomialNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

#title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
## SVC is more expensive so we do a lower number of CV iterations:
#cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
#estimator = SVC(gamma=0.001)
#plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

# plt.show() returns None; wrapping it in print() only emitted a stray "None".
plt.show()

In [None]:
title = "Learning Curves (Linear)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

estimator = LR()
# Pass the estimator INSTANCE; the class object LR itself cannot be cloned or
# fitted by learning_curve.
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

#title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
## SVC is more expensive so we do a lower number of CV iterations:
#cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
#estimator = SVC(gamma=0.001)
#plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

# plt.show() returns None; wrapping it in print() only emitted a stray "None".
plt.show()

In [None]:
class LinearClass(LR):
    """Linear regression whose ``predict`` returns hard 0/1 labels.

    The regression output of the parent class is thresholded at
    ``self.cutoff``: values strictly greater than the cutoff become 1,
    everything else becomes 0.
    """

    def __init__(self, cutoff=0.5):
        """Create the estimator with parent defaults and the given cutoff.

        ``cutoff`` defaults to 0.5, the value that was previously hard-coded,
        so ``LinearClass()`` behaves as before.
        """
        super().__init__()
        self.cutoff = cutoff

    def predict(self, datavec):
        """Return a list of ints: 1 where the regression output exceeds the cutoff, else 0."""
        result = super().predict(datavec)
        return [1 if item > self.cutoff else 0 for item in list(result)]

    def get_params(self, deep=True):
        """Delegate to the parent implementation.

        ``deep`` now has a default so the method can be called without
        arguments. The original ``def`` line carried stray pasted text that
        made the whole cell a SyntaxError.
        """
        return super().get_params(deep)

In [None]:
title = "Learning Curves (Linear)"
# This ShuffleSplit is assigned but NOT used by the call below, which passes
# cv=6 (6-fold cross-validation). The assignment is kept so the notebook-level
# name `cv` keeps the same value as before.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

plot_learning_curve(LinearClass(), title, X, y, ylim=(-1.0, 1.01), cv=6, n_jobs=-1)

#title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
## SVC is more expensive so we do a lower number of CV iterations:
#cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
#estimator = SVC(gamma=0.001)
#plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

# plt.show() returns None; wrapping it in print() only emitted a stray "None".
plt.show()

In [None]:
# Instantiate the thresholding wrapper with no arguments and keep it in `test`.
test = LinearClass()

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## see crossvalidation score

## hold out 30% of the rows as a test set (unseeded split)
desc_train, desc_test, label_train, label_test = train_test_split(
    data_vec, label_vec, test_size=0.3)

## Fit an SVC on the training split, then report on ALL rows.
## The name LRclf is reused from the linear-regression cells; here it holds an SVC.
LRclf = SVC()
LRclf.fit(desc_train, label_train)
label_predict = LRclf.predict(data_vec)
print(classification_report(label_vec, label_predict))

## Test set accuracy: report on the held-out rows only
label_predict = LRclf.predict(desc_test)
print(classification_report(label_test, label_predict))