In [40]:
class Training_Data:
    
    def __init__(self, data_file):
        self.data_file = data_file
        
    def load_data(self):
        
        import os
    
        file_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', self.data_file))    
        with open(file_path, 'rt', encoding='utf-8') as tf:
            loaded_ad = tf.read().split('\n')
        job_ads = []
        job_ads = [self.parse_data(item) for item in loaded_ad[:-1]]
        self.raw_data = job_ads
        self.classes = set([item['class'] for item in job_ads])
    
    def parse_data(self, input_line):
        
        cur_text = input_line.split('`')
        ret_data = {}
        ret_data['id'] = cur_text[1]
        ret_data['company'] = cur_text[3]
        ret_data['position'] = cur_text[5]
        ret_data['url'] = cur_text[7]
        ret_data['desc'] = cur_text[9]
        ret_data['class'] = ''.join(cur_text[11:]).replace(',NA', '')
    
        return ret_data
    
    def create_training_set(self, labels):
        
        self.label_dict = {}
        for i, item in enumerate(labels):
            self.label_dict[item] = i + 1
        self.label = [self.label_dict[item['class']] if item['class'] in self.label_dict else 0 for item in self.raw_data]
        self.label_names = [item['class'] if item['class'] in self.label_dict else 'Other' for item in self.raw_data]
        
    def vectorize(self):
        
        def remove_white_space(in_text):
            
            on_text = in_text[:].replace(u'\xa0', ' ')
            while on_text.find('  ') > -1:
                on_text = on_text.replace('  ', ' ')
                
            return on_text
        
        self.sample_desc = [remove_white_space(item['desc']) for item in self.raw_data]
        self.sample_title = [remove_white_space(item['position']) for item in self.raw_data]
        
        from sklearn.feature_extraction.text import TfidfVectorizer
        import os
        os.sys.path.append('..')
        from tokenizer import Tokenizer as my_tokenizer
        tkn1 = my_tokenizer(1)
        tkn2 = my_tokenizer(2)
        tkn3 = my_tokenizer(3)
        tkn4 = my_tokenizer(4)
        desc_vectorizer = TfidfVectorizer(tokenizer=tkn2.tokenizer)
        self.sample_desc = desc_vectorizer.fit_transform(TD.sample_desc)
        title_vectorizer = TfidfVectorizer(tokenizer=tkn4.tokenizer)
        self.sample_title = title_vectorizer.fit_transform(TD.sample_title)
        
    def balance_training_set(self, balance_class):
    
        import numpy as np
        prop = np.mean([np.array(self.label) == self.label_dict[balance_class]])
        
        if prop < 0.5:
            prop = prop / (1 - prop)
            mask = [True if item == self.label_dict[balance_class] or np.random.rand() < prop else False for item in self.label]
        else:
            prop = (1 - prop) / prop
            mask = [True if item != self.label_dict[balance_class] or np.random.rand() < prop else False for item in self.label]
        sample_desc = []
        sample_title = []
        label = []
        for i, item in enumerate(mask):
            if item:
                sample_desc.append(self.sample_desc[i])
                sample_title.append(self.sample_title[i])
                label.append(self.label[i])
        return label, sample_title, sample_desc

In [41]:
TD = Training_Data('trainingset_20180406.csv')
TD.load_data()
TD.vectorize()
import pickle
with open('trainingset.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(TD, f, pickle.HIGHEST_PROTOCOL)

In [43]:
import pickle
with open('trainingset.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(TD, f, pickle.HIGHEST_PROTOCOL)

In [45]:
import pickle
with open('trainingset.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    TD = pickle.load(f)
TD.create_training_set(TD.classes)

In [46]:
print(TD.sample_desc.shape, TD.sample_title.shape)
TD.create_training_set(['Other'])
print(len(TD.label))

(2004, 30544) (2004, 8425)
2004


In [47]:
label_vec, title_vec, desc_vec = TD.balance_training_set('Other')

0.7684630738522954


In [56]:
from scipy.sparse import vstack, hstack
data_vec = hstack([vstack(title_vec), vstack(desc_vec)])
import numpy as np
label_vec = np.array(label_vec)

(909, 38969)


In [60]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import metrics
clf = SVC(degree=200)
scores = cross_val_score(clf, data_vec, label_vec, cv=5, scoring='accuracy')
print(scores)

[0.51098901 0.51098901 0.51098901 0.51098901 0.50828729]


In [59]:
from sklearn.model_selection import train_test_split
desc_train, desc_test, label_train, label_test = train_test_split(data_vec, label_vec, test_size=0.3)
clf = clf.fit(desc_train, label_train)
label_predict = clf.predict(desc_test)
from sklearn.metrics import classification_report
print(classification_report(label_test, label_predict, target_names=['Other', 'STEM']))

             precision    recall  f1-score   support

      Other       0.51      1.00      0.67       139
       STEM       0.00      0.00      0.00       134

avg / total       0.26      0.51      0.34       273



  'precision', 'predicted', average, warn_for)


In [79]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_validate
bclf = BernoulliNB()
scores = cross_validate(bclf, data_vec, label_vec, cv=5, scoring=['precision_macro', 'recall_macro', 'f1_macro'])
print(scores)
from sklearn.model_selection import train_test_split
desc_train, desc_test, label_train, label_test = train_test_split(data_vec, label_vec, test_size=0.3)
bclf = bclf.fit(desc_train, label_train)
label_predict = bclf.predict(desc_test)
from sklearn.metrics import classification_report
print(classification_report(label_test, label_predict, target_names=['Other', 'STEM']))
in_bclf = BernoulliNB()
in_bclf = in_bclf.fit(data_vec, label_vec)
label_predict = bclf.predict(data_vec)
from sklearn.metrics import classification_report
print(classification_report(label_vec, label_predict, target_names=['Other', 'STEM']))

{'fit_time': array([0.00499964, 0.0049963 , 0.00399923, 0.00499725, 0.00399613]), 'score_time': array([0.01000476, 0.01000643, 0.00999951, 0.00900245, 0.01000094]), 'test_precision_macro': array([0.79850467, 0.75497835, 0.76587302, 0.80258548, 0.78363914]), 'train_precision_macro': array([0.97523017, 0.94209656, 0.94310382, 0.95761987, 0.95153471]), 'test_recall_macro': array([0.78941646, 0.74906367, 0.72665217, 0.80258548, 0.77186126]), 'train_recall_macro': array([0.97523017, 0.94148066, 0.93404555, 0.95770995, 0.95010269]), 'test_f1_macro': array([0.78914634, 0.74614918, 0.71916234, 0.8021978 , 0.77068076]), 'train_f1_macro': array([0.97523017, 0.94084566, 0.93490739, 0.95735869, 0.95046533])}
             precision    recall  f1-score   support

      Other       0.76      0.94      0.84       139
       STEM       0.91      0.69      0.79       134

avg / total       0.83      0.82      0.81       273

             precision    recall  f1-score   support

      Other       0.88   

In [None]:
para_bclf = in_bclf.get_params(True)
print(para_bclf)

In [None]:
prob_bclf = in_bclf.predict_proba(desc_test)
print(prob_bclf[:10])

In [None]:
from sklearn import tree
tclf = tree.DecisionTreeClassifier()
scores = cross_val_score(tclf, data_vec, label_vec, cv=6, scoring='f1')
print(scores)
from sklearn.model_selection import train_test_split
desc_train, desc_test, label_train, label_test = train_test_split(data_vec, label_vec, test_size=0.3)
tclf = tclf.fit(desc_train, label_train)
label_predict = tclf.predict(desc_test)
from sklearn.metrics import classification_report
print(classification_report(label_test, label_predict, target_names=['other','Clerks, etc.']))
in_tclf = tclf.fit(data_vec, label_vec)
label_predict = tclf.predict(data_vec)
from sklearn.metrics import classification_report
print(classification_report(label_vec, label_predict, target_names=['other','Clerks, etc.']))

In [None]:
import pickle

In [None]:
predict_all = bclf.predict(data_vec)
false_neg = []
for i, item in enumerate(predict_all):
    if label_vec[i,] == 0 and predict_all[i,] != 0:
        false_neg.append([TD.sample_title[i], TD.sample_desc[i]])
for item in false_neg:
    print(item[0] + '\n' + item[1] + '\n')

In [None]:
for i, item in enumerate(TD.label):
    if item == 1:
        print(TD.sample_title[i] + '\n')

In [None]:
from sklearn.feature_selection import mutual_info_classif as mic
mutual_info_desc = mic(desc_vec, label_vec)

In [None]:
token_desc_info = []
for token in desc_vectorizer.vocabulary_:
    token_desc_info.append([token, mutual_info_desc[desc_vectorizer.vocabulary_[token]]])

In [None]:
import csv

with open('mutual_info_desc.txt', 'wt', encoding='utf-8', newline='') as of:
    csv_write = csv.writer(of, delimiter=',', dialect='excel')
    csv_write.writerows(token_desc_info)

In [None]:
print([item for item in desc_vectorizer.vocabulary_][:10])