In [1]:
import pandas as pd
import gensim
import pickle
import sys

sys.path.append("Job Tag Classifier Tools")
from Pipeline import tag_decoder
from DataCollection import remove_unwanted_rows
from FeatureCreation import aggregate_job_tag_rows
from FeatureProcessing import clean_text, strip_text, stem_text, target_encoder

# Model Training

In [7]:
df = pd.read_csv("/Volumes/SD.Card/ML_Data/Cutback/big_bertha.csv")

In [8]:
df = remove_unwanted_rows(df)
df = aggregate_job_tag_rows(df)
df = clean_text(df)
df = strip_text(df)
y = target_encoder(df)

In [9]:
def preprocess_strings(data):
        # preprocessing step to reduce words to semi-root form
        CUSTOM_FILTERS = [lambda x: x.lower(),
                          gensim.parsing.preprocessing.strip_tags,
                          gensim.parsing.preprocessing.strip_punctuation,
                          gensim.parsing.preprocessing.strip_numeric,
                          gensim.parsing.preprocessing.remove_stopwords,
                          gensim.parsing.preprocessing.strip_short,
                          gensim.parsing.preprocessing.strip_multiple_whitespaces]
        texts = [gensim.parsing.preprocessing.preprocess_string(doc, CUSTOM_FILTERS) for doc in data]
        return texts

In [10]:
text = preprocess_strings(df.job_description.values)
documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(text)]

In [14]:
model = gensim.models.doc2vec.Doc2Vec(documents, vector_size=100, window=400, epoch=1, min_count=1, workers=4)
vocab = (set(model.wv.vocab.keys()))

In [16]:
pickle.dump(vocab, open("vocab","wb"))
model.save("doc2vec")

In [18]:
infered_vectors = [model.infer_vector(row) for row in text]

### Train Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [19]:
clf = RandomForestClassifier(min_samples_leaf=10)

In [20]:
clf.fit(infered_vectors, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
pickle.dump(clf, open("classifier","wb"))

#### Testing Classifier

In [22]:
df = pd.read_csv("/Volumes/SD.Card/ML_Data/Cutback/job_data.csv")

In [23]:
df = remove_unwanted_rows(df)
df = aggregate_job_tag_rows(df)
df = clean_text(df)
df = strip_text(df)

with open("target_tokens.pkl", 'rb') as handle:
    tokenizer = pickle.load(handle)
y = tokenizer.transform(df.job_targets)

text = preprocess_strings(df.job_description.values)

  .format(sorted(unknown, key=str)))


In [24]:
infered_vectors = [model.infer_vector(row) for row in text]

In [25]:
predictions = clf.predict(infered_vectors)

In [30]:
print(classification_report(y,predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.00      0.00      0.00       202
           2       0.00      0.00      0.00        43
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00        70
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00        70
           7       0.00      0.00      0.00        22
           8       0.00      0.00      0.00       138
           9       0.00      0.00      0.00        67
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00    

# Model Inference

In [31]:
test_event = "Hello my name is Matt and this is the second time that i am typing this statement.", "None"

In [32]:
df = pd.DataFrame(columns=["job_description","job_title"], data = [test_event])

In [33]:
df = remove_unwanted_rows(df)
df = clean_text(df)
df = strip_text(df)

In [34]:
def preprocess_strings(data):
        # preprocessing step to reduce words to semi-root form
        CUSTOM_FILTERS = [lambda x: x.lower(),
                          gensim.parsing.preprocessing.strip_tags,
                          gensim.parsing.preprocessing.strip_punctuation,
                          gensim.parsing.preprocessing.strip_numeric,
                          gensim.parsing.preprocessing.remove_stopwords,
                          gensim.parsing.preprocessing.strip_short,
                          gensim.parsing.preprocessing.strip_multiple_whitespaces]
        texts = [gensim.parsing.preprocessing.preprocess_string(doc, CUSTOM_FILTERS) for doc in data]
        return texts

In [35]:
text = preprocess_strings(df.job_description.values)

#### Load Models and Vocab

In [36]:
model = gensim.models.doc2vec.Doc2Vec.load("doc2vec")
with (open("vocab", "rb")) as f:
    vocab = pickle.load(f)
with open("target_tokens.pkl", 'rb') as handle:
    tokenizer = pickle.load(handle)

In [37]:
event = [word for word in text[0] if word in vocab]

In [38]:
list_of_indices = clf.predict([model.infer_vector(event)])

#### Decode Targets

In [39]:
target = []
for i, num in enumerate(list_of_indices[0]):
    if num > .5:
        target.append(str(tokenizer.classes_[i]))

In [40]:
target

['Software', 'Tech']

In [41]:
tokenizer.classes_

array(['Angular', 'Backend', 'Customer Service', 'Database', 'Designer',
       'DynamoDB', 'Express.js', 'Flask', 'Frontend', 'Full-Stack',
       'Hardware', 'MSSQL', 'Management', 'Marketing', 'MongoDB', 'MySQL',
       'NoSQL', 'PostgreSQL', 'Product Manager', 'React', 'Recruiting',
       'Redis', 'SQL', 'Sales', 'Social Media', 'Software', 'Tech', 'Web'],
      dtype=object)