In [1]:
! pip install --quiet fuzzywuzzy
! pip install --quiet nltk
! pip install --quiet diskcache
! pip install --quiet python-Levenshtein
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [7]:
import nltk 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
%load_ext autoreload 
%autoreload 2

import os
import pandas as pd
import numpy as np
from glob import glob
import re
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from embed_software.preprocess import *
from embed_software.utils import *
from validation.title_matching import layered_matcher, title_matcher, punct_lookup, exact_matcher
from dot_data import get_dictionary, LemmaTokenizer

pd.set_option('max_colwidth',50)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
def get_indeed_texts(path, **kwargs):
    """Reads csv with indeed data that turns into test set"""
    indeed = pd.read_csv(path, **kwargs)
    indeed['description'] = indeed.description.str.replace('Job Summary ', '')
    indeed['title'] = indeed.title.str.lower()
    return indeed

def get_soc2(socs):
    return socs.str.split('-').map(lambda a: a[0]).astype(int)

def make_matcher():
    """Returns function that matches titles to SOC code"""
    lookup = pd.read_csv('classify_dot/crosswalks/soc-title-lookup.csv')    
    matcher = layered_matcher([
        exact_matcher(lookup), 
        title_matcher(lookup, punct_lookup(lookup))
    ])
    return matcher

def indeed_test_data(texts, lim):
    """Make test data from indeed (pre-embedded)"""
    indeed = get_indeed_texts(texts, nrows=lim)
    matcher = make_matcher()
    matches = matcher(indeed.reset_index()).set_index('index')    
    processor = Preprocessor(readme_processor, 1, 1, 6).process
    descriptions = matches.description.map(processor)     
    return matches.description, get_soc2(matches.code) 

class PreEmbeddedVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, embed_path, model, lim):
        self.embed_path = embed_path
        self.model = model
        self.lim = lim
        
    def fit(self, X, y=None):
        self.fit_X = X
        self.embeddings = get_embeddings(self.embed_path, self.lim)
        return self
    
    def transform(self, X):
        # Our test set is pre-embedded, but our train set not!
        # This should simply be a cached embedding...? 
        if self.fit_X is X:
            return embed_docs(self.model, '\n'.join(X))
        else:
            return self.embeddings[X.index] 
            

def dot_train_data():
    """Combine DOT Dictionary and Tasks descriptions for training set"""   

    dot_dict = get_dictionary('classify_dot')
    tasks = pd.read_csv('classify_dot/tasks.txt', sep='\t')
    processor = Preprocessor(readme_processor, 1, 1, 6).process
    X_train = pd.concat([dot_dict.job_description, tasks.Task]).map(processor)    
    y_train = pd.concat([dot_dict.soc2, get_soc2(tasks['O*NET-SOC Code'])])
    return X_train, y_train

In [43]:
SAMPLE_SIZE = 200000

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

models = [
    Pipeline([('tfidf', pd.read_pickle('classify_dot/models/tfidf.pkl')),
              ('lr', pd.read_pickle('classify_dot/models/logistic-model.pkl'))]),
    Pipeline([('tfidf', TfidfVectorizer()),
              ('lr', LogisticRegression(solver="newton-cg", multi_class="multinomial", n_jobs=-1))]),
    Pipeline([('embed', PreEmbeddedVectorizer('ss_embeds/ss_100.txt', 'ss_models/sentencespace', SAMPLE_SIZE)),
             ('lr', LogisticRegression(solver="newton-cg", multi_class="multinomial", n_jobs=-1))]),
    Pipeline([('embed', PreEmbeddedVectorizer('ss_embeds/ss_100.txt', 'ss_models/sentencespace', SAMPLE_SIZE)),
             ('knn', KNeighborsClassifier(7))]),
]

In [45]:
X_train, y_train = dot_train_data()
X_test, y_test = indeed_test_data('everything.csv', SAMPLE_SIZE)

In [46]:
preds = [m.fit(X_train, y_train).predict(X_test) for m in models]

In [47]:
[accuracy_score(p, y_test) for p in preds]

[0.46138278345178141,
 0.47079587706499743,
 0.43380241916505857,
 0.42716618816774132]

In [48]:
from sklearn.metrics import classification_report

for p in preds:
    print(classification_report(y_test, p))

             precision    recall  f1-score   support

         11       0.42      0.35      0.38      3273
         13       0.32      0.50      0.39      2650
         15       0.68      0.61      0.64      6479
         17       0.31      0.49      0.38       929
         19       0.39      0.22      0.28       457
         21       0.13      0.25      0.17       217
         23       1.00      0.16      0.28        56
         25       0.51      0.56      0.53       442
         27       0.71      0.45      0.55      2635
         29       0.67      0.51      0.58       821
         31       0.22      0.17      0.19        65
         33       0.05      0.04      0.04       134
         35       0.87      0.38      0.53       291
         37       0.69      0.12      0.21       166
         39       0.31      0.29      0.30       133
         41       0.37      0.46      0.41       967
         43       0.18      0.28      0.22       722
         45       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)
