In [1]:
! pip install --quiet nltk

In [None]:
import nltk 
# Fetch the 'wordnet' resource through NLTK's downloader; presumably this is the
# corpus the WordNetLemmatizer imported in the next cell relies on.
nltk.download('wordnet')

In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
from nltk.stem import WordNetLemmatizer
import re
from validation.title_matching import layered_matcher, title_matcher, punct_lookup, exact_matcher

# Cap the displayed width of each cell at 50 characters. The fully qualified
# option name is used because pandas resolves the short form 'max_colwidth' by
# pattern matching, which breaks if another option matching it is ever added.
pd.set_option('display.max_colwidth', 50)

In [2]:
class LemmaTokenizer(object):
    """Callable tokenizer: split a document on ``regex`` and lemmatize each token.

    Attributes:
        regex: pattern handed to ``re.findall`` to extract tokens
            (runs of word characters by default).
        wnl: the ``WordNetLemmatizer`` instance applied to every token.
    """

    regex = r'\w+'

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens found in ``doc``, in order."""
        tokens = re.findall(self.regex, doc)
        return list(map(self.wnl.lemmatize, tokens))
    
def get_indeed_texts(path, **kwargs):
    """Read a CSV of postings and build one text string per row.

    Args:
        path: location (or buffer) passed straight to ``pd.read_csv``.
        **kwargs: forwarded unchanged to ``pd.read_csv`` (e.g. ``nrows``).

    Returns:
        A ``(text, frame)`` tuple. ``text`` is the title joined to the
        description with a single space; ``frame`` is the loaded DataFrame
        with 'Job Summary ' removed from ``description`` and ``title``
        lower-cased.
    """
    postings = pd.read_csv(path, **kwargs)
    postings['description'] = postings['description'].str.replace('Job Summary ', '')
    # Join before lower-casing so the combined text keeps the original title casing.
    combined = postings['title'].str.cat(postings['description'], sep=' ')
    postings['title'] = postings['title'].str.lower()
    return combined, postings

def get_desc_lookup(path):
    """Load a Stata crosswalk and keep one row per ``soc2`` value.

    Args:
        path: location passed to ``pd.read_stata``.

    Returns:
        DataFrame with columns ``desc_soc2``, ``soc2`` and ``soc6``, holding
        the first row encountered for each distinct ``soc2``.
    """
    crosswalk = pd.read_stata(path)
    first_per_soc2 = crosswalk.drop_duplicates(subset=['soc2'])
    wanted = ['desc_soc2', 'soc2', 'soc6']
    return first_per_soc2[wanted]

In [10]:
# Load at most the first 50,000 rows of the scrape: `text` is the combined
# title + description per row, `indeed` is the cleaned frame.
text, indeed = get_indeed_texts(
    'data/2018-08-02T09-00-19.csv',
    nrows=50000,
)

In [11]:
# Saved model artifacts. NOTE: unpickling can execute code embedded in the file,
# so these must only ever be loaded from a trusted location.
# NOTE(review): the pickled vectorizer presumably refers to LemmaTokenizer, which
# would be why that class is defined in this notebook -- confirm.
vectorizer = pd.read_pickle('models/tfidf.pkl')
policy_prob = pd.read_pickle('models/logistic-model.pkl')

# Lookup tables: soc2 descriptions from the crosswalk, and the SOC title list
# that feeds the matchers.
desc_lookup = get_desc_lookup('crosswalks/dot1991_census_soc_crosswalk.dta')
lookup = pd.read_csv('crosswalks/soc-title-lookup.csv')

# Build the individual matching stages, then combine them in this order.
exact_stage = exact_matcher(lookup)
title_stage = title_matcher(lookup, punct_lookup(lookup))
matcher = layered_matcher([exact_stage, title_stage])

In [12]:
# Run the layered matcher over the postings frame. The following cell reads the
# `code` and `description` columns from the result.
matches = matcher(indeed)

In [13]:
# Derive the integer soc2 code (the part of the matched SOC code before the
# first '-') and attach the crosswalk description for that soc2.
soc_prefix = matches['code'].str.split('-').map(lambda parts: parts[0])
matches['soc2'] = soc_prefix.astype(int)
df = matches.merge(desc_lookup, on='soc2', how='left')

# Features are the posting descriptions; labels are the soc2 descriptions.
X = df['description']
y = df['desc_soc2']

In [14]:
# Transform the validation descriptions with the loaded vectorizer, then
# predict one label per row with the loaded model.
vecs = vectorizer.transform(X)
preds = policy_prob.predict(vecs)

In [15]:
# Share of rows whose predicted label equals the title-matched label.
n_correct = (preds == y).sum()
accuracy = n_correct / len(preds)
accuracy

0.35587842351369403

In [16]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support of the predictions against the
# title-matched labels. print() is used so the report's line breaks render.
print(classification_report(y, preds))

                                                            precision    recall  f1-score   support

                  Architecture and Engineering Occupations       0.32      0.48      0.38       188
Arts, Design, Entertainment, Sports, and Media Occupations       0.60      0.54      0.57       998
 Building and Grounds Cleaning and Maintenance Occupations       0.83      0.14      0.23        37
             Business and Financial Operations Occupations       0.39      0.37      0.38      1049
                  Community and Social Service Occupations       0.00      0.00      0.00        20
                     Computer and Mathematical Occupations       0.95      0.10      0.19      1030
                   Construction and Extraction Occupations       0.17      0.03      0.04        39
              Education, Training, and Library Occupations       0.37      0.37      0.37       125
                Farming, Fishing, and Forestry Occupations       0.00      0.00      0.00         8

  'precision', 'predicted', average, warn_for)


In [19]:
# Number of labelled rows that went into the evaluation above.
y.shape

(5988,)