In [4]:
! pip install --quiet fuzzywuzzy[speedup]

In [1]:
import pandas as pd
import json

In [2]:
with open('./data/sample.jl', 'r') as f:
    jobs = pd.DataFrame([json.loads(d) for d in f])

In [3]:
direct_match = (pd
                .read_excel('./data/SOC/soc_2010_direct_match_title_file.xls', skiprows=6)
                .rename(columns = {'2010 SOC Direct Match Title': 'title', 
                                     '2010 SOC Code': 'code'})[['title', 'code']])

samples = (pd
           .read_excel('./data/SOC/Sample of Reported Titles.xlsx')
           .rename(columns = {'Reported Job Title': 'title', 
                              'O*NET-SOC Code': 'code'})[['title', 'code']])

alternates = (pd
              .read_excel('./data/SOC/Alternate Titles.xlsx')
              .rename(columns = {'Alternate Title': 'title', 
                                 'O*NET-SOC Code': 'code'})[['title', 'code']])

alternates['code'] = alternates.code.str.slice(0,7)
samples['code'] = samples.code.str.slice(0,7)

In [4]:
lookup = pd.concat([direct_match, samples, alternates]).drop_duplicates('title')
lookup['title'] = lookup.title.str.lower()

In [82]:
lookup.to_csv('crosswalks/soc-title-lookup.csv', index=False)

In [None]:
jobs['title'] = jobs.title.str.lower()
j = jobs.merge(lookup, how='left', on = ['title'])

In [62]:
# Percent found in exact string match
j[~j.code.isna()].shape[0] / j.shape[0]

0.2565

### Explore matching with partial matches

In [80]:
def lookup_two(lookup):
    def f(t):
        z = zip(lookup.code, lookup.title)
        split = t.split(' ')
        options = [(code, title) for code,title in z if title in t]
        if len(options) > 0:
            options = sorted(options, key = lambda t: -len(t[1]))
        return options[0][0] if options else None
    return f

In [81]:
from fuzzywuzzy import fuzz, utils, process

def lookup_code(lookup):
    def f(t):
        try:
            prop, score = process.extractOne(t, lookup.title, score_cutoff = 0)
            return prop
        except ValueError:
            return None
    return f

In [None]:
leftovers = j[j.code.isnull()].reset_index(drop=True)
leftovers.title[:20].map(lookup_two(lookup))

In [None]:
t = utils.full_process(j[j.code.isnull()].title.values[5])
lookup_two(lookup)(t)