In [1]:
%load_ext autoreload
%autoreload 2

import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from researchrobot.embeddings import run_embeddings
from researchrobot import cache_dl, ObjectStore

# Plotting / notebook setup: render matplotlib figures inline, switch seaborn
# to its 'notebook' context, and call metapack's Jupyter init hook.
%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

# Set the RESEARCH_ROBOT_DEFAULT_CACHE environment variable to ./_cache.
%env RESEARCH_ROBOT_DEFAULT_CACHE ./_cache
# `rc` is the object store used later in this notebook as a key/value cache.
# NOTE(review): presumably a filesystem-backed store living under the same
# _cache directory -- confirm against researchrobot's ObjectStore.
os_config = dict(class_='FSObjectStore', bucket='build', path='_cache')
rc = ObjectStore.new(**os_config)



env: RESEARCH_ROBOT_DEFAULT_CACHE=./_cache


In [2]:
# Open the metapack package for this notebook via open_source_package();
# the commented-out line is the alternative open_package() call.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# NOTE(review): presumably set_sys_path() makes the package's `pylib` module
# importable, which would be why the star import sits here rather than in the
# import cell at the top -- confirm. The star import hides which names are used.
pkg.set_sys_path()
from pylib import *

# Bare expression: shows the package as the cell output.
pkg

In [3]:
# NOTE(review): spkg is not used in this cell; presumably needed elsewhere.
spkg = pkg.reference('soc_embed_pkg').package

# Load the three title tables from the package references and give their
# columns short, uniform names.
occ_titles = pkg.reference('onet_occ').dataframe()
occ_titles.columns = ['soc', 'title', 'desc']

rep_titles = pkg.reference('onet_rtitles').dataframe()
rep_titles.columns = ['soc', 'soc_title', 'other_title', 'show']

alt_titles = pkg.reference('onet_atitles').dataframe()
alt_titles.columns = ['soc', 'title', 'alt_title', 'short_title', 'source']

# Every (table, column) pair that contributes title strings. Each one is
# projected to a two-column (soc, title) frame before stacking.
title_sources = [
    (occ_titles, 'title'),
    (rep_titles, 'soc_title'),
    (rep_titles, 'other_title'),
    (alt_titles, 'title'),
    (alt_titles, 'alt_title'),
]

stacked = pd.concat([
    frame[['soc', col]].rename(columns={col: 'title'})
    for frame, col in title_sources
])

# One row per distinct (soc, title) pair, ordered by SOC code.
titles = stacked.drop_duplicates().sort_values('soc')

print(titles.shape)
titles.head()

(60379, 2)


Unnamed: 0,soc,title
0,11-1011.00,Chief Executives
62,11-1011.00,Public Works Commissioner
61,11-1011.00,Public Health Director
60,11-1011.00,Private Sector Executive
58,11-1011.00,Police Commissioner


In [4]:
# Some SOC codes appear in the title tables but not in the main occupation
# table; they are outdated and must be re-coded to a current SOC or dropped.
known_soc = titles['soc'].isin(occ_titles['soc'].unique())
del_titles = titles[~known_soc]
remain_titles = titles[known_soc]

# Pair every outdated row with the current SOC (if any) that carries the same
# title. Rows without a counterpart get a null `new_soc`.
t = del_titles.merge(remain_titles.rename(columns={'soc': 'new_soc'}), how='left', on='title')

# Old SOC -> new SOC, learned from the titles the two sets share.
# NOTE(review): when one old SOC matches several new SOCs, the last match in
# `t` wins -- confirm that this tie-break is acceptable.
matched = t[t['new_soc'].notnull()]
soc_map = dict(zip(matched['soc'], matched['new_soc']))

# Titles with no counterpart are carried over under the mapped SOC.
# Series.map leaves NaN where the old SOC has no entry in soc_map; those
# SOC codes cannot be re-coded and their titles are dropped.
unmatched = t[t['new_soc'].isnull()]
mapped_soc = unmatched['soc'].map(soc_map)
is_mappable = mapped_soc.notnull()

unmappable = set(unmatched.loc[~is_mappable, 'soc'])

# Fresh 0..n-1 index, matching a frame built from a list of records.
rows = pd.DataFrame({
    'soc': mapped_soc[is_mappable],
    'title': unmatched.loc[is_mappable, 'title'],
}).reset_index(drop=True)

print(len(unmappable), 'unmappable', unmappable)

titles = pd.concat([remain_titles, rows]).sort_values('soc')
len(titles)

2 unmappable {'17-3029.09', '11-9039.01'}


58670

In [5]:
# 'Board of Directors' and 'Board Member' are added by hand and attached to
# 11-1011.00 (Chief Executives).
board_titles = pd.DataFrame({
    'soc': ['11-1011.00', '11-1011.00'],
    'title': ['Board of Directors', 'Board Member'],
})

titles = pd.concat([titles, board_titles]).sort_values('soc')

len(titles)


58672

In [6]:
def embed_and_cache(rc, key, df, text_col='text', progress=True, force=False, n_jobs=1):
    """Return the embeddings for ``df``, computing them only when needed.

    The value returned by ``run_embeddings`` is stored in ``rc`` under ``key``.
    Later calls with the same key return the stored value without recomputing.

    Args:
        rc: Cache object supporting ``key in rc``, ``rc[key]`` and
            ``rc[key] = value``.
        key: Cache key under which the embeddings are stored.
        df: Data handed to ``run_embeddings``; not touched on a cache hit.
        text_col: Passed through to ``run_embeddings`` as ``text_col``.
        progress: Passed through to ``run_embeddings`` as ``progress``.
        force: When truthy, ignore any cached value, recompute and overwrite
            it. Any falsy value (``False``, ``0``, ``None``) uses the cache.
        n_jobs: Passed through to ``run_embeddings`` as ``n_jobs``; the
            default of 1 is the value that used to be hard-coded.

    Returns:
        The cached value for ``key`` or, on a miss or forced run, the fresh
        result of ``run_embeddings``.
    """
    # Truthiness test rather than `force is False`, so that force=0 or
    # force=None does not silently trigger an expensive recompute.
    if not force and key in rc:
        return rc[key]

    edf = run_embeddings(df, n_jobs=n_jobs, text_col=text_col, progress=progress)
    rc[key] = edf
    return edf

if False:

    # Disabled on purpose: switch on only if the title embeddings are needed.
    titles_edf = embed_and_cache(rc, 'titles', titles, text_col='title')

    # Spread the 'embeddings' column into its own DataFrame, then label the
    # resulting columns e0, e1, ... by position.
    embedding_rows = titles_edf['embeddings'].tolist()
    edf = pd.DataFrame(embedding_rows)
    edf.columns = [f'e{pos}' for pos in range(edf.shape[1])]

    edf

In [7]:
# Sanity check: every remaining title must belong to a SOC code that exists
# in the main occupation table.
del_titles = titles[~titles['soc'].isin(occ_titles['soc'].unique())]
assert len(del_titles) == 0, (
    f"{len(del_titles)} titles still use SOC codes missing from occ_titles: "
    f"{del_titles['soc'].unique().tolist()}"
)