In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

# Notebook setup: render matplotlib figures inline and use seaborn's 'notebook' plotting context.
%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

# Set the environment variable before the researchrobot imports below.
# NOTE(review): presumably researchrobot reads this as its default cache location — confirm.
%env RESEARCH_ROBOT_DEFAULT_CACHE ./_cache

from researchrobot.objectstore import ObjectStore

# Object cache using the filesystem directly.
# The same '_cache' directory is named here and in the %env line above.
cfs_config = dict(class_='FSObjectStore', bucket='linkedin', path='_cache')
rc =  ObjectStore.new(**cfs_config)

from researchrobot.embeddings import run_embeddings


env: RESEARCH_ROBOT_DEFAULT_CACHE=./_cache


In [2]:
# Open the source package; the alternative open_package() call is kept commented out below.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# NOTE(review): presumably this puts the package directory on sys.path so the
# `pylib` import below resolves — confirm against metapack.
pkg.set_sys_path()
# NOTE(review): the star import hides which names pylib contributes to this
# notebook; consider importing the needed names explicitly.
from pylib import *

# Show the package as the cell output.
pkg

In [3]:
# Load the raw CIP table from the package's 'cip_source' reference.
cip_df = pkg.reference('cip_source').dataframe() 

# Keep only rows whose Action is neither 'Deleted' nor 'Moved from'.
t = cip_df[~cip_df.Action.isin(['Deleted','Moved from'])]

def clean_family(v):
    """Drop the first character of ``v`` and trim double quotes from both ends.

    For example ``'="01"'`` becomes ``'01'``.
    """
    without_prefix = v[1:]
    return without_prefix.strip('"')

# Length of each cleaned CIPCode string; the frame built further down maps
# lengths 2 / 5 / 7 to the 'family' / 'discipline' / 'program' categories.
lens = t.CIPCode.apply(clean_family).apply(len)


def clean_code(v):
    """Normalize a raw CIP code into a dotted, zero-padded string.

    The first character is dropped and surrounding double quotes are removed.
    A trailing '.' is appended when the code has no dot, and the result is
    right-padded with '0' to at least seven characters,
    e.g. ``'="01"'`` -> ``'01.0000'``.
    """
    code = v[1:].strip('"')
    dotted = code if '.' in code else code + '.'
    return dotted.ljust(7, '0')

# Assemble the cleaned CIP table from the filtered source rows in `t`.
# 'code' is dotted and zero-padded by clean_code; 'title' has leading/trailing
# periods removed; 'cat' comes from the cleaned code length (lengths other
# than 2, 5 or 7 are not in the mapping and so become NaN).
cip_df = pd.DataFrame({
    'family': t.CIPFamily.apply(clean_family),
    'code': t.CIPCode.apply(clean_code),
    'title': t.CIPTitle.str.strip('.'), 
    'desc': t.CIPDefinition,
    'cat': lens.map({2: 'family', 5: 'discipline', 7: 'program'})
})

import re

def clean_desc(v):
    """Strip a boilerplate lead-in from a CIP description and capitalize the rest.

    A single leading phrase from the list in the pattern below (for example
    "A program that" or "Includes instruction in") is removed together with
    the whitespace after it. The remainder is stripped of surrounding
    whitespace and its first character is upper-cased.

    Args:
        v: The description text.

    Returns:
        The cleaned description. An empty string is returned when nothing is
        left after stripping (empty input, or input that is only a lead-in).
    """
    # Alternatives are tried left to right, so the bare "A program that" wins
    # over the longer "A program that focuses on" / "... prepares individuals
    # to" / "... generally prepares individuals to" entries. Those inputs keep
    # their verb, e.g. "Focuses on ...".
    v = re.sub(
        r'^(A program that|Any instructional program|A program of study which|A residency or fellowship program '
        r'for|A program in|A program that focuses on|A program that prepares individuals to|Includes instruction '
        r'in|A program that generally prepares individuals to|A research program that focuses on|A fellowship '
        r'training program that prepares physicians|Requires prior completion of a residency program in|This CIP '
        r'code is not valid for IPEDS reporting.)\s*', '', v).strip()

    # Nothing left to capitalize; indexing v[0] would raise IndexError here.
    if not v:
        return v

    # Capitalize the first letter
    return v[0].upper() + v[1:]

# Remove boilerplate lead-ins from every description.
cip_df['desc'] = cip_df['desc'].apply(clean_desc)

# Row selectors, computed once; 'cat' is not modified anywhere below.
discipline_rows = cip_df['cat'] == 'discipline'
family_program_rows = cip_df['cat'].isin(('family','program'))

# Discipline rows: the title serves as both the description and the text.
cip_df.loc[discipline_rows, 'desc'] = cip_df.loc[discipline_rows, 'title']
cip_df.loc[discipline_rows, 'text'] = cip_df.loc[discipline_rows, 'title']

# Family and program rows: text is "<title>. <desc>".
fp = cip_df.loc[family_program_rows]
cip_df.loc[family_program_rows, 'text'] = fp['title'] + ". " + fp['desc']

# The discipline key is the first five characters of the padded code.
cip_df['discipline'] = cip_df['code'].str[:5]

# Fix the column order, then preview the first rows.
cip_df = cip_df[['code','cat','family','discipline','title','desc','text']]

cip_df.head()

Unnamed: 0,code,cat,family,discipline,title,desc,text
0,1.0,family,1,1.0,AGRICULTURAL/ANIMAL/PLANT/VETERINARY SCIENCE A...,Instructional programs that focus on agricultu...,AGRICULTURAL/ANIMAL/PLANT/VETERINARY SCIENCE A...
1,1.0,discipline,1,1.0,"Agriculture, General","Agriculture, General","Agriculture, General"
2,1.0,program,1,1.0,"Agriculture, General",Focuses on the general principles and practice...,"Agriculture, General. Focuses on the general p..."
3,1.01,discipline,1,1.01,Agricultural Business and Management,Agricultural Business and Management,Agricultural Business and Management
4,1.0101,program,1,1.01,"Agricultural Business and Management, General",A general program that focuses on modern busi...,"Agricultural Business and Management, General...."


In [4]:
# Load the crosswalk table from the package's 'xwalk_source' reference and
# give its four columns short names (cip / soc codes with their titles).
xw_df = pkg.reference('xwalk_source').dataframe()
xw_df.columns = ['cip', 'cip_title', 'soc', 'soc_title']
xw_df

Unnamed: 0,cip,cip_title,soc,soc_title
0,01.0000,"Agriculture, General.",19-1011,Animal Scientists
1,01.0000,"Agriculture, General.",19-1012,Food Scientists and Technologists
2,01.0000,"Agriculture, General.",19-1013,Soil and Plant Scientists
3,01.0000,"Agriculture, General.",19-4012,Agricultural Technicians
4,01.0000,"Agriculture, General.",25-1041,"Agricultural Sciences Teachers, Postsecondary"
...,...,...,...,...
6092,99.9999,NO MATCH,55-2012,First-Line Supervisors of Weapons Specialists/...
6093,99.9999,NO MATCH,55-2013,First-Line Supervisors of All Other Tactical O...
6094,99.9999,NO MATCH,55-3011,Air Crew Members
6095,99.9999,NO MATCH,55-3013,Armored Assault Vehicle Crew Members


In [5]:
def embed_and_cache(rc, key, df, force=False, progress=False, n_jobs=4):
    """Return the embeddings stored under ``key``, computing and storing them if needed.

    Args:
        rc: Cache object supporting ``key in rc``, ``rc[key]`` and
            ``rc[key] = value``.
        key: Cache key for the embeddings.
        df: Input passed to ``run_embeddings`` on a cache miss or forced refresh.
        force: When truthy, recompute and overwrite even if ``key`` is cached.
        progress: Forwarded to ``run_embeddings``.
        n_jobs: Forwarded to ``run_embeddings``.

    Returns:
        The cached value, or the freshly computed result of ``run_embeddings``.
    """
    # Use truthiness rather than `force is False`: the identity test treated
    # falsy values such as 0, None or numpy's False_ as a request to recompute.
    if not force and key in rc:
        return rc[key]

    edf = run_embeddings(df, n_jobs=n_jobs, progress=progress)
    rc[key] = edf
    return edf

# Two frames of (code, text, type), one per text source, so that titles and
# descriptions share the same column layout.
titles = cip_df[['code','title']].rename(columns={'title':'text'}).assign(type='title')
# The selected column is 'desc', so that is the key to rename. Renaming 'title'
# here was silently ignored and left the column named 'desc' instead of 'text'.
desc = cip_df[['code','desc']].rename(columns={'desc':'text'}).assign(type='desc')

In [6]:
# Preview the title frame built in the previous cell.
titles

Unnamed: 0,code,text,type
0,01.0000,AGRICULTURAL/ANIMAL/PLANT/VETERINARY SCIENCE A...,title
1,01.0000,"Agriculture, General",title
2,01.0000,"Agriculture, General",title
3,01.0100,Agricultural Business and Management,title
4,01.0101,"Agricultural Business and Management, General",title
...,...,...,...
2843,61.2801,Urology Residency Program,title
2844,61.2802,Pediatric Urology Fellowship Program,title
2845,61.2899,"Urology Residency/Fellowship Programs, Other",title
2846,61.9900,"Medical Residency/Fellowship Programs, Other",title


In [7]:
# Spot-check a random sample of titles; the fixed seed keeps the sample
# identical across kernel restarts and re-runs.
titles.sample(20, random_state=0)

Unnamed: 0,code,text,type
2712,61.0903,Laboratory Genetics and Genomics Residency Pro...,title
1422,40.0404,Meteorology,title
2748,61.1699,Osteopathic Medicine Residency/Fellowship Prog...,title
782,16.1407,Thai Language and Literature,title
588,14.4401,Engineering Chemistry,title
2543,60.0727,Neurology Nurse Practitioner Residency/Fellows...,title
622,15.05,Environmental Control Technologies/Technicians,title
1471,41.0204,Industrial Radiologic Technology/Technician,title
1684,47.01,Electrical/Electronics Maintenance and Repair ...,title
1727,47.0703,Solar Energy System Installation and Repair Te...,title
