In [1]:
%load_ext autoreload
%autoreload 2

import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
import os
from tqdm.auto import tqdm
import re 

from researchrobot import cache_dl, require_version, ObjectStore, oscache
from researchrobot.openai import openai_one_completion
from researchrobot.embeddings import run_embeddings

from joblib import Parallel, delayed

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

# Set RESEARCH_ROBOT_DEFAULT_CACHE for this kernel (presumably read by
# researchrobot to locate its default cache — confirm).
%env RESEARCH_ROBOT_DEFAULT_CACHE ./cache
# Store used below for the cached frames ('ot_df' embeddings, 'data/psoc').
os_config = dict(class_='LocalLargeObjectStore', bucket='linkedin', path='cache')
rc = ObjectStore.new(**os_config)

# Second store, used for the model responses. The config dict (not the store
# object) is what gets passed to run_rewrite, which opens its own store from it.
cfs_config = dict(class_='FSObjectStore', bucket='linkedin', path='cache_ns')
rc_ns =  ObjectStore.new(**cfs_config)


env: RESEARCH_ROBOT_DEFAULT_CACHE=./cache


In [2]:
# Open the source package; the commented-out line is the alternative loader.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# NOTE(review): set_sys_path() presumably makes the package's `pylib` module
# importable — confirm. The star import hides which names later cells rely on;
# consider importing them explicitly.
pkg.set_sys_path()
from pylib import *

pkg

In [3]:
# Load the O*NET occupation and task reference tables and combine them into one
# row per occupation (SOC code, title, description, list of tasks).
occ_df = pkg.reference('onet_occupations').dataframe()
occ_df.columns = ['soc','title','desc']

task_df = pkg.reference('onet_tasks').dataframe()
task_df.columns = ['soc', 'title', 'task_id', 'task', 'task_type','responses', 'date', 'source']

# Collapse the task table to one row per SOC code holding the list of its tasks.
d  = pd.DataFrame([  {'soc':gn, 'tasks':g.task.to_list()} for gn, g in task_df.groupby('soc') ])

# merge() defaults to an inner join, so occupations without any task rows are dropped.
ot_df = occ_df.merge(d, on='soc')

def make_text(r):
    """Render one occupation row as a plain-text block.

    The block is the row's ``title``, a blank line, its ``desc``, a blank
    line, then one ``  * ``-prefixed line per entry of ``tasks``, followed
    by a trailing newline.
    """
    bullet_lines = '\n'.join('  * ' + task for task in r.tasks)
    return f"{r.title}\n\n{r.desc}\n\n{bullet_lines}\n"
    
# Render the text block for every occupation row, then preview the first rows.
ot_df['text'] = ot_df.apply(make_text, axis=1)

ot_df.head()

Unnamed: 0,soc,title,desc,tasks,text
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,[Direct or coordinate an organization's financ...,Chief Executives\n\nDetermine and formulate po...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",[Monitor and evaluate effectiveness of sustain...,Chief Sustainability Officers\n\nCommunicate a...
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...","[Review financial statements, sales or activit...","General and Operations Managers\n\nPlan, direc..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes...",[Analyze and understand the local and national...,"Legislators\n\nDevelop, introduce, or enact la..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",[Plan and prepare advertising and promotional ...,"Advertising and Promotions Managers\n\nPlan, d..."


In [4]:
def embed_and_cache(rc, key, df, force=False, progress=False, n_jobs=4, **embed_kwargs):
    """Return the embedded version of ``df``, computing it only when needed.

    Args:
        rc: Cache supporting ``key in rc``, ``rc[key]`` and ``rc[key] = value``.
        key: Cache key under which the embedded frame is stored.
        df: Frame handed to ``run_embeddings`` on a cache miss.
        force: When truthy, recompute and overwrite even if ``key`` is cached.
        progress: Forwarded to ``run_embeddings``.
        n_jobs: Forwarded to ``run_embeddings``.
        **embed_kwargs: Extra keyword arguments forwarded unchanged to
            ``run_embeddings`` (for example ``text_col``). They are not part
            of the cache key, so use a distinct ``key`` per configuration.

    Returns:
        The cached value for ``key``, or the fresh ``run_embeddings`` result.
    """
    # Truthiness test instead of ``force is False``: the identity check treated
    # falsy non-bool values such as 0 or None as "forced" and recomputed.
    if key in rc and not force:
        return rc[key]

    edf = run_embeddings(df, n_jobs=n_jobs, progress=progress, **embed_kwargs)
    rc[key] = edf
    return edf

In [5]:
# Embed the occupation frame; a value already cached under 'ot_df' is reused.
%time ot_edf = embed_and_cache(rc, 'ot_df', ot_df, progress=True)

CPU times: user 12.3 ms, sys: 14.1 ms, total: 26.4 ms
Wall time: 28.7 ms


In [6]:
# Preview the embedded occupation frame.
ot_edf.head()

Unnamed: 0,soc,title,desc,tasks,text,embeddings
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,[Direct or coordinate an organization's financ...,Chief Executives\n\nDetermine and formulate po...,"[-0.004474063729759382, -0.0006731285037611346..."
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",[Monitor and evaluate effectiveness of sustain...,Chief Sustainability Officers\n\nCommunicate a...,"[-0.0033432542393099667, -0.005450184226689635..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...","[Review financial statements, sales or activit...","General and Operations Managers\n\nPlan, direc...","[-0.01695389303068825, -0.013382219729063184, ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes...",[Analyze and understand the local and national...,"Legislators\n\nDevelop, introduce, or enact la...","[-0.009759281621141966, 0.017985621207744613, ..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",[Plan and prepare advertising and promotional ...,"Advertising and Promotions Managers\n\nPlan, d...","[-0.02011748953777268, -0.006563918637882019, ..."


In [7]:
%%time
# Load some profiles from minio on Barker. These profiles are previously classified, so this notebook
# can't really be run from scratch. 

from random import seed, choices, shuffle
from researchrobot import cache_dl, require_version, ObjectStore, oscache
from researchrobot.cmech.cache import * 

psoc_key = 'data/psoc'

if  psoc_key in rc:
    psoc = rc[psoc_key]
    
else:
    rc2 = ObjectStore.new(name='barker_minio', bucket='linkedin')
    q = get_classification_queues(rc2, version = 3)

    mkeys = list(q.parts.list('matches'))
    seed(0)
    shuffle(mkeys)


    # Load all of the matches and compile the SOC codes and embeddings
    frames = []
    n = 0
    for mk in tqdm(mkeys):
        o = q.parts[mk]
        n += len(o)
        frames.append(o[['title','summary', 'soc','embeddings']].copy())

        if n > 1_000_000:
            break 

    psoc = pd.concat(frames) # profile SOC matches
    
    # Take a sample of up to 50 profiles from each SOC
    frames = []
    for gn , g in  psoc.groupby('soc'):
        frames.append(g.sample(min(len(g), 50)))
    
    rc[psoc_key] = psoc = pd.concat(frames)


CPU times: user 211 ms, sys: 486 ms, total: 698 ms
Wall time: 709 ms


In [8]:
def prompt_matchstyle(experiences, onet_title, onet_desc, onet_tasks):
    """Fill the rewrite-prompt template with one job description and a set of resume experiences.

    Args:
        experiences: Text inserted under the "Resume Experiences" heading. The
            template tells the model to expect 5 experiences.
        onet_title: Inserted after "Job Title:".
        onet_desc: Inserted after "Duties :".
        onet_tasks: Inserted under "Tasks:".

    Returns:
        The prompt string. It instructs the model to print a '<response>'
        separator line before the rewrites and to separate the rewrites with
        lines containing "<sep>".

    NOTE(review): the template wording has typos and refers to both
    "Example of Style" and "Resume Experience". It is left untouched here
    because any change to the text changes what is sent to the model.
    """

    return f"""
Rewrite descriptions of a job experience from user's resumes ( "Resume Experience" ) 
to better match a formal job description. The Resume Experience entries are not well
written and do not describe the job very well. You will rewrite them, maintaining the
same style and tone, but with more detail, drawn from the Duties and Tasks listed in
the formal Job Description.

For instance,  if the Resume Experience is very short and uninformative, your 
rewrite should also be short and uninformative. If the Resume Experience does 
not use a bullet list for tasks,  your re-write should not use a bullet list. 

You will be given 5 Resume Experiences, so you will produce 5 rewritten experiences. 

Wrap text to 72 characters. Do not include more than 5 tasks in your re-written description. 

Important Points
----------------

1. Include 1 to 5 of the Job description Tasks in your rewrite. Rewrite the tasks to match
the style of the Resume Experience
2. Adjust the text of the Resume Experience to be noticably similar to the Job Description. 
3. You rewrite is a cross between the Job Description and the Resume Experience. It should have 
elements from both. 

Instructions
------------
1. First, analyze the job description, pay attention to the job duties and the tasks
2. Second, describe the basic structure of the Example of Style
3. Print a separator line, '<response>'
4. For each example of style, re-write the Resume Experiences using you analysis of the style, 
tone and structure from the Resume Experience, but with more of the Duties and Tasks 
from the Job Description. 
5. Your rewrite should include idea, phrases and words from both the Resume Experience and the
Job Description.
6. Start each rewrite with a job title, which is also re-written to be more similar to the 
Job Description job title. 
7. Seperate each of the rewritten Resume Experience with a single line containing "<sep>"

Job Description
===============

Job Title: {onet_title}

Duties : {onet_desc}

Tasks:

{onet_tasks}

Resume Experiences
==================

{experiences}

"""

# Index the sampled profiles by SOC code; they are grouped per SOC when the prompts are built.
profs = psoc.set_index('soc')

In [9]:
# Create the prompts

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Returns the number of tokens in a text string.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence produced by encoding ``string``.
    """
    # The docstring must be the first statement to be picked up as ``__doc__``;
    # it previously followed this import and was therefore just a discarded string.
    import tiktoken

    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))
  
from random import Random
from textwrap import fill

N_EXPERIENCES = 5   # the prompt template tells the model to expect exactly 5 experiences
N_TASKS = 10        # at most this many job-description tasks are listed per prompt
TASK_WRAP = 72      # wrap width for the task bullets
PROMPT_SEED = 0     # makes the profile and task selection repeatable across runs

# Private generator so the task selection does not depend on the global
# `random` state left behind by earlier cells.
prompt_rng = Random(PROMPT_SEED)

# Occupation lookup by SOC code; loop-invariant, so built once.
soc_map = ot_df.set_index('soc')

prompts = []

for gn, g in profs.groupby('soc'):

    # Draw the experiences for this SOC; sample with replacement only when the
    # group has fewer rows than needed.
    rexp = g.sample(N_EXPERIENCES, replace=len(g) < N_EXPERIENCES, random_state=PROMPT_SEED)

    jd = soc_map.loc[gn]

    # Numbered, wrapped experience entries separated by blank lines.
    experiences = ''.join(
        fill(f"{i}) {r.title}. {r.summary}") + "\n\n"
        for i, (idx, r) in enumerate(rexp.iterrows(), 1)
    )

    # Pick distinct tasks: random.choices() draws with replacement and could
    # list the same task several times in one prompt.
    tasks = list(jd.tasks)
    tasks = prompt_rng.sample(tasks, k=min(N_TASKS, len(tasks)))

    tasks_str = ''.join(fill('  * ' + t, TASK_WRAP) + "\n" for t in tasks)

    args = {'experiences':experiences, 'onet_title':jd.title, 'onet_desc':jd.desc, 'onet_tasks':tasks_str }

    prompts.append( {'soc':gn, 'profiles':rexp, 'args': args, 'ntok': num_tokens_from_string(prompt_matchstyle(**args)) } )
                        

In [27]:
def run_rewrite(cfs_config, e, model='gpt-4'):
    """Generate and cache the model response for one prompt record.

    Args:
        cfs_config: Keyword arguments for ``ObjectStore.new``; the store is
            opened here rather than passed in.
        e: Prompt record with at least ``'soc'`` and ``'args'`` (the keyword
            arguments for ``prompt_matchstyle``).
        model: Model name passed to ``openai_one_completion``.

    Returns:
        ``(False, key)`` if a response is already stored under ``key``,
        otherwise ``(True, key)`` after the new response has been stored.
    """
    # Function-level import kept from the original; presumably so the name
    # resolves inside joblib worker processes — confirm.
    from researchrobot.openai import openai_one_completion

    rc_ns = ObjectStore.new(**cfs_config)

    key = f"responses/{e['soc']}"

    if key in rc_ns:
        return (False, key)

    prompt = prompt_matchstyle(**e['args'])
    response = openai_one_completion(prompt, model=model)

    # Store a copy carrying the response instead of mutating the caller's dict.
    rc_ns[key] = {**e, 'response': response}

    return (True, key)
 

In [28]:
# Prompts whose response is not yet stored in rc_ns; only these are submitted.
remain_prompts = [ p for p in prompts if f"responses/{p['soc']}" not in rc_ns ]
len(remain_prompts)

25

In [33]:
# Run the outstanding rewrites with 4 joblib jobs. Each call returns (ran, key),
# where ran is False if the response was already stored.
results = Parallel(n_jobs=4)(delayed(run_rewrite)(cfs_config, e) for e in tqdm(remain_prompts))

  0%|          | 0/25 [00:00<?, ?it/s]

In [34]:
# Compile the results and look for bad records. 

def compile_responses():
    """Parse every stored model response into a rewrites frame and a record list.

    For each key listed under ``'responses'`` in ``rc_ns``, the response text
    after the first ``<response>`` marker is split on ``<sep>``. Each piece is
    cleaned and paired positionally with the profiles stored in the record.

    Returns:
        tuple: ``(rewrites, recs)``.

        * ``rewrites``: concatenation of the per-record profile frames
          (``embeddings`` dropped; ``rewrite``, ``mismatch`` and
          ``cache_path`` added), with the index reset.
        * ``recs``: the loaded records, each updated with ``'rewrites'``,
          ``'profiles'`` and ``'cache_path'``.

    Raises:
        ValueError: from ``pd.concat`` when no responses are stored.
    """
    recs = []
    frames = []

    # GPT4 is occasionally adding extra headings in the response
    heading_pat = r'Rewrit.*([=-]{4,})'
    # "1)"-style numbering. Anchored to line starts: the unanchored r'\d\)'
    # also ate text such as the "9)" of "(2019)" in the middle of a rewrite.
    numbering_pat = r'^[ \t]*\d+\)'

    for cache_path in rc_ns.list('responses'):

        o = rc_ns[cache_path]

        # Everything before the first '<response>' is the model's analysis.
        # partition() also copes with a repeated marker, where two-way
        # unpacking of split() failed and kept the analysis in the rewrites.
        _, marker, response = o['response'].partition('<response>')
        if not marker:
            response = o['response']

        texts = []
        for chunk in response.split('<sep>'):
            chunk = re.sub(heading_pat, '', chunk, flags=re.DOTALL).strip()
            chunk = re.sub(numbering_pat, '', chunk, flags=re.MULTILINE).strip()
            texts.append(chunk)

        # The record keeps the full list, even if it is truncated below.
        o['rewrites'] = texts

        p = o['profiles'].drop(columns=['embeddings'])

        mismatch = len(texts) != len(p)
        if mismatch:
            # There is a mismatch in number, so fake it: pair up as many as
            # both sides have and flag the rows.
            n_keep = min(len(p), len(texts))
            p = p[:n_keep]
            texts = texts[:n_keep]

        p = p.assign(rewrite=texts, mismatch=mismatch)

        o['profiles'] = p.assign(cache_path=cache_path)
        o['cache_path'] = cache_path

        recs.append(o)
        frames.append(o['profiles'])

    return pd.concat(frames).reset_index(), recs
    

# Parse all stored responses into the rewrites frame and the list of records.
rewrites, recs  = compile_responses()


# Bad responses are not repaired by hand: the code below deletes them from the
# store, and re-running the notebook regenerates the missing ones.

def is_bad(r):
    """Report whether the row's ``rewrite`` text contains 'Example of Style'.

    That phrase appears in the prompt's instructions, so finding it in a
    rewrite presumably means the model echoed instructions instead of
    producing a clean rewrite.
    """
    marker = 'Example of Style'
    text = r.rewrite
    return marker in text



# Cache keys of responses with at least one row that either trips is_bad or was
# flagged as a count mismatch by compile_responses.
bad = list(rewrites[(rewrites.apply(is_bad,axis=1)) | (rewrites.mismatch) ].cache_path.unique())

print(len(set(bad)), 'bad records')

if len(bad) > 0:
    # Destructive: drop the bad responses from the store, then rebuild the
    # frame from what is left.
    for key in set(bad):
        del rc_ns[key]

    rewrites, recs  = compile_responses()

# Persist the compiled results next to the raw responses.
rc_ns['data/rewrites'] = rewrites
rc_ns['data/records'] = recs

len(rewrites)

0 bad records


4615

In [35]:
# Embed the rewrites (text_col='rewrite' presumably selects the column to embed — confirm).
# NOTE(review): unlike the occupation frame, this result is not stored through
# embed_and_cache, so it is recomputed on every run.
rewrites_embd = run_embeddings(rewrites, progress=True, text_col='rewrite')

embedding 0:   0%|          | 0/24 [00:00<?, ?it/s]

In [36]:
# Preview the embedded rewrites.
rewrites_embd.head()

Unnamed: 0,soc,title,summary,rewrite,mismatch,cache_path,embeddings
0,53-2022.00,armour reconnaissance commander,Worked and trained as a Sergeant in the Singap...,Airfield Operations Sergeant. As a leader in t...,False,responses/53-2022.00,"[-0.018098911442292063, -0.019834604302561887,..."
1,53-2022.00,usaf,Luke AFB was my last assignment from September...,Airfield Operations Specialist at USAF. I was ...,False,responses/53-2022.00,"[-0.03129883314331891, -0.011225900007625016, ..."
2,53-2022.00,manager of state airports,Implemented scheduled air carrier service betw...,State Airports Manager. Managed multiple airpo...,False,responses/53-2022.00,"[-0.014473030065163572, 0.0019780241038021007,..."
3,53-2022.00,specialist,Worked in a Supply Transportaion Unit 11th ACR...,Supply Transportation Specialist. My responsib...,False,responses/53-2022.00,"[-0.02199234334121973, -0.031722285999662696, ..."
4,53-2022.00,clinical manager,"Assignments include: Lackland AFB, TX, Sheppar...",Clinical Manager in Airfield Operations. My as...,False,responses/53-2022.00,"[-0.022080293475614975, -0.00654194427196879, ..."
