In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
import os
from tqdm.auto import tqdm
import re 
import json 

from researchrobot import cache_dl, require_version, ObjectStore, oscache
from researchrobot.openai import openai_one_completion
from researchrobot.embeddings import run_embeddings

from joblib import Parallel, delayed

# Plotting setup: render matplotlib figures inline and use seaborn's
# 'notebook' scaling context.
%matplotlib inline
sns.set_context('notebook')
# NOTE(review): metapack's notebook initialisation; its effects are not
# visible from this notebook.
mp.jupyter.init()

# Presumably read by researchrobot as its default cache location — confirm.
%env RESEARCH_ROBOT_DEFAULT_CACHE ./_cache

# Object store that the cells below use as the embedding cache (`rc`).
# The keyword arguments are forwarded unchanged to ObjectStore.new(); their
# meaning (class_, bucket, path) is defined by researchrobot.
# TODO confirm that path='_cache' is meant to match the ./_cache value above.
cfs_config = dict(class_='FSObjectStore', bucket='linkedin/build', path='_cache')
rc =  ObjectStore.new(**cfs_config)


env: RESEARCH_ROBOT_DEFAULT_CACHE=./_cache


In [2]:
# Open the data package through metapack's notebook helper.
# NOTE(review): the commented-out call below is an alternative opener; how it
# differs from open_source_package() is not visible here — confirm or delete.
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()

# Presumably extends sys.path so that the package-local `pylib` module can be
# imported — verify against metapack.
pkg.set_sys_path()
# NOTE(review): the star import hides which names come from pylib; prefer
# explicit imports once the names actually used are known.
from pylib import *

# Show the package's rich representation as the cell output.
pkg

In [3]:
# Assemble one row per occupation: SOC code, title, description and the list
# of task statements recorded for that occupation.
occ_df = pkg.reference('onet_occupations_source').dataframe()
occ_df.columns = ['soc', 'title', 'desc']

task_df = pkg.reference('onet_tasks_source').dataframe()
task_df.columns = [
    'soc', 'title', 'task_id', 'task',
    'task_type', 'responses', 'date', 'source',
]

# Collapse the task table to a single row per SOC code holding all of that
# occupation's task statements as a list.
d = pd.DataFrame(
    [
        {'soc': soc_code, 'tasks': soc_tasks['task'].to_list()}
        for soc_code, soc_tasks in task_df.groupby('soc')
    ]
)

# Default (inner) merge: only occupations that have tasks are kept.
ot_df = pd.merge(occ_df, d, on='soc')

def make_text(r):
    """Render one occupation row as a plain-text document.

    The result is the title, a blank line, the description, a blank line,
    then one ``  * ``-prefixed line per task, terminated by a newline.

    Args:
        r: Row-like object exposing ``title``, ``desc`` and ``tasks``
            (an iterable of strings) as attributes.

    Returns:
        str: The assembled text.
    """
    bullet_lines = '\n'.join('  * ' + task for task in r.tasks)
    return f"{r.title}\n\n{r.desc}\n\n{bullet_lines}\n"
    
# Build the text document for every occupation row (row-wise apply).
ot_df['text'] = ot_df.apply(make_text, axis=1)

# Preview the first rows as the cell output.
ot_df.head()

Unnamed: 0,soc,title,desc,tasks,text
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,[Direct or coordinate an organization's financ...,Chief Executives\n\nDetermine and formulate po...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",[Monitor and evaluate effectiveness of sustain...,Chief Sustainability Officers\n\nCommunicate a...
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...","[Review financial statements, sales or activit...","General and Operations Managers\n\nPlan, direc..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes...",[Analyze and understand the local and national...,"Legislators\n\nDevelop, introduce, or enact la..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",[Plan and prepare advertising and promotional ...,"Advertising and Promotions Managers\n\nPlan, d..."


In [4]:
def embed_and_cache(rc, key, df, force=False, progress=False, n_jobs=4):
    """Return embeddings for ``df``, reusing the copy stored in ``rc`` if present.

    Args:
        rc: Mapping-like cache supporting ``key in rc``, ``rc[key]`` and
            ``rc[key] = value``.
        key: Cache key under which the embeddings are stored.
        df: Input handed to ``run_embeddings`` on a cache miss.
        force: When truthy, recompute and overwrite any cached entry.
        progress: Forwarded to ``run_embeddings``.
        n_jobs: Forwarded to ``run_embeddings``.

    Returns:
        The cached value, or the fresh result of ``run_embeddings``.
    """
    # Use truthiness rather than `force is False`: the identity test treated
    # falsy non-bool flags (0, None, numpy.bool_(False)) as a request to
    # recompute, silently bypassing the cache.
    if not force and key in rc:
        return rc[key]

    edf = run_embeddings(df, n_jobs=n_jobs, progress=progress)
    rc[key] = edf
    return edf

In [5]:
# Embed the occupation texts, or reuse the result stored in the cache under
# the key 'ot_df'; %time reports how long the call took.
%time ot_edf = embed_and_cache(rc, 'ot_df', ot_df, progress=True)

CPU times: user 9.55 ms, sys: 13.4 ms, total: 22.9 ms
Wall time: 22.1 ms


In [6]:
# Serialise each embedding vector to a JSON string. The column is overwritten
# in place, so values that are already strings are passed through unchanged;
# without that guard, re-running this cell raises
# AttributeError: 'str' object has no attribute 'tolist'.
ot_edf['embeddings'] = ot_edf.embeddings.apply(
    lambda v: v if isinstance(v, str) else json.dumps(v.tolist())
)

0      [-0.004486817514321507, -0.0031944715704036036...
1      [-0.003343254137093117, -0.005450184146155375,...
2      [-0.01732268411230562, -0.013479049633007011, ...
3      [-0.009923376172845812, 0.017930787110652154, ...
4      [-0.01950459439859871, -0.006462327289510824, ...
                             ...                        
918    [-0.011252905076654291, -0.004154553575809618,...
919    [-0.02303490925883954, -0.008605153815915749, ...
920    [-0.009864054096123403, 0.01190944325226033, 0...
921    [-0.012732554579237188, -0.013722270102936697,...
922    [-0.00417200721036109, -0.006684073443664963, ...
Name: embeddings, Length: 923, dtype: object