# Data Preparation and Processing
This notebook contains code for preparing and processing O*NET data.


## Imports

In [1]:
# Import necessary libraries.
import pandas as pd
import numpy as np

## Constants

In [2]:
# Define constants used in the notebook.
BASE_URL = 'https://www.onetcenter.org/dl_files/database/db_29_1_text/'

# Columns to Keep
COLUMNS_KEEP_MEASUREMENTS = ["ONET_SOC_CODE", "ELEMENT_ID", "SCALE_ID", "JOB_ZONE",
                            "TASK_TYPE", "TASK_ID", "CATEGORY", "DATA_VALUE", 
                            "RECOMMEND_SUPPRESS", "NOT_RELEVANT", "COMMODITY_CODE",
                            "HOT_TECHNOLOGY", "IN_DEMAND"]

## Helper Functions

In [121]:
# Define helper functions for data retrieval and preparation.
# Prepare the data
def prepare_data(df):
    df = df.copy()
    # Rename the columns to be all caps and no spaces
    df.columns = [col.upper().replace(' ', '_').replace('-', '_') for col in df.columns]
    # Remove the "*" from O*NET
    df.columns = [col.replace('*', '') for col in df.columns]
    return df

# Get the data
def get_data( data_set_name ):
    data_set_name = data_set_name.replace(' ', '%20').replace(',', '%2C')
    data_url = BASE_URL + data_set_name + '.txt'
    df = pd.read_csv( data_url, sep='\t' )
    df = prepare_data(df)    
    return df


def process_measurements(df):

    if type(df) == str:
        df = get_data(df)
    
    if ("ELEMENT_ID" not in df.columns):
        return df[[c for c in  COLUMNS_KEEP_MEASUREMENTS if c in df.columns]]

    if df.ELEMENT_ID.nunique() > 1:
        df_wide = {}
        for group, data in df.groupby('ELEMENT_ID'):
            if 'CATEGORY' in data.columns:
                if len( data.CATEGORY.unique() ) > 1:
                    data['CATEGORY'] = data['CATEGORY'].fillna("")
                    data['SCALE_ID'] = data['SCALE_ID'] + data['CATEGORY'].apply(lambda x: f"_{int(x):02d}" if x else "")
            df_wide[group] = process_measurements(data) 
            
        return df_wide
    
    else:
        df = df[[c for c in  COLUMNS_KEEP_MEASUREMENTS if c in df.columns]]
        # Make sure RECOMMEND_SUPPRESS is a in a column
        # if 'RECOMMEND_SUPPRESS' not in df.columns:
        #     df['RECOMMEND_SUPPRESS'] = np.nan

        # Convert to wide format
        df_wide = df.pivot(
            index=['ONET_SOC_CODE', 'ELEMENT_ID'],#, 'RECOMMEND_SUPPRESS'],
            columns='SCALE_ID',
            values='DATA_VALUE'
        ).reset_index()

        # Drop any columns that are all NaN
        df_wide = df_wide.dropna(axis=1, how='all')

        # Add the mapping
        if ('NOT_RELEVANT' not in df_wide.columns) and ('NOT_RELEVANT' in df.columns):
            mapping = df[df['SCALE_ID'] == 'LV'].set_index(['ONET_SOC_CODE', 'ELEMENT_ID'])['NOT_RELEVANT'].to_dict()
            df_wide['NOT_RELEVANT'] = df_wide.apply(lambda x: mapping.get((x['ONET_SOC_CODE'], x['ELEMENT_ID']), None), axis=1)

        df_wide.head()
        return df_wide

# Create a DataFrame with repeated parent levels.
def create_parent_levels(df):
    # Get all unique levels from Element ID
    all_levels = []
    
    # Process each ELEMENT_ID to get its hierarchical parts
    for element_id in df['ELEMENT_ID']:
        parts = element_id.split('.')
        current = parts[0]
        all_levels.append([current])
        
        # Build up each level (1, 1.A, 1.A.1, etc.)
        for part in parts[1:]:
            current = f"{current}.{part}"
            all_levels[-1].append(current)
    
    # Find maximum depth
    max_depth = max(len(x) for x in all_levels)
    
    # Create column names for each level
    column_names = [f'LEVEL_{i+1}' for i in range(max_depth)]
    
    # Create DataFrame with the hierarchical structure
    level_df = pd.DataFrame(all_levels, columns=column_names)
    
    # Replace None with NaN
    level_df = level_df.fillna(np.nan)
    
    return level_df



## Data Retrieval and Preparation

### Reference Data

#### Occupation Data

In [4]:
data_var = "Occupation Data"
data_occ = get_data(data_var)
data_occ = prepare_data(data_occ)
data_occ.head()

Unnamed: 0,ONET_SOC_CODE,TITLE,DESCRIPTION
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


#### Scales Reference

In [42]:
data_var = "Scales Reference"
data_scales = get_data(data_var)
data_scales = prepare_data(data_scales)
data_scales.head()

Unnamed: 0,SCALE_ID,SCALE_NAME,MINIMUM,MAXIMUM
0,AO,Automation,1,5
1,CF,Frequency,1,5
2,CN,Amount of Contact,1,5
3,CT,Context,1,3
4,CTP,Context (Categories 1-3),0,100


#### Model Reference Data

In [7]:
data_var = "Content Model Reference"
data_model = get_data(data_var)
data_model = prepare_data(data_model)
# Add a level column to the data DataFrame
data_model['LEVEL'] = data_model["ELEMENT_ID"].str.split('.').apply(lambda x: len(x))	
data_model.head()

Unnamed: 0,ELEMENT_ID,ELEMENT_NAME,DESCRIPTION,LEVEL
0,1,Worker Characteristics,Worker Characteristics,1
1,1.A,Abilities,Enduring attributes of the individual that inf...,2
2,1.A.1,Cognitive Abilities,Abilities that influence the acquisition and a...,3
3,1.A.1.a,Verbal Abilities,Abilities that influence the acquisition and a...,4
4,1.A.1.a.1,Oral Comprehension,The ability to listen to and understand inform...,5


#### Level Data of Model Reference Elements

In [31]:
level_df = create_parent_levels(data_model)
level_df.head(10)

Unnamed: 0,LEVEL_1,LEVEL_2,LEVEL_3,LEVEL_4,LEVEL_5,LEVEL_6,LEVEL_7
0,1,,,,,,
1,1,1.A,,,,,
2,1,1.A,1.A.1,,,,
3,1,1.A,1.A.1,1.A.1.a,,,
4,1,1.A,1.A.1,1.A.1.a,1.A.1.a.1,,
5,1,1.A,1.A.1,1.A.1.a,1.A.1.a.2,,
6,1,1.A,1.A.1,1.A.1.a,1.A.1.a.3,,
7,1,1.A,1.A.1,1.A.1.a,1.A.1.a.4,,
8,1,1.A,1.A.1,1.A.1.b,,,
9,1,1.A,1.A.1,1.A.1.b,1.A.1.b.1,,


#### Education, Training, and Experience Categories

In [45]:
data_var = "Education, Training, and Experience Categories" # The %2C is a comma
data_educ_train_exp_cat = get_data(data_var)
data_educ_train_exp_cat = prepare_data(data_educ_train_exp_cat)

data_educ_train_exp_cat

Unnamed: 0,ELEMENT_ID,ELEMENT_NAME,SCALE_ID,CATEGORY,CATEGORY_DESCRIPTION
0,2.D.1,Required Level of Education,RL,1,Less than a High School Diploma
1,2.D.1,Required Level of Education,RL,2,High School Diploma - or the equivalent (for e...
2,2.D.1,Required Level of Education,RL,3,Post-Secondary Certificate - awarded for train...
3,2.D.1,Required Level of Education,RL,4,Some College Courses
4,2.D.1,Required Level of Education,RL,5,Associate's Degree (or other 2-year degree)
5,2.D.1,Required Level of Education,RL,6,Bachelor's Degree
6,2.D.1,Required Level of Education,RL,7,Post-Baccalaureate Certificate - awarded for c...
7,2.D.1,Required Level of Education,RL,8,Master's Degree
8,2.D.1,Required Level of Education,RL,9,Post-Master's Certificate - awarded for comple...
9,2.D.1,Required Level of Education,RL,10,First Professional Degree - awarded for comple...


#### Job Zone Reference

In [46]:
data_var = "Job Zone Reference"
data_job_zone_ref = get_data(data_var)
data_job_zone_ref = prepare_data(data_job_zone_ref)
data_job_zone_ref

Unnamed: 0,JOB_ZONE,NAME,EXPERIENCE,EDUCATION,JOB_TRAINING,EXAMPLES,SVP_RANGE
0,1,Job Zone One: Little or No Preparation Needed,"Little or no previous work-related skill, know...",Some of these occupations may require a high s...,Employees in these occupations need anywhere f...,These occupations involve following instructio...,(Below 4.0)
1,2,Job Zone Two: Some Preparation Needed,"Some previous work-related skill, knowledge, o...",These occupations usually require a high schoo...,Employees in these occupations need anywhere f...,These occupations often involve using your kno...,(4.0 to < 6.0)
2,3,Job Zone Three: Medium Preparation Needed,"Previous work-related skill, knowledge, or exp...",Most occupations in this zone require training...,Employees in these occupations usually need on...,These occupations usually involve using commun...,(6.0 to < 7.0)
3,4,Job Zone Four: Considerable Preparation Needed,"A considerable amount of work-related skill, k...",Most of these occupations require a four-year ...,Employees in these occupations usually need se...,Many of these occupations involve coordinating...,(7.0 to < 8.0)
4,5,Job Zone Five: Extensive Preparation Needed,"Extensive skill, knowledge, and experience are...",Most of these occupations require graduate sch...,"Employees may need some on-the-job training, b...","These occupations often involve coordinating, ...",(8.0 and above)


#### Task Categories

In [189]:
data_var = "Task Categories"
data_task_cat = get_data(data_var)
data_task_cat = prepare_data(data_task_cat)


data_task_cat["CATEGORY"] = data_task_cat.SCALE_ID + "_" + data_task_cat.CATEGORY.astype(str)

data_task_cat["CATEGORY_DESCRIPTION"] = data_task_cat.CATEGORY_DESCRIPTION.str.upper().replace(" ", "_", regex=True)
data_task_cat["CATEGORY_DESCRIPTION"] = data_task_cat.CATEGORY_DESCRIPTION.str.replace("_OR|_THAN", "", regex=True)

data_task_cat_dict = data_task_cat.set_index('CATEGORY')['CATEGORY_DESCRIPTION'].to_dict()

#### UNSPSC Reference
United Nations Standard Products and Services Code (UNSPSC), version 23.0701.

In [209]:
data_var = "UNSPSC Reference"
data_unspsc_ref = get_data(data_var)
data_unspsc_ref = prepare_data(data_unspsc_ref)
data_unspsc_ref

Unnamed: 0,COMMODITY_CODE,COMMODITY_TITLE,CLASS_CODE,CLASS_TITLE,FAMILY_CODE,FAMILY_TITLE,SEGMENT_CODE,SEGMENT_TITLE
0,10111302,Pet grooming products,10111300,Domestic pet treatments and accessories and eq...,10110000,Domestic pet products,10000000,Live Plant and Animal Material and Accessories...
1,10111306,Domestic pet training kits,10111300,Domestic pet treatments and accessories and eq...,10110000,Domestic pet products,10000000,Live Plant and Animal Material and Accessories...
2,10131601,Cages or its accessories,10131600,Animal containment,10130000,Animal containment and habitats,10000000,Live Plant and Animal Material and Accessories...
3,10131602,Kennels,10131600,Animal containment,10130000,Animal containment and habitats,10000000,Live Plant and Animal Material and Accessories...
4,10131605,Animal transport cage,10131600,Animal containment,10130000,Animal containment and habitats,10000000,Live Plant and Animal Material and Accessories...
...,...,...,...,...,...,...,...,...
4257,60141111,Game accessories,60141100,Games,60140000,Toys and games,60000000,Musical Instruments and Games and Toys and Art...
4258,60141201,Balance or gross motor equipment,60141200,Active play equipment and accessories,60140000,Toys and games,60000000,Musical Instruments and Games and Toys and Art...
4259,60141401,Costumes or accessories,60141400,Dramatic play equipment and accessories,60140000,Toys and games,60000000,Musical Instruments and Games and Toys and Art...
4260,95121601,Steel bridge,95121600,Transport buildings and structures,95120000,Permanent buildings and structures,95000000,Land and Buildings and Structures and Thorough...


#### IWA Reference

In [232]:
data_var = "IWA Reference"
data_iwa_ref = get_data(data_var)
data_iwa_ref = prepare_data(data_iwa_ref)
data_iwa_ref

Unnamed: 0,ELEMENT_ID,IWA_ID,IWA_TITLE
0,4.A.1.a.1,4.A.1.a.1.I01,Study details of artistic productions.
1,4.A.1.a.1,4.A.1.a.1.I02,Read documents or materials to inform work pro...
2,4.A.1.a.1,4.A.1.a.1.I03,Investigate criminal or legal matters.
3,4.A.1.a.1,4.A.1.a.1.I04,Gather information from physical or electronic...
4,4.A.1.a.1,4.A.1.a.1.I05,Consult legal materials or public records.
...,...,...,...
327,4.A.4.c.3,4.A.4.c.3.I03,"Distribute materials, supplies, or resources."
328,4.A.4.c.3,4.A.4.c.3.I04,Collect fares or payments.
329,4.A.4.c.3,4.A.4.c.3.I05,Purchase goods or services.
330,4.A.4.c.3,4.A.4.c.3.I06,Prescribe medical treatments or devices.


#### DWA Reference

In [233]:
data_var = "DWA Reference"
data_dwa_ref = get_data(data_var)
data_dwa_ref = prepare_data(data_dwa_ref)
data_dwa_ref


Unnamed: 0,ELEMENT_ID,IWA_ID,DWA_ID,DWA_TITLE
0,4.A.1.a.1,4.A.1.a.1.I01,4.A.1.a.1.I01.D01,Review art or design materials.
1,4.A.1.a.1,4.A.1.a.1.I01,4.A.1.a.1.I01.D02,Study details of musical compositions.
2,4.A.1.a.1,4.A.1.a.1.I01,4.A.1.a.1.I01.D03,Review production information to determine cos...
3,4.A.1.a.1,4.A.1.a.1.I01,4.A.1.a.1.I01.D04,Study scripts to determine project requirements.
4,4.A.1.a.1,4.A.1.a.1.I01,4.A.1.a.1.I01.D05,Review audio or video recordings.
...,...,...,...,...
2082,4.A.4.c.3,4.A.4.c.3.I07,4.A.4.c.3.I07.D01,Monitor availability of equipment or supplies.
2083,4.A.4.c.3,4.A.4.c.3.I07,4.A.4.c.3.I07.D02,Inventory materials or equipment.
2084,4.A.4.c.3,4.A.4.c.3.I07,4.A.4.c.3.I07.D03,Inventory medical supplies or equipment.
2085,4.A.4.c.3,4.A.4.c.3.I07,4.A.4.c.3.I07.D04,Monitor inventories of products or materials.


#### Tasks to DWAs

In [234]:
data_var = "Tasks to DWAs"
data_task_dwa = get_data(data_var)
data_task_dwa = prepare_data(data_task_dwa)
data_task_dwa

Unnamed: 0,ONET_SOC_CODE,TASK_ID,DWA_ID,DATE,DOMAIN_SOURCE
0,11-1011.00,20461,4.A.2.a.4.I09.D03,07/2014,Analyst
1,11-1011.00,20461,4.A.4.b.6.I08.D04,07/2014,Analyst
2,11-1011.00,8823,4.A.4.b.4.I09.D02,03/2014,Analyst
3,11-1011.00,8824,4.A.4.a.2.I03.D14,03/2014,Analyst
4,11-1011.00,8825,4.A.2.a.4.I07.D09,03/2014,Analyst
...,...,...,...,...,...
23228,53-7121.00,12807,4.A.3.a.2.I34.D01,03/2014,Analyst
23229,53-7121.00,12808,4.A.1.b.1.I01.D03,03/2014,Analyst
23230,53-7121.00,12809,4.A.3.a.3.I02.D03,03/2014,Analyst
23231,53-7121.00,12810,4.A.1.b.3.I01.D14,03/2014,Analyst


#### Related Occupations

The “Relatedness Tier” column assigns one of three categories to each link:

- *Primary-Short* — Five most strongly related occupations after expert review.
- *Primary-Long* — 6th to 10th most strongly related occupations after expert review.
- *Supplemental* — 11th to 20th most strongly related occupations after expert review.


In [239]:
data_var = "Related Occupations"
data_work_act = get_data(data_var)
data_work_act = prepare_data(data_work_act)
data_work_act["RELATEDNESS_TIER"] = data_work_act["RELATEDNESS_TIER"].apply(lambda x: "".join([word[0] for word in x.split("-")]))
data_work_act

Unnamed: 0,ONET_SOC_CODE,RELATED_ONET_SOC_CODE,RELATEDNESS_TIER,INDEX
0,11-1011.00,11-1021.00,PS,1
1,11-1011.00,11-2032.00,PS,2
2,11-1011.00,11-9151.00,PS,3
3,11-1011.00,11-3031.01,PS,4
4,11-1011.00,11-9199.02,PS,5
...,...,...,...,...
18455,53-7121.00,51-9012.00,S,16
18456,53-7121.00,53-7071.00,S,17
18457,53-7121.00,51-8091.00,S,18
18458,53-7121.00,53-7062.04,S,19


#### Abilities to Work Activities

In [244]:
data_var = "Abilities to Work Activities"
data_ability_work_act = get_data(data_var)
data_ability_work_act = prepare_data(data_ability_work_act)
data_ability_work_act = data_ability_work_act.filter(regex='_ID')
data_ability_work_act

Unnamed: 0,ABILITIES_ELEMENT_ID,WORK_ACTIVITIES_ELEMENT_ID
0,1.A.1.a.1,4.A.1.a.1
1,1.A.1.a.1,4.A.1.a.2
2,1.A.1.a.1,4.A.1.b.1
3,1.A.1.a.1,4.A.2.a.1
4,1.A.1.a.1,4.A.2.a.2
...,...,...
376,1.A.4.b.5,4.A.4.b.3
377,1.A.4.b.5,4.A.4.b.4
378,1.A.4.b.5,4.A.4.b.5
379,1.A.4.b.5,4.A.4.b.6


#### Abilities to Work Context

In [245]:
data_var = "Abilities to Work Context"
data_ability_work_act = get_data(data_var)
data_ability_work_act = prepare_data(data_ability_work_act)
data_ability_work_act = data_ability_work_act.filter(regex='_ID')
data_ability_work_act

Unnamed: 0,ABILITIES_ELEMENT_ID,WORK_CONTEXT_ELEMENT_ID
0,1.A.1.a.1,4.C.1.a.2.c
1,1.A.1.a.1,4.C.1.a.2.f
2,1.A.1.a.1,4.C.1.a.2.l
3,1.A.1.a.1,4.C.1.a.4
4,1.A.1.a.1,4.C.1.b.1.e
...,...,...
134,1.A.4.b.5,4.C.1.c.1
135,1.A.4.b.5,4.C.1.c.2
136,1.A.4.b.5,4.C.1.d.1
137,1.A.4.b.5,4.C.1.d.2


##### Skills to Work Activities

In [246]:
data_var = "Skills to Work Activities"
data_ability_work_act = get_data(data_var)
data_ability_work_act = prepare_data(data_ability_work_act)
data_ability_work_act = data_ability_work_act.filter(regex='_ID')
data_ability_work_act

Unnamed: 0,SKILLS_ELEMENT_ID,WORK_ACTIVITIES_ELEMENT_ID
0,2.A.1.a,4.A.1.a.1
1,2.A.1.a,4.A.1.a.2
2,2.A.1.a,4.A.1.b.1
3,2.A.1.a,4.A.2.a.1
4,2.A.1.a,4.A.2.a.2
...,...,...
227,2.B.5.d,4.A.4.b.3
228,2.B.5.d,4.A.4.b.4
229,2.B.5.d,4.A.4.b.5
230,2.B.5.d,4.A.4.c.2


#### Skills to Work Context

In [247]:
data_var = "Skills to Work Context"
data_ability_work_act = get_data(data_var)
data_ability_work_act = prepare_data(data_ability_work_act)
data_ability_work_act = data_ability_work_act.filter(regex='_ID')
data_ability_work_act

Unnamed: 0,SKILLS_ELEMENT_ID,WORK_CONTEXT_ELEMENT_ID
0,2.A.1.a,4.C.1.a.2.h
1,2.A.1.b,4.C.1.a.2.c
2,2.A.1.b,4.C.1.a.2.f
3,2.A.1.b,4.C.1.a.2.l
4,2.A.1.b,4.C.1.a.4
...,...,...
91,2.B.5.a,4.C.1.c.2
92,2.B.5.a,4.C.3.b.8
93,2.B.5.a,4.C.3.d.1
94,2.B.5.d,4.C.1.b.1.g


### Measurements

#### Education, Training, and Experience

In [111]:
data_edu_tran_exp_wide = process_measurements("Education, Training, and Experience")
for group, data in data_edu_tran_exp_wide.items():
    group_name = data_model[data_model['ELEMENT_ID'] == group].ELEMENT_NAME.values[0]
    print(group, " - " , group_name)
    display(data.head())
    

2.D.1  -  Required Level of Education


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,RL_01,RL_02,RL_03,RL_04,RL_05,RL_06,RL_07,RL_08,RL_09,RL_10,RL_11,RL_12
0,11-1011.00,2.D.1,0.0,4.46,0.0,0.0,5.15,32.29,0.0,45.91,3.94,0.55,4.92,2.78
1,11-1011.03,2.D.1,0.0,0.0,0.0,0.0,0.0,18.52,0.0,74.07,7.41,0.0,0.0,0.0
2,11-1021.00,2.D.1,10.35,28.76,5.72,21.59,5.99,27.3,0.14,0.15,0.0,0.0,0.0,0.0
3,11-2011.00,2.D.1,6.16,9.82,0.0,7.67,8.04,60.02,2.42,5.87,0.0,0.0,0.0,0.0
4,11-2021.00,2.D.1,0.0,3.53,0.0,2.8,3.03,55.76,0.0,24.36,0.0,10.52,0.0,0.0


2.D.4.a  -  Job-Related Professional Certification


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,IM
0,11-1011.00,2.D.4.a,3.01
1,11-1011.03,2.D.4.a,2.7
2,11-1021.00,2.D.4.a,1.88
3,11-2011.00,2.D.4.a,1.98
4,11-3013.00,2.D.4.a,2.78


3.A.1  -  Related Work Experience


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,RW_01,RW_02,RW_03,RW_04,RW_05,RW_06,RW_07,RW_08,RW_09,RW_10,RW_11
0,11-1011.00,3.A.1,0.0,0.0,0.0,0.0,0.0,0.0,9.69,5.87,15.09,1.11,68.24
1,11-1011.03,3.A.1,0.0,0.0,0.0,0.0,0.0,3.7,11.11,40.74,18.52,11.11,14.81
2,11-1021.00,3.A.1,10.35,1.5,0.22,2.03,15.26,3.66,18.97,24.23,5.72,0.14,17.93
3,11-2011.00,3.A.1,0.0,0.0,0.0,0.0,14.2,13.73,21.74,26.52,16.49,2.42,4.9
4,11-2021.00,3.A.1,0.0,1.43,0.0,0.0,9.04,7.23,12.08,18.8,14.98,33.99,2.47


3.A.2  -  On-Site or In-Plant Training


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,PT_01,PT_02,PT_03,PT_04,PT_05,PT_06,PT_07,PT_08,PT_09
0,11-1011.00,3.A.2,10.22,3.82,5.15,6.66,16.79,0.52,6.62,15.92,34.32
1,11-1011.03,3.A.2,40.74,22.22,7.41,7.41,7.41,0.0,7.41,0.0,7.41
2,11-1021.00,3.A.2,10.96,8.14,18.09,0.35,18.11,15.09,25.62,3.47,0.17
3,11-2011.00,3.A.2,22.39,25.12,21.26,10.12,7.34,6.45,2.42,0.0,4.9
4,11-2021.00,3.A.2,33.95,14.98,32.96,11.25,5.71,1.15,0.0,0.0,0.0


3.A.3  -  On-the-Job Training


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,OJ_01,OJ_02,OJ_03,OJ_04,OJ_05,OJ_06,OJ_07,OJ_08,OJ_09
0,11-1011.00,3.A.3,4.57,8.5,5.31,13.38,9.67,4.6,6.68,12.03,35.25
1,11-1011.03,3.A.3,25.93,18.52,11.11,22.22,7.41,7.41,0.0,3.7,3.7
2,11-1021.00,3.A.3,3.47,14.53,16.13,10.54,11.56,8.3,26.08,9.21,0.17
3,11-2011.00,3.A.3,10.58,38.54,21.64,7.65,11.03,5.66,0.0,4.9,0.0
4,11-2021.00,3.A.3,2.4,8.91,52.82,15.63,9.71,0.0,10.53,0.0,0.0


3.A.4.a  -  Apprenticeship


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,IM
0,11-1011.00,3.A.4.a,2.2
1,11-1011.03,3.A.4.a,1.81
2,11-1021.00,3.A.4.a,2.33
3,11-2011.00,3.A.4.a,2.11
4,11-3013.00,3.A.4.a,2.36


In [112]:
# ! Old version with recomend suppres
# data_vars = ['Education, Training, and Experience']
# data_var = data_vars[0]
# data_edu_tran_exp = get_data(data_var)
# data_edu_tran_exp = prepare_data(data_edu_tran_exp)
# data_edu_tran_exp = data_edu_tran_exp[[c for c in COLUMNS_KEEP_MEASUREMENTS if c in data_edu_tran_exp.columns]]

# data_long = {}

# for element in data_edu_tran_exp.ELEMENT_ID.unique():
#     print(data_model[data_model.ELEMENT_ID == element].iloc[0]['ELEMENT_NAME'])
#     sub_data = data_edu_tran_exp[(data_edu_tran_exp.ELEMENT_ID == element)].copy()

#     sub_data['CATEGORY'] = sub_data['CATEGORY'].fillna("")
#     sub_data['CATEGORY'] = sub_data['SCALE_ID'] + sub_data['CATEGORY'].apply(lambda x: f"_{int(x):02d}" if x else "")
    
#     sub_data_long = sub_data.pivot(
#             index=['ONET_SOC_CODE'],
#             columns=['CATEGORY'],
#             values=['DATA_VALUE', 'RECOMMEND_SUPPRESS']
#         ).reset_index()
    
#     sub_data_long.columns = [col[1] + ("_SUPPRESS" if col[0] == "RECOMMEND_SUPPRESS" else "") for col in sub_data_long.columns]
    
#     display(
#         sub_data_long.head()
#     )
#     data_long[element] = sub_data_long


#### Job Zones

In [114]:
data_job_zones = process_measurements("Job Zones")
data_job_zones

Unnamed: 0,ONET_SOC_CODE,JOB_ZONE
0,11-1011.00,5
1,11-1011.03,5
2,11-1021.00,4
3,11-1031.00,4
4,11-2011.00,4
...,...,...
918,53-7071.00,2
919,53-7072.00,2
920,53-7073.00,2
921,53-7081.00,2


#### Interests

In [123]:
data_interests_wide = process_measurements("Interests")
for group, data in data_interests_wide.items():
    group_name = data_model[data_model['ELEMENT_ID'] == group].ELEMENT_NAME.values[0]
    print(group, " - " , group_name)
    display(data.head())

1.B.1.a  -  Realistic


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,OI
0,11-1011.00,1.B.1.a,1.3
1,11-1011.03,1.B.1.a,2.04
2,11-1021.00,1.B.1.a,2.22
3,11-1031.00,1.B.1.a,1.54
4,11-2011.00,1.B.1.a,1.07


1.B.1.b  -  Investigative


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,OI
0,11-1011.00,1.B.1.b,3.24
1,11-1011.03,1.B.1.b,4.78
2,11-1021.00,1.B.1.b,2.39
3,11-1031.00,1.B.1.b,3.35
4,11-2011.00,1.B.1.b,1.71


1.B.1.c  -  Artistic


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,OI
0,11-1011.00,1.B.1.c,2.08
1,11-1011.03,1.B.1.c,2.48
2,11-1021.00,1.B.1.c,1.31
3,11-1031.00,1.B.1.c,2.7
4,11-2011.00,1.B.1.c,3.85


1.B.1.d  -  Social


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,OI
0,11-1011.00,1.B.1.d,3.52
1,11-1011.03,1.B.1.d,3.55
2,11-1021.00,1.B.1.d,3.37
3,11-1031.00,1.B.1.d,3.69
4,11-2011.00,1.B.1.d,3.14


1.B.1.e  -  Enterprising


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,OI
0,11-1011.00,1.B.1.e,6.88
1,11-1011.03,1.B.1.e,6.68
2,11-1021.00,1.B.1.e,6.96
3,11-1031.00,1.B.1.e,5.52
4,11-2011.00,1.B.1.e,7.0


1.B.1.f  -  Conventional


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,OI
0,11-1011.00,1.B.1.f,5.0
1,11-1011.03,1.B.1.f,4.49
2,11-1021.00,1.B.1.f,5.32
3,11-1031.00,1.B.1.f,3.62
4,11-2011.00,1.B.1.f,4.3


1.B.1.g  -  First Interest High-Point


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,IH
0,11-1011.00,1.B.1.g,5.0
1,11-1011.03,1.B.1.g,5.0
2,11-1021.00,1.B.1.g,5.0
3,11-1031.00,1.B.1.g,5.0
4,11-2011.00,1.B.1.g,5.0


1.B.1.h  -  Second Interest High-Point


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,IH
0,11-1011.00,1.B.1.h,6.0
1,11-1011.03,1.B.1.h,2.0
2,11-1021.00,1.B.1.h,6.0
3,11-1031.00,1.B.1.h,4.0
4,11-2011.00,1.B.1.h,6.0


1.B.1.i  -  Third Interest High-Point


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,IH
0,11-1011.00,1.B.1.i,0.0
1,11-1011.03,1.B.1.i,6.0
2,11-1021.00,1.B.1.i,0.0
3,11-1031.00,1.B.1.i,6.0
4,11-2011.00,1.B.1.i,3.0


#### Work Values

In [126]:
for group, data in process_measurements("Work Values").items():
    group_name = data_model[data_model['ELEMENT_ID'] == group].ELEMENT_NAME.values[0]
    print(group, " - " , group_name)
    display(data.head())

1.B.2.a  -  Achievement


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,EX
0,11-1011.00,1.B.2.a,6.33
1,11-1011.03,1.B.2.a,6.67
2,11-1021.00,1.B.2.a,5.33
3,11-1031.00,1.B.2.a,5.33
4,11-2011.00,1.B.2.a,5.33


1.B.2.b  -  Working Conditions


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,EX
0,11-1011.00,1.B.2.b,6.33
1,11-1011.03,1.B.2.b,6.33
2,11-1021.00,1.B.2.b,6.0
3,11-1031.00,1.B.2.b,4.33
4,11-2011.00,1.B.2.b,5.33


1.B.2.c  -  Recognition


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,EX
0,11-1011.00,1.B.2.c,7.0
1,11-1011.03,1.B.2.c,6.0
2,11-1021.00,1.B.2.c,5.67
3,11-1031.00,1.B.2.c,5.0
4,11-2011.00,1.B.2.c,5.33


1.B.2.d  -  Relationships


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,EX
0,11-1011.00,1.B.2.d,5.0
1,11-1011.03,1.B.2.d,5.0
2,11-1021.00,1.B.2.d,6.33
3,11-1031.00,1.B.2.d,5.67
4,11-2011.00,1.B.2.d,5.0


1.B.2.e  -  Support


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,EX
0,11-1011.00,1.B.2.e,5.33
1,11-1011.03,1.B.2.e,3.33
2,11-1021.00,1.B.2.e,4.67
3,11-1031.00,1.B.2.e,4.0
4,11-2011.00,1.B.2.e,4.0


1.B.2.f  -  Independence


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,EX
0,11-1011.00,1.B.2.f,7.0
1,11-1011.03,1.B.2.f,6.67
2,11-1021.00,1.B.2.f,6.0
3,11-1031.00,1.B.2.f,5.0
4,11-2011.00,1.B.2.f,5.33


1.B.2.g  -  First Work Value High-Point


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,VH
0,11-1011.00,1.B.2.g,3.0
1,11-1011.03,1.B.2.g,1.0
2,11-1021.00,1.B.2.g,4.0
3,11-1031.00,1.B.2.g,4.0
4,11-2011.00,1.B.2.g,2.0


1.B.2.h  -  Second Work Value High-Point


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,VH
0,11-1011.00,1.B.2.h,6.0
1,11-1011.03,1.B.2.h,6.0
2,11-1021.00,1.B.2.h,6.0
3,11-1031.00,1.B.2.h,1.0
4,11-2011.00,1.B.2.h,1.0


1.B.2.i  -  Third Work Value High-Point


SCALE_ID,ONET_SOC_CODE,ELEMENT_ID,VH
0,11-1011.00,1.B.2.i,1.0
1,11-1011.03,1.B.2.i,2.0
2,11-1021.00,1.B.2.i,2.0
3,11-1031.00,1.B.2.i,3.0
4,11-2011.00,1.B.2.i,6.0


#### Knowledge, Skills, Abilities, Work Styles, Work Activities

In [128]:
for data_var in ["Knowledge", "Skills", "Abilities", "Work Styles", "Work Activities"]:
    print(data_var)
    data = get_data(data_var)
    data = prepare_data(data)
    data = process_measurements(data)
    print(data)
    break
    # display(data.head())

Knowledge
{'2.C.1.a': SCALE_ID ONET_SOC_CODE ELEMENT_ID    IM    LV NOT_RELEVANT
0           11-1011.00    2.C.1.a  4.78  6.50            N
1           11-1011.03    2.C.1.a  4.15  5.00            N
2           11-1021.00    2.C.1.a  4.25  4.38            N
3           11-2011.00    2.C.1.a  4.12  4.41            N
4           11-2021.00    2.C.1.a  4.04  4.61            N
..                 ...        ...   ...   ...          ...
874         53-7071.00    2.C.1.a  3.83  4.17            N
875         53-7072.00    2.C.1.a  2.97  3.08            N
876         53-7073.00    2.C.1.a  2.49  2.49            N
877         53-7081.00    2.C.1.a  2.12  1.40            N
878         53-7121.00    2.C.1.a  2.10  1.74            N

[879 rows x 5 columns], '2.C.1.b': SCALE_ID ONET_SOC_CODE ELEMENT_ID    IM    LV NOT_RELEVANT
0           11-1011.00    2.C.1.b  2.42  2.69            N
1           11-1011.03    2.C.1.b  2.62  4.42            N
2           11-1021.00    2.C.1.b  3.21  3.95            

#### Task Statements

In [138]:
data_var = "Task Statements"
data_task_statements = get_data(data_var)
data_task_statements = prepare_data(data_task_statements)
data_task_statements = data_task_statements[[c for c in COLUMNS_KEEP_MEASUREMENTS if c in data_task_statements.columns]]
data_task_statements.head()

Unnamed: 0,ONET_SOC_CODE,TASK_TYPE,TASK_ID
0,11-1011.00,Core,8823
1,11-1011.00,Core,8824
2,11-1011.00,Core,8827
3,11-1011.00,Core,8826
4,11-1011.00,Core,8834


#### Task Ratings

In [140]:
data_var = "Task Ratings"
data_task_ratings = get_data(data_var)
data_task_ratings = prepare_data(data_task_ratings)
data_task_ratings = data_task_ratings[[c for c in COLUMNS_KEEP_MEASUREMENTS if c in data_task_ratings.columns]]

data_task_ratings.head()

Unnamed: 0,ONET_SOC_CODE,SCALE_ID,TASK_ID,CATEGORY,DATA_VALUE,RECOMMEND_SUPPRESS
0,11-1011.00,FT,8823,1.0,5.92,N
1,11-1011.00,FT,8823,2.0,15.98,N
2,11-1011.00,FT,8823,3.0,29.68,N
3,11-1011.00,FT,8823,4.0,21.18,N
4,11-1011.00,FT,8823,5.0,19.71,N


In [190]:
for scale in data_task_ratings.SCALE_ID.unique():
    print(data_scales[data_scales.SCALE_ID == scale].iloc[0]['SCALE_NAME'])
    sub_data = data_task_ratings[
        (data_task_ratings.SCALE_ID == scale)
        ].copy()
    sub_data['CATEGORY'] = sub_data['CATEGORY'].fillna("")  # Fill NaN values with -1
    sub_data_long = sub_data.pivot(
            index=['ONET_SOC_CODE', 'TASK_ID', "RECOMMEND_SUPPRESS"],
            columns= ['SCALE_ID', "CATEGORY"],
            values='DATA_VALUE'
        ).reset_index()
    
    new_cols = [col[0]  + ("_" + str(int(col[1])) if type(col[1]) == float else col[1]) for col in sub_data_long.columns]
    if scale == "FT":
        sub_data_long.columns = [data_task_cat_dict[col] if col in data_task_cat_dict else col for col in new_cols]
    else:
        sub_data_long.columns = new_cols


    display(
        sub_data_long.head()
    )

Frequency of Task (Categories 1-7)


Unnamed: 0,ONET_SOC_CODE,TASK_ID,RECOMMEND_SUPPRESS,YEARLY_LESS,MORE_YEARLY,MORE_MONTHLY,MORE_WEEKLY,DAILY,SEVERAL_TIMES_DAILY,HOURLY_MORE
0,11-1011.00,8823,N,5.92,15.98,29.68,21.18,19.71,4.91,2.63
1,11-1011.00,8824,N,1.42,14.44,27.31,25.52,26.88,2.52,1.9
2,11-1011.00,8825,N,4.08,22.03,36.36,8.29,24.71,1.97,2.56
3,11-1011.00,8826,N,3.03,17.33,20.3,18.1,33.16,2.01,6.07
4,11-1011.00,8827,N,15.5,38.21,32.73,5.15,5.25,0.19,2.98


Importance


Unnamed: 0,ONET_SOC_CODE,TASK_ID,RECOMMEND_SUPPRESS,IM
0,11-1011.00,8823,N,4.52
1,11-1011.00,8824,N,4.32
2,11-1011.00,8825,N,4.13
3,11-1011.00,8826,N,4.24
4,11-1011.00,8827,N,4.3


Relevance of Task


Unnamed: 0,ONET_SOC_CODE,TASK_ID,RECOMMEND_SUPPRESS,RT
0,11-1011.00,8823,N,74.44
1,11-1011.00,8824,N,81.71
2,11-1011.00,8825,N,98.57
3,11-1011.00,8826,N,97.79
4,11-1011.00,8827,N,93.41


#### Technology Skills

In [196]:
data_var = "Technology Skills"
data_technology_skills = get_data(data_var)
data_technology_skills = prepare_data(data_technology_skills)
data_technology_skills = data_technology_skills[[c for c in COLUMNS_KEEP_MEASUREMENTS if c in data_technology_skills.columns]]
data_technology_skills

Unnamed: 0,ONET_SOC_CODE,COMMODITY_CODE,HOT_TECHNOLOGY,IN_DEMAND
0,11-1011.00,43232202,Y,N
1,11-1011.00,43232306,N,N
2,11-1011.00,43232201,Y,N
3,11-1011.00,43232303,N,N
4,11-1011.00,43231601,N,N
...,...,...,...,...
32622,53-7121.00,43233004,Y,N
32623,53-7121.00,43232110,Y,N
32624,53-7121.00,43231513,Y,N
32625,53-7121.00,43231602,Y,N


#### Tools Used

In [None]:
data_var = "Tools Used"
data_tools_used = get_data(data_var)
data_tools_used = prepare_data(data_tools_used)
data_tools_used = data_tools_used[[c for c in COLUMNS_KEEP_MEASUREMENTS if c in data_tools_used.columns]]
data_tools_used.head()

Unnamed: 0,ONET_SOC_CODE,COMMODITY_CODE
0,11-1011.00,44101809
1,11-1011.00,43211507
2,11-1011.00,43211503
3,11-1011.00,43211508
4,11-1011.00,43211504


In [208]:
data_occ.set_index("ONET_SOC_CODE").loc[data_tools_used.ONET_SOC_CODE.value_counts().index]

Unnamed: 0_level_0,TITLE,DESCRIPTION
ONET_SOC_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1
45-3031.00,Fishing and Hunting Workers,"Hunt, trap, catch, or gather wild animals or a..."
49-3023.00,Automotive Service Technicians and Mechanics,"Diagnose, adjust, repair, or overhaul automoti..."
47-2152.00,"Plumbers, Pipefitters, and Steamfitters","Assemble, install, alter, and repair pipelines..."
27-1012.00,Craft Artists,Create or reproduce handmade objects for sale ...
49-9021.00,"Heating, Air Conditioning, and Refrigeration M...","Install or repair heating, central air conditi..."
...,...,...
43-4141.00,New Accounts Clerks,Interview persons desiring to open accounts in...
15-2041.00,Statisticians,Develop or apply mathematical or statistical t...
21-1022.00,Healthcare Social Workers,"Provide individuals, families, and groups with..."
21-1023.00,Mental Health and Substance Abuse Social Workers,"Assess and treat individuals with mental, emot..."


#### Work Activities

In [210]:
data_var = "Work Activities"
data_work_activities = get_data(data_var)
data_work_activities = prepare_data(data_work_activities)
# data_work_activities = data_work_activities[[c for c in COLUMNS_KEEP_MEASUREMENTS if c in data_work_activities.columns]]
data_work_activities

Unnamed: 0,ONET_SOC_CODE,ELEMENT_ID,ELEMENT_NAME,SCALE_ID,DATA_VALUE,N,STANDARD_ERROR,LOWER_CI_BOUND,UPPER_CI_BOUND,RECOMMEND_SUPPRESS,NOT_RELEVANT,DATE,DOMAIN_SOURCE
0,11-1011.00,4.A.1.a.1,Getting Information,IM,4.56,29.0,0.1559,4.2369,4.8756,N,,08/2023,Incumbent
1,11-1011.00,4.A.1.a.1,Getting Information,LV,4.89,30.0,0.1727,4.5393,5.2458,N,N,08/2023,Incumbent
2,11-1011.00,4.A.1.a.2,"Monitoring Processes, Materials, or Surroundings",IM,4.25,30.0,0.2125,3.8130,4.6823,N,,08/2023,Incumbent
3,11-1011.00,4.A.1.a.2,"Monitoring Processes, Materials, or Surroundings",LV,5.21,30.0,0.3872,4.4133,5.9971,N,N,08/2023,Incumbent
4,11-1011.00,4.A.1.b.1,"Identifying Objects, Actions, and Events",IM,4.23,29.0,0.1544,3.9180,4.5507,N,,08/2023,Incumbent
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72073,53-7121.00,4.A.4.c.1,Performing Administrative Activities,LV,2.27,27.0,0.3184,1.6108,2.9197,N,N,08/2019,Incumbent
72074,53-7121.00,4.A.4.c.2,Staffing Organizational Units,IM,1.93,27.0,0.2132,1.4962,2.3726,N,,08/2019,Incumbent
72075,53-7121.00,4.A.4.c.2,Staffing Organizational Units,LV,1.60,27.0,0.2965,0.9936,2.2125,N,N,08/2019,Incumbent
72076,53-7121.00,4.A.4.c.3,Monitoring and Controlling Resources,IM,2.56,27.0,0.2582,2.0266,3.0881,N,,08/2019,Incumbent
