# Extract titles of each ISCO-08 occupational category

In [1]:
import pandas as pd
%run ../notebook_preamble.ipy

In [2]:
# Load the raw ISCO-08 classification table (codes, hierarchy levels, titles)
isco_csv_path = data_folder + 'raw/isco/CL_ISCO08_20200615_124613.csv'
isco_titles_ = pd.read_csv(isco_csv_path)

# Preview the first rows to check the column layout
isco_titles_.head()

Unnamed: 0,Order,Level,Code,Parent,Code.1,Parent.1,Description,Remark
0,1635901,1,10,,,,<B> CONCEPT: </B> The International Standard C...,
1,1635902,1,20,,TOTAL,,Total,
2,1635903,1,30,,OC1-5,,Non manual workers,
3,1635904,1,40,,OC1-3,,"Managers, professionals, technicians and assoc...",
4,1635905,1,50,,OC1,,Managers,


In [3]:
# Helper function
def select_titles_of_level(level=4, titles=None):
    """
    Extracts ISCO category titles for the specified ISCO hierarchy level.

    Parameters
    ----------
    level : int, default 4
        Value of the 'Level' column to select.
    titles : pandas.DataFrame, optional
        Table to read from; it must provide the columns 'Level', 'Code.1'
        and 'Description'. When omitted, the notebook-level ``isco_titles_``
        dataframe is used.

    Returns
    -------
    pandas.DataFrame
        Freshly indexed dataframe with the columns 'isco' (integer code),
        'isco_title' and 'level'.

    Notes
    -----
    Level 1 rows also hold aggregate entries (e.g. 'TOTAL', 'OC1-5'), so for
    that level only codes of the form 'OC' + one character are kept.
    """
    source = isco_titles_ if titles is None else titles

    # Select only codes pertaining to the level. Every comparison is
    # parenthesised: `&` binds tighter than `==`, so an unparenthesised
    # `Level == 1 & mask` would be evaluated as `Level == (1 & mask)`.
    level_mask = source['Level'] == level
    if level == 1:
        codes = source['Code.1']
        level_mask = (
            level_mask
            & codes.str.startswith('OC', na=False)  # missing codes never match
            & (codes.str.len() == 3)
        )

    # Build a new frame (not a view of the source) with the renamed columns
    isco_titles = (
        source.loc[level_mask, ['Code.1', 'Description']]
        .rename(columns={'Code.1': 'isco', 'Description': 'isco_title'})
        .reset_index(drop=True)
    )

    # Drop the two-character prefix (presumably always 'OC'; only verified
    # for level 1) and convert the remainder to an integer
    isco_titles['isco'] = isco_titles['isco'].apply(lambda code: int(code[2:]))
    isco_titles['level'] = level

    return isco_titles

In [4]:
# Get the titles for each ISCO level. Only levels 1 to 4 are requested:
# asking for level 0 would just concatenate an empty dataframe.
isco_level_titles = pd.concat([select_titles_of_level(i) for i in range(1, 5)])

# Remove army roles for now: drop every title containing 'armed' or 'Armed'.
# Comparing with False (rather than negating) also drops rows with a missing title.
isco_level_titles = isco_level_titles[
    isco_level_titles.isco_title.str.contains('armed|Armed').eq(False)]

# Note that some rows may have combined two or more unit groups
isco_level_titles[isco_level_titles.isco.apply(lambda x: len(str(x))) > 4]

Unnamed: 0,isco,isco_title,level
18,22232253215322,"Nurses, midewives, health care assistants and ...",3
19,2223225321,"Nurses, midewives and health care assistants",3
20,222322,Nurses and midwives,3
21,2223222,Nursing professionals and midwives,3
57,22213221,Nurses,4
59,22223222,Midwives,4
270,53215322,Health care assistants and home-based personal...,4


In [5]:
# Keep only the rows whose code has at most four digits; longer codes
# belong to rows that combine two or more unit groups
code_lengths = isco_level_titles['isco'].map(lambda code: len(str(code)))
isco_level_titles = isco_level_titles.loc[code_lengths.le(4)]

In [6]:
# Check the final dataframe (fixed seed so a re-run shows the same rows)
isco_level_titles.sample(5, random_state=0)

Unnamed: 0,isco,isco_title,level
200,3433,"Gallery, museum and library technicians",4
276,5413,Prison guards,4
116,2642,Journalists,4
158,3240,Veterinary technicians and assistants,4
1,1112,Senior government officials,4


In [7]:
# Count how many occupations remain at each level of the hierarchy
titles_by_level = isco_level_titles.groupby('level')
titles_by_level.count()

Unnamed: 0_level_0,isco,isco_title
level,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9,9
2,40,40
3,127,127
4,433,433


In [8]:
# Write the cleaned titles to the processed-data folder (row index omitted)
isco_titles_output_path = data_folder + 'processed/ISCO_occupation_titles.csv'
isco_level_titles.to_csv(isco_titles_output_path, index=False)
