# Clustering of ESCO occupations

To find and analyse groups of related occupations that share similar job requirements and work characteristics, we applied clustering to the newly obtained occupation similarity values. The resulting grouping of occupations organises the 2942 ESCO occupations at two hierarchical levels – which we call skills-based sectors and sub-sectors – with 14 groups at the first level and 54 groups at the second level.

For more details on the clustering methodology, consult pp. 98-102 of the Mapping Career Causeways report.

# 0. Import dependencies and inputs

In [5]:
%run ../notebook_preamble.ipy

import mapping_career_causeways.cluster_utils as cluster_utils
import mapping_career_causeways.cluster_profiling_utils as cluster_profiling_utils

data = load_data.Data()

In [29]:
# Import the occupation similarity matrix
# W_combined = load_data.Similarities().W_combined
W_combined = np.load('/Users/karliskanders/Documents/career_causeways/data/processed/sim_matrices/occupationSimilarity_Combined_old.npy')

# Create a symmetric similarity matrix for clustering
W_cluster = 0.5*W_combined + 0.5*W_combined.T
W_cluster.shape

(2942, 2942)

In [30]:
# Number of occupations
n_occ = W_cluster.shape[0]

# 1. Set up clustering parameters

In [31]:
# Name of this clustering session
session_name = 'ESCO_occ_v1'
# Number of nearest neighbours used for the graph construction
nearest_neighbours = [15, 20, 25, 30, 60]
# Ensemble size for the first step
N = 1000
# Ensemble size for the consensus step
N_consensus = 100
# Number of clustering trials for each nearest neighbour value
N_nn = N // len(nearest_neighbours)
# Which clusters to break down from the partition
clusters = 'all' # Either a list of integers, or 'all'
# Path to save the clustering results
fpath = f'{data_folder}interim/raw_clustering/ESCO_occ_v1/'

clustering_params = {
    'N': N,
    'N_consensus': N_consensus,
    'N_nn': N_nn,
    'clusters': clusters,
    'fpath': fpath,
    'session_name': session_name,
    'nearest_neighbours': nearest_neighbours}

# 2. Perform two steps of clustering

## 2.1 Level-1

In [32]:
# Prepare and save the Level-0 partition file (all in one cluster)
partition_df = pd.DataFrame()
partition_df['id'] = data.occupations.id.to_list()
partition_df['cluster'] = np.zeros((len(data.occupations)))
partition_df.to_csv(fpath+session_name+'_clusters_Level0.csv')

# Set the random_state variable for reproduciblity
clustering_params['random_state'] = 14523

In [33]:
# Perform the clustering
cluster_utils.subcluster_nodes(W=W_cluster, l=0, **clustering_params)

Partitioning cluster 0.0...
Building the graph... done!
Clustering graph with 15 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  24.00 seconds
Building the graph... done!
Clustering graph with 20 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  23.00 seconds
Building the graph... done!
Clustering graph with 25 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [34]:
# Collect subclusters into one partition
partition_1 = cluster_utils.collect_subclusters(0, fpath, session_name, n_total=n_occ)

# Check that partition contains all nodes
len(partition_1)

Final partition saved in /Users/karliskanders/Documents/mapping-career-causeways/codebase/data/interim/raw_clustering/ESCO_occ_v1/ESCO_occ_v1_clusters_Level1.csv


2942

In [35]:
# Check a summary clustering result
cluster_utils.ConsensusClustering.describe_partition(partition_1.cluster.values)

Clustering with 2942 nodes and 14 clusters.


{'n': 14,
 'sizes': [600, 426, 322, 272, 237, 185, 178, 145, 129, 124, 89, 87, 74, 74]}

In [19]:
# Check a summary clustering result
cluster_utils.ConsensusClustering.describe_partition(partition_1.cluster.values)

Clustering with 2942 nodes and 14 clusters.


{'n': 14,
 'sizes': [600, 470, 323, 272, 237, 186, 179, 145, 129, 89, 87, 77, 74, 74]}

## 2.2 Level-2 clusters

In [155]:
# Load the partition that we wish to further split apart
partition = pd.read_csv(fpath + session_name + '_clusters_Level1.csv')

# Set the random_state variable for reproduciblity
clustering_params['random_state'] = 1

In [156]:
# Check that we have all the nodes
len(partition)

2942

In [157]:
clustering_params

{'N': 1000,
 'N_consensus': 100,
 'N_nn': 200,
 'clusters': 'all',
 'fpath': '../../data/interim/raw_clustering/ESCO_occ_v1/',
 'session_name': 'ESCO_occ_v1',
 'nearest_neighbours': [15, 20, 25, 30, 60],
 'random_state': 1}

In [158]:
# Perform the clustering
cluster_utils.subcluster_nodes(W=W_cluster, l=1, **clustering_params)


Partitioning cluster 0...
Building the graph... done!
Clustering graph with 15 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  3.00 seconds
Building the graph... done!
Clustering graph with 20 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  4.00 seconds
Building the graph... done!
Clustering graph with 25 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  2.00 seconds
Building the graph... done!
Clustering graph with 60 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  4.00 seconds
Clustering the consensus partition...
Setting random seeds...
Using co-occurrence matrix to do consensus clustering...
Building the graph... done!
Setting random seeds...
Generating an ensemble with 100 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Average pairwise AMI across 100 partitions is 0.9744
Clustering with 272 nodes 

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  1.00 seconds
Building the graph... done!
Clustering graph with 20 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  1.00 seconds
Building the graph... done!
Clustering graph with 25 nearest-neighbours...
Setting random seeds...
Generating an ensemble with 200 partitions...
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Elapsed time:  1.00 seconds
Building the graph... d

In [15]:
# Adjust the clustering labels and save
partition_2 = cluster_utils.collect_subclusters(1, fpath, session_name, n_total = n_occ)
partition_2.cluster = partition_2.cluster-1
partition_2.to_csv(fpath + 'ESCO_occ_v1_clusters_Level2.csv', index=False)

UnboundLocalError: local variable 'data' referenced before assignment

In [199]:
cluster_utils.ConsensusClustering.describe_partition(partition_2.cluster.values);

Clustering with 2942 nodes and 54 clusters.


# 3. Combine the partitions from both steps

In [201]:
partition_1 = pd.read_csv(fpath + session_name + '_clusters_Level1.csv')
partition_2 = pd.read_csv(fpath + session_name + '_clusters_Level2.csv')

# Create a dataframe with all three partitions
partitions = partition_1.merge(partition_2, on='id')
partitions = partitions.rename(columns={'cluster_x': 'level_1', 'cluster_y': 'level_2'})

# Relabel Level 2 clusters to match the ordering of Level 1 clusters
partitions = partitions.sort_values(['level_1','level_2'])
level_2_labels = partitions.drop_duplicates('level_2').level_2.to_list()
level_2_new_labels = list(range(len(level_2_labels)))
relabel_dict = dict(zip(level_2_labels, level_2_new_labels))
partitions.level_2 = partitions.level_2.apply(lambda x: relabel_dict[x])
partitions = partitions.sort_values('id')

partitions.sort_values(['level_1','level_2'])

Unnamed: 0,id,level_1,level_2
2,2,0,0
51,51,0,0
83,83,0,0
93,93,0,0
113,113,0,0
...,...,...,...
604,604,13,53
1281,1281,13,53
1917,1917,13,53
2850,2850,13,53


In [218]:
# Final dataframe with occupation clusters
occ_cluster = load_data.Data().occupation_hierarchy.merge(partitions, on='id')
occ_cluster = occ_cluster[['id', 'concept_uri', 'preferred_label',
                           'isco_level_4',
                           'level_1', 'level_2']]

In [219]:
len(occ_cluster)

2942

# 4. Profile the clusters

In [220]:
occ_cluster.head(2)

Unnamed: 0,id,concept_uri,preferred_label,isco_level_4,onet_code,onet_title,level_1,level_2
0,0,http://data.europa.eu/esco/occupation/00030d09...,technical director,2166,27-1011.00,art directors,5,25
1,1,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,8121,51-4021.00,"extruding and drawing machine setters, operato...",3,16


In [221]:
occ_df = occupations.copy()
occ_df['onet_title'] = occ_risk['onet_title']
occ_df.loc[occ_df.onet_title.isnull(), 'onet_title'] = 'military occupation'

In [227]:
# Clusters keywords
keywords_level_1, keywords_level_1_ = profile_clusters_utils.tfidf_keywords(partitions.level_1.values,
                                                                            occ_df, 'preferred_label', [])
keywords_level_2, keywords_level_2_ = profile_clusters_utils.tfidf_keywords(partitions.level_2.values,
                                                                            occ_df, 'preferred_label', [])


In [233]:
# keywords_level_2

In [234]:
clusters_level_1 = partitions.copy().drop_duplicates('level_1').sort_values('level_1')
clusters_level_1 = clusters_level_1.drop(['level_2','id'], axis=1)
clusters_level_1['keywords'] = keywords_level_1
clusters_level_1 = clusters_level_1.reset_index(drop=True)

clusters_level_2 = partitions.copy().drop_duplicates(['level_1','level_2']).sort_values(['level_2'])
clusters_level_2 = clusters_level_2.drop(['id'], axis=1)
clusters_level_2['keywords'] = keywords_level_2
clusters_level_2 = clusters_level_2.reset_index(drop=True)


In [235]:
clusters_level_1

Unnamed: 0,level_1,keywords
0,0,"technician, operator, inspector, assembler, mi..."
1,1,"manager, officer, policy, policy officer, anal..."
2,2,"shop, shop manager, seller, specialised seller..."
3,3,"operator, machine operator, machine, maker, pr..."
4,4,"engineer, technician, drafter, engineering, en..."
5,5,"artist, director, editor, designer, painter, j..."
6,6,"teacher, school, lecturer, secondary school, s..."
7,7,"leather, leather good, textile, footwear, oper..."
8,8,"operator, food, operator food, machine operato..."
9,9,"import export, export, import, export manager,..."


In [236]:
clusters_level_2

Unnamed: 0,level_1,level_2,keywords
0,0,0,"assembler, technician, inspector, engineering ..."
1,0,1,"supervisor, construction, installer, operator,..."
2,0,2,"technician, repair technician, repair, operato..."
3,0,3,"officer, driver, guard, pilot, aviation, airpo..."
4,0,4,"environmental, inspector, worker, waste, prote..."
5,0,5,"mine, engineer, manager mine, operator, mining..."
6,1,6,"manager, supervisor, assembly supervisor, asse..."
7,1,7,"financial, insurance, analyst, manager, broker..."
8,1,8,"assistant, clerk, officer, court, administrati..."
9,1,9,"manager, business, product manager, officer, p..."


# 5. Export the final partitions

In [240]:
occ_cluster.to_csv(data_folder + f'interim/raw_clustering/partitions_{session_name}.csv', index=False)
clusters_level_1.to_csv(data_folder + f'interim/raw_clustering/partitions_{session_name}_LEVEL1.csv', index=False)
clusters_level_2.to_csv(data_folder + f'interim/raw_clustering/partitions_{session_name}_LEVEL2.csv', index=False)


# 6. Deeper manual profiling and curation

After obtaining two levels of occupational clusters, we performed a manual review of the results to label them and to make adjustments to the cluster membership of nodes if necessary.

## 6.1 Pre-requisites

In [5]:
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px

# Coordinates for the visualisation
X_ = np.load(data_folder + 'interim/ESCO_occ_UMAP_2D_embedding.npy')

# Colors
color_pal = ['#DCDCDC', '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
             '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
             '#008080', '#e6beff', '#9a6324', '#808080', '#800000',
             '#aaffc3', '#000000', '#ffd8b1', '#808000', '#000075']

In [6]:
# Cluster data
occ_clust = pd.read_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1.csv')
clusters_level_1 = pd.read_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1_LEVEL1_manual.csv')
clusters_level_2 = pd.read_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1_LEVEL2_manual.csv')


In [7]:
clusters_level_1

Unnamed: 0,level_1,keywords,keywords_manual
0,0,"technician, operator, inspector, assembler, mi...","technicians, construction, transport & securit..."
1,1,"manager, officer, policy, policy officer, anal...",business administration workers
2,2,"shop, shop manager, seller, specialised seller...",sales & services workers
3,3,"operator, machine operator, machine, maker, pr...",manufacturing workers
4,4,"engineer, technician, drafter, engineering, en...",engineers & researchers
5,5,"artist, director, editor, designer, painter, j...",arts & media workers
6,6,"teacher, school, lecturer, secondary school, s...",education workers
7,7,"leather, leather good, textile, footwear, oper...","textile, clothing, leather & footwear manufact..."
8,8,"operator, food, operator food, machine operato...",food & tobacco production workers
9,9,"import export, export, import, export manager,...",logistics workers


In [8]:
def get_cluster_counts(level, clusters_level):
    level_counts = occ_clust.groupby(level).count()
    level_counts[level] = level_counts.index.to_list()
    level_counts = level_counts.rename(columns={'id': 'counts'}).reset_index(drop=True)
    clusters_level = clusters_level.merge(level_counts[[level, 'counts']], on=level, how='left')
    return clusters_level


In [9]:
# Number of skills per cluster
clusters_level_1 = get_cluster_counts('level_1', clusters_level_1)
clusters_level_2 = get_cluster_counts('level_2', clusters_level_2)


In [10]:
# Viz dataframe
def highlight_cluster(cluster_id, level, clusters_level):
    
    occ_viz = occ_clust.copy()
    occ_viz = occ_viz.merge(occ_risk[['id','risk_cat_label']], on='id', how='left')
    occ_viz = occ_viz.merge(clusters_level[[level,'keywords_manual']], on=level, how='left')

    occ_viz['x'] = X_[:,0]
    occ_viz['y'] = X_[:,1]

    # Select a label to highlight
    lab = 'Very high risk'
    occ_viz['color'] = 'other'
    occ_viz.loc[((occ_viz[level]==cluster_id)&(occ_viz.risk_cat_label==lab)),'color'] = 'very high risk'
    occ_viz.loc[((occ_viz[level]==cluster_id)&(occ_viz.risk_cat_label!=lab)),'color'] = 'NOT very high risk'

    color_discrete_map={
        "other": color_pal[0],
        "very high risk": 'red',
        "NOT very high risk": "blue"}

    fig = px.scatter(occ_viz, x='x', y='y',
                     hover_data=['id','preferred_label', 'onet_title', 
                                 'risk_cat_label', 'level_1', 'level_2', 'keywords_manual'],
                     color='color',
                     color_discrete_map = color_discrete_map,
                     width=1000, height=500, opacity=0.66)
    fig.update_traces(textposition='top center', textfont_size=10)
    fig.update_layout(plot_bgcolor='rgb(255,255,255)', title=clusters_level.loc[cluster_id].keywords_manual)
#     fig.update_xaxes(range=[-10,10])
    # fig.update_yaxes(range=[-2, 22])
    fig.show()



In [11]:
# Save the edits here
curated_clusters = occ_clust[['id', 'level_1', 'level_2']].copy()


## 6.2 CHECK: Majority of neighbors

In [222]:
def check_majority_neighbors(j_id, k = 10, level='level_2'):
    # Find closest neighbors
    df = find_closest(j_id, W_combined, occ_clust).iloc[1:k].groupby(level).count()
    # Clusters
    clusts = df.index.to_list()
    # Counts
    counts = df.id.values
    # Proportion of neighbors in each cluster
    vote = counts / counts.sum()
    # Majority cluster
    majority_clust = clusts[np.argsort(vote)[-1]]
    majority_vote = max(vote)
    is_same = (majority_clust == occ_clust.loc[j_id][level])
    
    # Assigned cluster
    own_clust_index = np.where(np.array(clusts)==occ_clust.loc[j_id][level])[0]
    if len(own_clust_index)!=0:
        own_clust_vote = vote[own_clust_index[0]]
    else:
        own_clust_vote=0
        
    return is_same, majority_vote, majority_clust, own_clust_vote


def get_majority_neighbor_df(k = 10, level='level_2', clusters_level=clusters_level_2):
    
    majority_neighbors_df = pd.DataFrame(data={
        'id':[],
        'is_same':[],
        'majority_vote':[],
        'majority_clust':[],
        'own_clust_vote': []})

    for i, row in occ_clust.iterrows():
        is_same, majority_vote, majority_clust, own_clust_vote = check_majority_neighbors(row.id, k, level)
        majority_neighbors_df = majority_neighbors_df.append({
            'id': row.id,
            'is_same': is_same,
            'majority_vote': majority_vote,
            'majority_clust': majority_clust,
            'own_clust_vote': own_clust_vote
        }, ignore_index=True)
    
    # Add titles, cluster labels and order columns
    majority_neighbors_df = majority_neighbors_df.merge(occ_clust[[level,'id', 'preferred_label']], on='id', how='left')
    majority_neighbors_df.rename(columns={level:'own_clust'}, inplace=True)
    majority_neighbors_df = majority_neighbors_df.merge(clusters_level[[level, 'keywords_manual']],
                              left_on='own_clust', right_on=level, how='left')
    majority_neighbors_df = majority_neighbors_df.merge(clusters_level[[level, 'keywords_manual']],
                              left_on='majority_clust', right_on=level, how='left')
    majority_neighbors_df.rename(columns={'keywords_manual_x': 'own_label'}, inplace=True)
    majority_neighbors_df.rename(columns={'keywords_manual_y': 'majority_label'}, inplace=True)
    majority_neighbors_df.drop([level+'_x',level+'_y'], axis=1, inplace=True)
        
    return majority_neighbors_df

In [233]:
majority_neighbors = get_majority_neighbor_df(15, 'level_2', clusters_level_2)

In [234]:
majority_neighbors.groupby('is_same').count()

Unnamed: 0_level_0,id,majority_vote,majority_clust,own_clust_vote,own_clust,preferred_label,own_label,majority_label
is_same,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,231,231,231,231,231,231,231,231
1.0,2711,2711,2711,2711,2711,2711,2711,2711


In [235]:
# Ratio
majority_neighbors['ratio'] = majority_neighbors['majority_vote']/majority_neighbors_level_1['own_clust_vote']


In [237]:
majority_neighbors = majority_neighbors[['id','preferred_label','is_same','own_clust_vote','majority_vote',
                                         'own_clust','majority_clust',
                                         'own_label','majority_label', 'ratio']]

Manually assess occupations that are not in the same cluster as the majority of their neighbours...

In [598]:


# Interesting one...
art_preservation_cluster = [
    569,
    121,
    1153,
    1966,
    1712,
    1787,
    1402
]

In [270]:
# find_closest(43, W_combined, occ_clust).head(25)[['preferred_label','level_1','level_2']]

'Aquaculture mooring managers carry out and supervise the mooring of cages in stable stations, drifting cages or even self-propelled and semi-submerged cages. They safely operate and moor a variety of different types of large-scale cages, manage conditions such as currents, wave climate and seabed profile, in open or semi-open water areas.'

In [322]:
majority_neighbors.sort_values('ratio', ascending=False).loc[820]

id                                   820
preferred_label     ICT capacity planner
is_same                                0
own_clust_vote                  0.214286
majority_vote                   0.285714
own_clust                             38
majority_clust                        45
own_label          distribution managers
majority_label              ict managers
ratio                            1.33333
Name: 820, dtype: object

## 6.3 CHECK: Qualitative check

In [616]:
cluster_id = 21
print(f'{clusters_level_2.loc[cluster_id].counts} occupations in this cluster.')
highlight_cluster(cluster_id, 'level_2', clusters_level_2)

85 occupations in this cluster.


In [541]:
for i, row in occ_clust[occ_clust.level_2==cluster_id].sort_values('preferred_label').iterrows():
    print(f'{row.preferred_label} ({row.id})')

artistic painter (667)
cartoonist (2739)
ceramic painter (42)
ceramicist (2733)
community artist (943)
conceptual artist (1785)
decorative painter (781)
digital artist (2511)
drawing artist (796)
furniture designer (2219)
glass artist (2112)
glass painter (1299)
glass-blower (1204)
interior designer (1393)
performance artist (1969)
porcelain painter (2566)
sculptor (1763)
sign maker (1911)
street artist (108)
video artist (1484)
wood painter (1780)


In [621]:
occupations.loc[2598].description

'Energy consultants advise clients on the advantages and disadvantages of different energy sources. They help clients to understand energy tariffs and try to reduce their energy consumption and carbon footprint by using energy efficient products and methods.'

In [622]:
find_closest(2598, W_combined, occ_clust).head(25)[['preferred_label','level_1','level_2', 'similarity']]

Unnamed: 0,preferred_label,level_1,level_2,similarity
2598,energy consultant,0,1,0.999625
2914,solar energy sales consultant,2,12,0.499103
2932,renewable energy sales representative,2,12,0.485631
124,domestic energy assessor,2,14,0.481013
215,renewable energy consultant,2,12,0.476293
908,electricity sales representative,2,12,0.467958
2412,energy analyst,4,21,0.43926
430,energy conservation officer,4,21,0.438406
2364,energy assessor,4,21,0.429292
1959,specialist dentist,10,41,0.371259


## 6.4 CHECK: Manual curation

In [12]:
switch_labels = {
    131: 33, # textile machine operator to textiles
    1982: 18, # taxidermist to craftsmen
    847: 23, # rental technician to technicians
    1948: 49, # game keeper to animal husbandry
    2497: 50, # aquaculture
    1179: 48,
    2835: 48,
    1219: 48,
    1161: 47,
    613: 45, # picture archiving sys admin
    1531: 41, # speech & language to the advanced
    2331: 41,
    820: 45,
    1405: 53, # graphologist
    1446: 36, # bunch of operators re-assigned to operator cluster
    11: 36,
    2089: 36,
    2651: 36,
    1584: 36,
    2278: 36,
    1345: 36,
    2741: 36,
    507: 36,
    1446: 36,
    1747: 36,
    200: 36,
    1206: 36,
    1343: 36,
    1123: 36,
    2185: 36,
    1266: 11, # aircraft groomer should go to cleaners 
    2134: 11, # airline food service worker should go to food services
    744: 3, # warehouse workers & movers to transport & security
    2754: 3,
    2144: 3,
    2368: 3,
    2424: 3,
    2258: 3,
    2825: 3,
    1884: 28, # trainers to instructors
    145: 28,
    2650: 20, # conservation scientists to research
    690: 43, # personal trainer to physical therapists
    1144: 42, # pharmacists to technicians & scientists
    1486: 42,
    1703: 42,
    2645: 11, # amusement and recreation attendant to services
    1969: 26, # performance artist
    1337: 23, # camera operator to media engineers
    852: 47, ## Language engineer to data? Interesting that it is together with other engineers 
    330: 20, # chemical application specialist to science technician
    435: 2, # Greaser to mechanics
    1673: 16, #
    2359: 28, # fitness instructor to instructors
    778: 11, # body artist 
    904: 3, ## airport CEO to transport
    2639: 10,
    1413: 43, ## weight loss consultant
    913: 3, ## intelligence officer
    1428: 3, ## private detective
    1850: 2, # vending machine operators
    687: 11,
    693: 6,
    611: 1, # construction
    2768: 1, # construction
    1911: 23, # art tech
    2219: 23, # art tech
    569: 18, # book and art restorers to craftsmen
    121: 18,
    1153: 23,
    1966: 20, # museum scientist
    1712: 25,
    1787: 25,
    1402: 23,
    2598: 12,
    889: 51 # social worker to social workers
}

In [13]:
len(switch_labels.keys())

75

In [14]:
df = occ_clust.loc[list(switch_labels.keys())].copy()
df['level_2'] = list(switch_labels.values())
df = df.merge(clusters_level_2[['level_2', 'keywords_manual']], how='left', on='level_2')
df = df.sort_values(['level_2'])
df[['id','preferred_label','level_2','keywords_manual']].iloc[30:]

Unnamed: 0,id,preferred_label,level_2,keywords_manual
68,1153,art handler,23,art & media technicians
72,1402,conservator,23,art & media technicians
2,847,performance rental technician,23,art & media technicians
65,2219,furniture designer,23,art & media technicians
64,1911,sign maker,23,art & media technicians
70,1712,exhibition registrar,25,creative managers & graphic designers
71,1787,collection manager,25,creative managers & graphic designers
46,1969,performance artist,26,perfomers
52,2359,fitness instructor,28,instructors & vocational teachers
39,145,corporate trainer,28,instructors & vocational teachers


### Switch the labels and save the curated version as `v1_1`

In [15]:
# Level-2
for job_i in list(switch_labels.keys()):
    occ_clust.loc[job_i, 'level_2'] = switch_labels[job_i]
    new_clust_level_1 = clusters_level_2[clusters_level_2.level_2==switch_labels[job_i]].level_1.values[0]
    occ_clust.loc[job_i, 'level_1'] = new_clust_level_1
    

In [16]:
occ_clust.to_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1_1_curated.csv', index=False)


In [21]:
clusters_level_1 = pd.read_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1_LEVEL1_manual.csv')
clusters_level_2 = pd.read_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1_LEVEL2_manual.csv')

# Number of skills per cluster
clusters_level_1 = get_cluster_counts('level_1', clusters_level_1)
clusters_level_2 = get_cluster_counts('level_2', clusters_level_2)

clusters_level_1.to_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1_1_LEVEL1.csv', index=False)
clusters_level_2.to_csv(data_folder + 'processed/clusters/ESCO_occupation_clusters_v1_1_LEVEL2.csv', index=False)
