This notebook contains an initial analysis of the relationships _between_ skill-, industry- and occupation-level green measures on a sample of 100k job adverts.

In [3]:
from dap_prinz_green_jobs.getters.ojo_getters import (
    get_extracted_green_measures,
    
)

from dap_prinz_green_jobs.getters.occupation_getters import load_job_title_soc
from dap_prinz_green_jobs import BUCKET_NAME, logger
from dap_prinz_green_jobs.utils.bert_vectorizer import BertVectorizer
import pandas as pd
import numpy as np

import altair as alt

import umap
from sklearn.cluster import KMeans
import random

In [20]:
# Enable Altair's 'mimetype' renderer so that the charts defined further down
# can be displayed by the notebook frontend. The object returned by the call
# is echoed as this cell's output.

alt.renderers.enable('mimetype')

RendererRegistry.enable('mimetype')

### 0. Load relevant data for analysis
Load extracted green measures at the skill-, occupation- and industry-level. Also load job titles to contextualise results.

In [5]:
# Step 0: fetch the extracted green measures (from s3, per the original note).
# The result is indexed further down by the keys 'INDUSTRY MEASURES',
# 'OCCUPATION MEASURES' and 'SKILL MEASURES'.
green_measures = get_extracted_green_measures()

# Step 1: build a lookup from SOC 2020 code to its unit group description so
# occupations can be shown by name rather than by code.
soc_lookup_df = load_job_title_soc().set_index("SOC 2020")
soc_occ_dict = soc_lookup_df['SOC 2020 UNIT GROUP DESCRIPTIONS'].to_dict()

### 1. Merge and clean data so green measures are in a df
Clean up green measures and produce two dataframes:
1. numerical green measures;
2. extracted green skills

In [6]:
# --- Industry- and occupation-level measures --------------------------------
industry_measures_df = pd.DataFrame(green_measures['INDUSTRY MEASURES'])
occ_measures_df = pd.DataFrame(green_measures['OCCUPATION MEASURES'])
# Expand every entry of the packed 'SOC' column into its own columns (one of
# them is expected to be 'SOC_2020', which is read on the next statement),
# then drop the packed original.
occ_measures_df = pd.merge(
    occ_measures_df,
    occ_measures_df['SOC'].apply(pd.Series),
    left_index=True,
    right_index=True,
).drop(columns=['SOC'])
occ_measures_df['soc_name'] = occ_measures_df['SOC_2020'].map(soc_occ_dict)

# --- Skills: one row per (job advert, skill) ---------------------------------
green_skills_df = (pd.DataFrame(green_measures['SKILL MEASURES'])
                     .explode('skills')
                     .reset_index(drop=True))
# Each exploded 'skills' entry is unpacked into (skill_label, extracted_skill);
# 'extracted_skill' is in turn unpacked into (extracted_skill, extracted_skill_id).
green_skills_df[['skill_label', 'extracted_skill']] = pd.DataFrame(green_skills_df.skills.tolist(), index=green_skills_df.index)
green_skills_df[['extracted_skill', 'extracted_skill_id']] = pd.DataFrame(green_skills_df.extracted_skill.tolist(), index=green_skills_df.index)
green_skills_df = green_skills_df.drop(columns=['skills'])
# Attach the occupation name to every skill row and keep only the needed columns.
green_skills_df = (
    pd.merge(green_skills_df, occ_measures_df, left_on='job_id', right_on='job_id')
    [['job_id', 'soc_name', 'skill_label', 'extracted_skill', 'extracted_skill_id']]
    .rename(columns={'soc_name': 'occupation'})
)
# Turn empty-string matches into missing values so that `.isna()` identifies
# skills without a green match. The column is assigned back explicitly: a
# chained `df.col.replace(..., inplace=True)` is not guaranteed to update the
# parent frame (Copy-on-Write), and `value=None` was treated as "pad-fill" by
# older pandas releases instead of "replace with a missing value".
green_skills_df['extracted_skill'] = green_skills_df['extracted_skill'].replace('', np.nan)

# --- Numerical skill measures per job advert (no skill text) -----------------
# Number of non-missing extracted skill ids per advert.
num_green_skills_df = (green_skills_df
                     .groupby('job_id')
                     .extracted_skill_id
                     .agg(['count'])
                     .reset_index()
                     .rename(columns={'count': 'green_skills_count'}))
# Number of non-missing skill labels per advert (denominator of the percentage).
num_skills_df = (green_skills_df
                 .groupby('job_id')
                 .skill_label
                 .count()
                 .reset_index()
                 .rename(columns={'skill_label': 'skills_count'}))
skill_measures_df = (
    pd.merge(num_green_skills_df, num_skills_df, how='left', on='job_id')
    .assign(green_skill_percentage=lambda x: x.green_skills_count / x.skills_count)
)

# --- Combine industry, occupation and skill measures (no skill text) ---------
green_measures_df = pd.merge(industry_measures_df, occ_measures_df, on='job_id')
green_measures_df.columns = [i.lower() for i in green_measures_df.columns]
green_measures_df_skills = pd.merge(green_measures_df, skill_measures_df, on='job_id', how='left')

# Only the columns selected here survive, so the rename entries for
# 'job_title_raw', 'company_raw' and 'job_location_raw' currently have no effect.
all_green_measures_df = (green_measures_df_skills
    [['job_id', 'soc_2020', 'soc_name', 'industry_ghg_emissions', 'green category', 'green/not green', 'green timeshare', 'skills_count', 'green_skills_count', 'green_skill_percentage']]
    .rename(columns={
        'job_title_raw': 'job_title',
        'company_raw': 'company',
        'soc_name': 'occupation',
        'job_location_raw': 'job_location',
        'green category': 'occupation_green_category',
        'green/not green': 'occupation_green_not_green',
        'green timeshare': 'occupation_green_timeshare',
    }))

# Keep only adverts that have every measure available.
all_green_measures_df = all_green_measures_df.dropna(subset=['industry_ghg_emissions', 'occupation_green_category', 'occupation_green_not_green', 'occupation_green_timeshare', 'skills_count', 'green_skills_count', 'green_skill_percentage', 'soc_2020'])
# Zeros become missing values in every column. `np.nan` is used because the
# `np.NaN` alias no longer exists in NumPy 2.0.
# NOTE(review): this makes the later per-occupation means skip zero-valued
# adverts -- confirm that is the intended definition.
all_green_measures_df = all_green_measures_df.replace(0, np.nan)

In [7]:
# Keep only occupations that are backed by at least 50 job adverts.
adverts_per_occupation = (
    all_green_measures_df
    .groupby('occupation')['job_id']
    .count()
    .sort_values(ascending=False)
)
representative_occs = adverts_per_occupation[adverts_per_occupation >= 50].index.tolist()

# Restrict the measures frame to those occupations and renumber the rows.
is_representative = all_green_measures_df['occupation'].isin(representative_occs)
all_green_measures_df = all_green_measures_df.loc[is_representative].reset_index(drop=True)

### 2. Analyse green measures between occupations, industries and skills

Look at:
- job adverts that have both **high** occupational greenness and **high** % of green skills 
- job adverts with **low** occupation greenness and **high** % of green skills
- job adverts with **low** industry greenness (defined by high GHG emissions) and **high** occupation greenness

Plot:
- relationships between occupational, skill and industry greenness

In [8]:
# Compare, per occupation, the mean green timeshare with the mean share of
# green skills requested in its adverts.
occupation_groups = all_green_measures_df.groupby('occupation')

mean_green_timeshare_per_occ = (
    occupation_groups['occupation_green_timeshare']
    .mean()
    .rename('mean_occupation_green_timeshare')
    .reset_index()
)

# 'total_green_skills' holds the per-occupation MEAN of green_skill_percentage
# (the column name is kept because it is part of the resulting frame).
mean_green_skill_share_per_occ = (
    occupation_groups['green_skill_percentage']
    .mean()
    .rename('total_green_skills')
    .reset_index()
)
green_skill_occ = pd.merge(
    mean_green_skill_share_per_occ,
    mean_green_timeshare_per_occ,
    on='occupation',
    how='left',
)

# Is the share of green skills requested related to how green the occupation is?
skills_vs_timeshare_corr = green_skill_occ['total_green_skills'].corr(
    green_skill_occ['mean_occupation_green_timeshare']
)
print('Correlation between the #mean  of green skills requested per occ and the greenness of an occupation: ', skills_vs_timeshare_corr)

Correlation between the #mean  of green skills requested per occ and the greenness of an occupation:  -0.09990383424687987


In [9]:
# Occupations categorised as "Non-Green" whose adverts nevertheless request
# green skills: sum the green skill percentage per occupation, rank from
# highest to lowest and keep only strictly positive totals.
non_green_rows = all_green_measures_df[
    all_green_measures_df['occupation_green_category'] == 'Non-Green'
]
non_green_totals_ranked = (
    non_green_rows
    .groupby('occupation')['green_skill_percentage']
    .sum()
    .rename('total_green_skills')
    .reset_index()
    .sort_values(by='total_green_skills', ascending=False)
)
non_green_occ_green_skills = non_green_totals_ranked[
    non_green_totals_ranked['total_green_skills'] > 0
]

In [10]:
non_green_occ_green_skills_list = non_green_occ_green_skills['occupation'].tolist()

# Count how often each extracted green skill occurs within those non-green
# occupations and show the combinations seen more than 10 times.
non_green_skill_counts = (
    green_skills_df[green_skills_df['occupation'].isin(non_green_occ_green_skills_list)]
    .groupby(['occupation', 'extracted_skill'])['job_id']
    .count()
    .reset_index(name='count')
)
non_green_skill_counts[non_green_skill_counts['count'] > 10]

# Observation: the green skills attached to non-green occupations appear to
# relate primarily to health and safety regulations.

Unnamed: 0,occupation,extracted_skill,count
34,"Book-keepers, payroll managers and wages clerks",analysing biological data,11
47,"Book-keepers, payroll managers and wages clerks",ensure compliance with environmental regulations,45
65,"Book-keepers, payroll managers and wages clerks",prepare management plans,21
95,Business and financial project management prof...,ensure compliance with environmental regulations,19
101,Business and financial project management prof...,give advice on environmental risk management s...,42
102,Business and financial project management prof...,health and safety regulations,18
108,Business and financial project management prof...,management plans developing,13
190,Care workers and home carers,perform cleaning activity in an environmentall...,26
221,Caretakers,health and safety regulations,16
226,Caretakers,perform cleaning activity in an environmentall...,13


In [11]:
# Green occupations that sit in high-emission (i.e. low-greenness) industries:
# rank green occupations by mean industry GHG emissions and keep the top 10.
green_occupation_rows = all_green_measures_df[
    all_green_measures_df['occupation_green_not_green'] == 'Green'
]
low_ind_high_occ_green = (
    green_occupation_rows
    .groupby('occupation')['industry_ghg_emissions']
    .mean()
    .rename('mean_industry_ghg_emissions')
    .reset_index()
    .sort_values(by='mean_industry_ghg_emissions', ascending=False)
    .head(10)
)

print(f'green occupations with high industry ghg emissions include: {low_ind_high_occ_green.occupation.to_list()}')

green occupations with high industry ghg emissions include: ['Construction operatives n.e.c.', 'Managers in storage and warehousing', 'Construction and building trades supervisors', 'Design occupations n.e.c.', 'Laboratory technicians', 'Large goods vehicle drivers', 'Production managers and directors in construction', 'Production and process engineers', 'Marketing, sales and advertising directors', 'Public services associate professionals']


In [12]:
# Build one row per occupation with aggregated green measures.
# Named aggregation yields the final flat column names directly, in the listed
# order. This avoids the intermediate step of assigning the (alphabetically
# sorted) first MultiIndex level as column names before overwriting them.
all_green_measures_df_occ = (
    all_green_measures_df
    .groupby('occupation')
    .agg(
        industry_ghg_emissions_mean=('industry_ghg_emissions', 'mean'),
        occupation_green_timeshare_mean=('occupation_green_timeshare', 'mean'),
        green_skills_count_mean=('green_skills_count', 'mean'),
        green_skill_percentage_sum=('green_skill_percentage', 'sum'),
    )
    .reset_index()
)

# Majority (most frequent) green category per occupation; when several values
# tie, the first one returned by `mode()` is used.
occ_green_cat = (
    all_green_measures_df
    .groupby('occupation')
    .occupation_green_category
    .agg(lambda values: values.mode().iloc[0])
)
# Majority green / not-green flag per occupation, chosen the same way.
occ_green_nongreen = (
    all_green_measures_df
    .groupby('occupation')
    .occupation_green_not_green
    .agg(lambda values: values.mode().iloc[0])
)
all_green_measures_df_occ['occ_green_non_green'] = all_green_measures_df_occ.occupation.map(occ_green_nongreen)
all_green_measures_df_occ['occ_green_category'] = all_green_measures_df_occ.occupation.map(occ_green_cat)

In [13]:
# Scatter plot, one point per occupation: mean green timeshare (x) against
# mean industry GHG emissions (y), coloured by the occupation's green category.
green_category_colours = alt.Scale(
    domain=['Non-Green', 'Green Enhanced Skills', 'Green New & Emerging', 'Green Increased Demand'],
    range=['red', '#013220', 'green', '#90ee90'],
)
# Neither axis is forced to start at zero.
industry_vs_occupation_chart = (
    alt.Chart(all_green_measures_df_occ)
    .mark_circle(size=60)
    .encode(
        x=alt.X('occupation_green_timeshare_mean', scale=alt.Scale(zero=False)),
        y=alt.Y('industry_ghg_emissions_mean', scale=alt.Scale(zero=False)),
        color=alt.Color('occ_green_category', scale=green_category_colours),
        tooltip=['occupation', 'green_skills_count_mean', 'green_skill_percentage_sum'],
    )
)
industry_vs_occupation_chart.interactive()

<vega.vegalite.VegaLite at 0x7f9f5975b2b0>



In [14]:
# Scatter plot, one point per occupation: summed green skill percentage (x)
# against mean industry GHG emissions (y), coloured by the green / not-green flag.
# NOTE(review): the colour domain uses 'Non-green' while the category column
# is matched against 'Non-Green' elsewhere in this notebook -- confirm the
# casing agrees with the values in 'occ_green_non_green'.
green_flag_colours = alt.Scale(
    domain=['Non-green', 'Green'],
    range=['red', 'green'],
)
industry_vs_green_skills_chart = (
    alt.Chart(all_green_measures_df_occ)
    .mark_circle(size=60)
    .encode(
        x=alt.X('green_skill_percentage_sum', scale=alt.Scale(zero=False)),
        y=alt.Y('industry_ghg_emissions_mean', scale=alt.Scale(zero=False)),
        color=alt.Color('occ_green_non_green', scale=green_flag_colours),
        tooltip=['occupation', 'green_skills_count_mean', 'green_skill_percentage_sum'],
    )
)
industry_vs_green_skills_chart.interactive()


<vega.vegalite.VegaLite at 0x7f9f59756c40>



In [15]:
print('greening occupations:')
# Rank occupations by lowest mean industry emissions first, breaking ties by
# the highest summed green skill percentage, and show the first 10 names.
ranked_low_emissions_first = (
    all_green_measures_df_occ
    .dropna(subset=['industry_ghg_emissions_mean', 'green_skill_percentage_sum'])
    .sort_values(
        by=['industry_ghg_emissions_mean', 'green_skill_percentage_sum'],
        ascending=[True, False],
    )
)
ranked_low_emissions_first['occupation'].head(10)

greening occupations:


194                                       Welding trades
23                                   Chartered surveyors
121             Nursery education teaching professionals
177                               Solicitors and lawyers
38                             Customer service managers
22     Chartered architectural technologists, plannin...
102                           Legal professionals n.e.c.
55                                  Electrical engineers
96                                Insurance underwriters
101                        Legal associate professionals
Name: occupation, dtype: object

In [16]:
print('green occupations based on industry + green skills:')
# Rank occupations by highest summed green skill percentage first, breaking
# ties by the lowest mean industry emissions, and show the first 10 names.
ranked_green_skills_first = (
    all_green_measures_df_occ
    .dropna(subset=['industry_ghg_emissions_mean', 'green_skill_percentage_sum'])
    .sort_values(
        by=['green_skill_percentage_sum', 'industry_ghg_emissions_mean'],
        ascending=[False, True],
    )
)
ranked_green_skills_first['occupation'].head(10)

green occupations based on industry + green skills:


29                                Cleaners and domestics
192                                 Warehouse operatives
78               Health and safety managers and officers
119     Metal working production and maintenance fitters
141                          Printing machine assistants
64                             Environment professionals
6        Book-keepers, payroll managers and wages clerks
94               Industrial cleaning process occupations
138    Plumbers & heating and ventilating installers ...
152       Quality assurance and regulatory professionals
Name: occupation, dtype: object

In [17]:
print('brown occupations based on industry + green skills:')
# Rank occupations by highest mean industry emissions first, breaking ties by
# the highest summed green skill percentage, and show the first 10 names.
ranked_high_emissions_first = (
    all_green_measures_df_occ
    .dropna(subset=['industry_ghg_emissions_mean', 'green_skill_percentage_sum'])
    .sort_values(
        by=['industry_ghg_emissions_mean', 'green_skill_percentage_sum'],
        ascending=[False, False],
    )
)
ranked_high_emissions_first['occupation'].head(10)

brown occupations based on industry + green skills:


135    Pharmacy and optical dispensing assistants
45                  Delivery drivers and couriers
136                              Physiotherapists
131        Packers, bottlers, canners and fillers
116                          Mental health nurses
134                                   Pharmacists
125                            Office supervisors
35                 Construction operatives n.e.c.
165                   Sales and retail assistants
109           Managers in storage and warehousing
Name: occupation, dtype: object

In [21]:
# Scatter plot, one point per occupation: summed green skill percentage (x)
# against mean green timeshare (y), coloured by the green / not-green flag.
# NOTE(review): the colour domain uses 'Non-green' while the category column
# is matched against 'Non-Green' elsewhere in this notebook -- confirm the
# casing agrees with the values in 'occ_green_non_green'.
occupation_flag_colours = alt.Scale(
    domain=['Non-green', 'Green'],
    range=['red', 'green'],
)
occupation_vs_green_skills_chart = (
    alt.Chart(all_green_measures_df_occ)
    .mark_circle(size=60)
    .encode(
        x=alt.X('green_skill_percentage_sum', scale=alt.Scale(zero=False)),
        y=alt.Y('occupation_green_timeshare_mean', scale=alt.Scale(zero=False)),
        color=alt.Color('occ_green_non_green', scale=occupation_flag_colours),
        tooltip=['occupation', 'green_skills_count_mean'],
    )
)
occupation_vs_green_skills_chart.interactive()

#list of "new" green skills

#list of "new" green occupations

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [22]:
# Candidate "new" green occupations: many green skills requested, but a low
# occupational green timeshare.
print('Occuptations with high % of green skills and low occupation greenness (based on green timeshare):')
ranked_high_skills_low_timeshare = (
    all_green_measures_df_occ
    .dropna(subset=['green_skill_percentage_sum', 'occupation_green_timeshare_mean'])
    # highest summed green skill percentage first, then lowest green timeshare
    .sort_values(
        by=['green_skill_percentage_sum', 'occupation_green_timeshare_mean'],
        ascending=[False, True],
    )
)
ranked_high_skills_low_timeshare.head(10)['occupation'].tolist()

Occuptations with high % of green skills and low occupation greenness (based on green timeshare):


['Warehouse operatives',
 'Health and safety managers and officers',
 'Metal working production and maintenance fitters',
 'Environment professionals',
 'Industrial cleaning process occupations',
 'Plumbers & heating and ventilating installers and repairers',
 'Quality assurance and regulatory professionals',
 'Business and financial project management professionals',
 'Sales accounts and business development managers',
 'Electricians and electrical fitters']

In [23]:
# Occupations with a LOW summed green skill percentage but a HIGH occupational
# green timeshare. The sort below is ascending on the skill measure and
# descending on the timeshare, so the printed heading states exactly that
# (it previously claimed the opposite ordering).
print('occupations with low % of green skills and high occupation greenness (based on green timeshare):')
# NOTE: the variable keeps its original name because the embedding cell that
# follows reads it; despite the name it holds the low-skill / high-greenness
# occupations selected here.
high_green_skills_low_occ_list = (all_green_measures_df_occ
.dropna(subset=['green_skill_percentage_sum', 'occupation_green_timeshare_mean'])
# lowest summed green skill percentage first, then highest green timeshare
.sort_values(['green_skill_percentage_sum', 'occupation_green_timeshare_mean'], ascending=[True, False])
[:10]
).occupation.to_list()
print(high_green_skills_low_occ_list)

occupations with high % of green skills and low occupation greenness (based on green timeshare):
['Authors, writers and translators', 'Communication operators', 'Local government administrative occupations', 'Brokers', 'Other health professionals n.e.c.', 'Education managers ', 'Electrical and electronic trades n.e.c.', 'Estate agents and auctioneers', 'Sales related occupations n.e.c.', 'Other vocational and industrial trainers']


In [24]:
# Collect the distinct skill labels that did NOT match a green skill
# (missing 'extracted_skill') within the occupations selected above.
in_selected_occupations = green_skills_df['occupation'].isin(high_green_skills_low_occ_list)
has_no_green_match = green_skills_df['extracted_skill'].isna()
new_green_skills = list(set(
    green_skills_df.loc[in_selected_occupations & has_no_green_match, 'skill_label']
))

# Fit the project's BERT vectoriser once and embed the collected labels.
bert_model = BertVectorizer().fit()
new_green_skills_embeds = bert_model.transform(new_green_skills)

[94;1;1m2023-06-01 17:43:09,727 - SkillsExtractor - INFO - Getting embeddings for 15317 texts ... (bert_vectorizer.py:35)[0m
[94;1;1m2023-06-01 17:43:09,728 - SkillsExtractor - INFO - .. with multiprocessing (bert_vectorizer.py:38)[0m
[94;1;1m2023-06-01 17:43:54,255 - SkillsExtractor - INFO - Took 44.52693700790405 seconds (bert_vectorizer.py:46)[0m


In [25]:
# Reduce the skill embeddings with UMAP, then group the reduced points into
# 30 clusters with k-means.
# UMAP is stochastic: without a fixed `random_state` every run yields a
# different layout, and therefore different clusters, even though KMeans is
# seeded. Seeding the reducer makes the result repeatable.
reducer = umap.UMAP(random_state=0)
embedding = reducer.fit_transform(new_green_skills_embeds)
kmeans = KMeans(n_clusters=30, random_state=0).fit(embedding)
labels = kmeans.labels_

In [26]:
# One row per skill label: its cluster assignment and the first two
# coordinates of its reduced embedding.
new_skill_cluster_df = pd.DataFrame(
    {
        'skill': new_green_skills,
        'cluster_number': labels,
        'x': embedding[:, 0],
        'y': embedding[:, 1],
    }
)

In [27]:
print('new "green" skill groups based on high green occupation greenness and low % of green skills requested:')
# Lift Altair's row limit so the whole skill frame can be plotted.
alt.data_transformers.disable_max_rows()

# Scatter of the reduced embeddings, coloured by cluster (legend hidden);
# hovering a point shows the skill label and its cluster number.
skill_cluster_chart = (
    alt.Chart(new_skill_cluster_df)
    .mark_circle(size=60)
    .encode(
        x='x',
        y='y',
        color=alt.Color('cluster_number', legend=None),
        tooltip=['skill', 'cluster_number'],
    )
)
skill_cluster_chart.interactive()

new "green" skill groups based on high green occupation greenness and low % of green skills requested:


<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [28]:
# Names of the occupations whose majority flag is "Green".
is_green_occupation = all_green_measures_df_occ['occ_green_non_green'] == 'Green'
green_occs = all_green_measures_df_occ.loc[is_green_occupation, 'occupation'].tolist()

In [29]:
# Pick one green occupation (fixed seed so the pick is repeatable), then
# embed, reduce and cluster its skills that have no green match.
random.seed(50)
green_occ = random.choice(green_occs)

belongs_to_occupation = green_skills_df['occupation'] == green_occ
has_no_green_match = green_skills_df['extracted_skill'].isna()
new_green_skills = list(set(
    green_skills_df.loc[belongs_to_occupation & has_no_green_match, 'skill_label']
))

new_green_skills_embeds = bert_model.transform(new_green_skills)
# Re-fit the existing reducer on this occupation's embeddings, then cluster
# the reduced points into 15 groups.
embedding = reducer.fit_transform(new_green_skills_embeds)
kmeans = KMeans(n_clusters=15, random_state=0).fit(embedding)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame(
    {
        'skill': new_green_skills,
        'cluster_number': labels,
        'x': embedding[:, 0],
        'y': embedding[:, 1],
    }
)

print(f'new "green" skill groups based on green "{green_occ}"occupation')

# Lift Altair's row limit so the whole skill frame can be plotted.
alt.data_transformers.disable_max_rows()

occupation_cluster_chart = (
    alt.Chart(new_skill_cluster_df)
    .mark_circle(size=60)
    .encode(
        x='x',
        y='y',
        color=alt.Color('cluster_number', legend=None),
        tooltip=['skill', 'cluster_number'],
    )
)
occupation_cluster_chart.interactive()

[94;1;1m2023-06-01 17:44:11,444 - SkillsExtractor - INFO - Getting embeddings for 3779 texts ... (bert_vectorizer.py:35)[0m
[94;1;1m2023-06-01 17:44:11,444 - SkillsExtractor - INFO - .. with multiprocessing (bert_vectorizer.py:38)[0m
[94;1;1m2023-06-01 17:44:29,094 - SkillsExtractor - INFO - Took 17.649940967559814 seconds (bert_vectorizer.py:46)[0m


new "green" skill groups based on green "Transport and distribution clerks and assistants"occupation


<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [30]:
# Pick another green occupation (fixed seed so the pick is repeatable), then
# embed, reduce and cluster its skills that have no green match.
random.seed(57)
green_occ = random.choice(green_occs)

belongs_to_occupation = green_skills_df['occupation'] == green_occ
has_no_green_match = green_skills_df['extracted_skill'].isna()
new_green_skills = list(set(
    green_skills_df.loc[belongs_to_occupation & has_no_green_match, 'skill_label']
))

new_green_skills_embeds = bert_model.transform(new_green_skills)
# Re-fit the existing reducer on this occupation's embeddings, then cluster
# the reduced points into 10 groups.
embedding = reducer.fit_transform(new_green_skills_embeds)
kmeans = KMeans(n_clusters=10, random_state=0).fit(embedding)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame(
    {
        'skill': new_green_skills,
        'cluster_number': labels,
        'x': embedding[:, 0],
        'y': embedding[:, 1],
    }
)

print(f'new "green" skill groups based on green "{green_occ}"occupation')

# Lift Altair's row limit so the whole skill frame can be plotted.
alt.data_transformers.disable_max_rows()

occupation_cluster_chart = (
    alt.Chart(new_skill_cluster_df)
    .mark_circle(size=60)
    .encode(
        x='x',
        y='y',
        color=alt.Color('cluster_number', legend=None),
        tooltip=['skill', 'cluster_number'],
    )
)
occupation_cluster_chart.interactive()

[94;1;1m2023-06-01 17:44:39,562 - SkillsExtractor - INFO - Getting embeddings for 722 texts ... (bert_vectorizer.py:35)[0m
[94;1;1m2023-06-01 17:44:39,562 - SkillsExtractor - INFO - .. with multiprocessing (bert_vectorizer.py:38)[0m
[94;1;1m2023-06-01 17:44:50,498 - SkillsExtractor - INFO - Took 10.936217069625854 seconds (bert_vectorizer.py:46)[0m


new "green" skill groups based on green "Carpenters and joiners"occupation


<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [31]:
# Pick a third green occupation (fixed seed so the pick is repeatable), then
# embed, reduce and cluster its skills that have no green match.
random.seed(12)
green_occ = random.choice(green_occs)

belongs_to_occupation = green_skills_df['occupation'] == green_occ
has_no_green_match = green_skills_df['extracted_skill'].isna()
new_green_skills = list(set(
    green_skills_df.loc[belongs_to_occupation & has_no_green_match, 'skill_label']
))

new_green_skills_embeds = bert_model.transform(new_green_skills)
# Re-fit the existing reducer on this occupation's embeddings, then cluster
# the reduced points into 10 groups.
embedding = reducer.fit_transform(new_green_skills_embeds)
kmeans = KMeans(n_clusters=10, random_state=0).fit(embedding)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame(
    {
        'skill': new_green_skills,
        'cluster_number': labels,
        'x': embedding[:, 0],
        'y': embedding[:, 1],
    }
)

print(f'new "green" skill groups based on green "{green_occ}"occupation')

# Lift Altair's row limit so the whole skill frame can be plotted.
alt.data_transformers.disable_max_rows()

occupation_cluster_chart = (
    alt.Chart(new_skill_cluster_df)
    .mark_circle(size=60)
    .encode(
        x='x',
        y='y',
        color=alt.Color('cluster_number', legend=None),
        tooltip=['skill', 'cluster_number'],
    )
)
occupation_cluster_chart.interactive()

[94;1;1m2023-06-01 17:44:52,445 - SkillsExtractor - INFO - Getting embeddings for 1494 texts ... (bert_vectorizer.py:35)[0m
[94;1;1m2023-06-01 17:44:52,446 - SkillsExtractor - INFO - .. with multiprocessing (bert_vectorizer.py:38)[0m
[94;1;1m2023-06-01 17:45:06,492 - SkillsExtractor - INFO - Took 14.045575141906738 seconds (bert_vectorizer.py:46)[0m


new "green" skill groups based on green "Science, engineering and production technicians n.e.c."occupation


<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Next steps

1. **Skills improvement**: it looks like the green skill 'health and safety regulations' heavily skews the skills-based results. We will also need to develop a method to determine whether unmatched skill clusters are indeed green, even when the occupation is considered green.

2. **Better sample**: given how few job adverts contain 'green' skills, should we engineer a sample that is deliberately enriched with adverts containing green skills?