This notebook contains analysis _between_ measures of skills-, industries- and occupations on a sample of ~40K job adverts, engineered to contain both potentially "green" and "non-green" jobs.

In [1]:
from dap_prinz_green_jobs import PROJECT_DIR
from dap_prinz_green_jobs.utils.bert_vectorizer import BertVectorizer

import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg

import pandas as pd
import altair as alt
import umap
from sklearn.cluster import KMeans

import random
from datetime import datetime
import os

In [2]:
alt.data_transformers.disable_max_rows()

#save graphs
today = datetime.today().strftime('%y%m%d')
graph_dir = str(PROJECT_DIR / f"outputs/figures/between_measure_analysis/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

Creating /Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/between_measure_analysis/231213 directory


In [3]:
#instantiate reducers and functions for cleaning up skills

reducer = umap.UMAP(random_state=42)

### 0. Load relevant data for analysis
Load extracted green measures at the skill-, occupation- and industry-level. Also load job titles to contextualise results.

In [4]:
skill_measures_df, occs_measures_df, inds_measures_df, soc_name_dict = pg.load_ojo_green_measures()

2023-12-13 14:07:17,369 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2023-12-13 14:07:17,588 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


### 1. Merge and clean data so green measures are in a df
Clean up green measures and produce two dataframes:
1. numerical green measures;
2. extracted green skills

In [5]:
all_green_measures_df = pg.merge_green_measures(skill_measures_df, occs_measures_df, inds_measures_df, soc_name_dict)

There are 1000000 rows in the merged data
There are 1000000 unique job ids


In [6]:
# get occupations for which we have over 50 job adverts for
representative_occs = (
    all_green_measures_df.groupby("SOC_2020_name")
    .job_id.count()
    .sort_values(ascending=False)
    .where(lambda x: x >= 50)
    .dropna()
    .keys()
    .tolist()
)

print(len(all_green_measures_df))
all_green_measures_df = all_green_measures_df[
    all_green_measures_df["SOC_2020_name"].isin(representative_occs)
].reset_index(drop=True)
print(len(all_green_measures_df))

1000000
821992


In [7]:
#create skills df
green_skills_df = pg.create_skill_df(skill_measures_df)

100%|██████████| 13436987/13436987 [00:10<00:00, 1236240.47it/s]


## 2. **between analysis** of occupations, industries and skills 

In [10]:
# high occupation greenness (based on green share) and % of green skills
all_green_measures_df['occupation'] = all_green_measures_df['SOC_2020_name']

mean_green_timeshare_per_occ = (all_green_measures_df
                                .groupby('occupation')
                                ['GREEN TIMESHARE']
                                .agg(['mean'])
                                .reset_index()
                                .rename(columns={'mean': 'mean_occupation_green_timeshare'}))

green_skill_occ = (all_green_measures_df
.groupby('occupation')
['PROP_GREEN']
.agg(['mean'])
.reset_index()
.rename(columns={'mean': 'total_green_skills'})
.merge(mean_green_timeshare_per_occ, on='occupation', how='left'))

#is there a correlation between the # of green skills requested and the greenness of an occupation?
print('Correlation between the mean # of green skills requested per occupation and the "greenness" of an occupation: ', green_skill_occ['total_green_skills'].corr(green_skill_occ['mean_occupation_green_timeshare']))

Correlation between the mean # of green skills requested per occupation and the "greenness" of an occupation:  0.3694987903037365


In [11]:
# low occupation greenness and high total # of green skills

non_green_occ_green_skills = (all_green_measures_df[all_green_measures_df['GREEN CATEGORY'] == 'Non-Green']
.groupby('occupation')
['PROP_GREEN']
.agg(['sum'])
.reset_index()
.rename(columns={'sum': 'total_green_skills'})
.sort_values(by='total_green_skills', ascending=False)
.query('total_green_skills > 0'))

non_green_occ_green_skills

Unnamed: 0,occupation,total_green_skills
41,Cleaners and domestics,213.962867
29,Care workers and home carers,130.059082
37,Chefs,129.716000
22,Business and financial project management prof...,114.450832
210,Printing machine assistants,114.066867
...,...,...
278,Typists and related keyboard occupations,0.100000
0,"Actuaries, economists and statisticians",0.095238
9,Assemblers and routine operatives n.e.c.,0.058824
212,Probation officers,0.050000


In [None]:
all_green_measures_df_skills = pd.merge(all_green_measures_df, green_skills_df, on='job_id', how='left')

In [19]:
all_green_measures_df.head(5)

Unnamed: 0,job_id,NUM_ORIG_ENTS,NUM_SPLIT_ENTS,ENTS,GREEN_ENTS,PROP_GREEN,BENEFITS,GREEN CATEGORY,GREEN/NOT GREEN,GREEN TIMESHARE,...,INDUSTRY GHG PER UNIT EMISSIONS,INDUSTRY PROP HOURS GREEN TASKS,INDUSTRY PROP WORKERS GREEN TASKS,INDUSTRY PROP WORKERS 20PERC GREEN TASKS,INDUSTRY GHG EMISSIONS PER EMPLOYEE,INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE,NUM_GREEN_ENTS,SOC_2020_name,SOC_2020_EXT_name,occupation
0,41547521,3,3,"[[[Porta Cabins on a construction site], SKILL...",[],0.0,,Non-Green,Non-green,0.0,...,,,,,,,0,Vehicle valeters and cleaners,Vehicle valeters and cleaners,Vehicle valeters and cleaners
1,41547559,2,2,"[[[a Full Clean UK Licence], SKILL], [[Multi-d...",[],0.0,,Non-Green,Non-green,0.0,...,,,,,,,0,Delivery drivers and couriers,Delivery drivers and couriers n.e.c.,Delivery drivers and couriers
2,41547583,8,12,"[[[ambitious], SKILL], [[confident], SKILL], [...",[],0.0,,Non-Green,Non-green,0.0,...,0.02,3.8,17.6,6.4,1.1,2328.5,0,Sports and leisure assistants,Leisure and recreation assistants,Sports and leisure assistants
3,41547585,8,12,"[[[ambitious], SKILL], [[confident], SKILL], [...",[],0.0,,Non-Green,Non-green,0.0,...,0.02,3.8,17.6,6.4,1.1,2328.5,0,Sports and leisure assistants,Leisure and recreation assistants,Sports and leisure assistants
4,41547589,1,1,"[[[preparation of management accounts, forecas...",[],0.0,,Non-Green,Non-green,0.0,...,0.0,11.4,50.2,26.6,0.7,1709.4,0,"Book-keepers, payroll managers and wages clerks","Bookkeepers, payroll managers and wage clerks ...","Book-keepers, payroll managers and wages clerks"


In [28]:
non_green_occ_green_skills_list = non_green_occ_green_skills.occupation.to_list()

jobid2socdict = all_green_measures_df.set_index('job_id')['SOC_2020_name'].to_dict()
green_skills_df['occupation'] = green_skills_df['job_id'].map(jobid2socdict)

(green_skills_df
.query('occupation in @non_green_occ_green_skills_list')
.groupby(['occupation', 'extracted_full_skill'])
.job_id
.count()
.reset_index()
.rename(columns={'job_id': 'count'})
.query('count > 10'))

Unnamed: 0,occupation,extracted_full_skill,count
1,"Actuaries, economists and statisticians",Adobe Photoshop,21
5,"Actuaries, economists and statisticians",C#,17
6,"Actuaries, economists and statisticians",C++,62
8,"Actuaries, economists and statisticians",CSS,14
10,"Actuaries, economists and statisticians",DTP,18
...,...,...,...
174856,Youth work professionals,show motivation,12
174868,Youth work professionals,support people,12
174872,Youth work professionals,taking a proactive approach,17
174885,Youth work professionals,work in team,12


In [29]:
#generate a dataframe with summed green measures per occupation

all_green_measures_df_ents = all_green_measures_df[~all_green_measures_df['GREEN_ENTS'].isna()]
all_green_measures_df_ents['GREEN_ENTS_COUNT'] = all_green_measures_df_ents['GREEN_ENTS'].apply(lambda x: len(x))

all_green_measures_df_occ = (all_green_measures_df_ents
                             .groupby('occupation')
                             .aggregate({'INDUSTRY TOTAL GHG EMISSIONS': ['mean'], 
                                         'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE': ['mean'],
                                         'GREEN TIMESHARE': ['mean'], 
                                         'GREEN_ENTS_COUNT': ['mean'], 
                                         'PROP_GREEN': ['mean']})
                             .reset_index())
all_green_measures_df_occ.columns = all_green_measures_df_occ.columns.levels[0]
all_green_measures_df_occ.columns  = ['occupation', 'industry_ghg_emissions_mean', 'industry_carbon_emissions_employee_mean', 'occupation_green_timeshare_mean', 'green_skills_count_mean', 'green_skill_percentage_mean']

#pick majority occupation greenness
occ_green_cat = all_green_measures_df.groupby('occupation')['GREEN CATEGORY'].agg(
    lambda x: pd.Series.mode(x)[0])
#pick majority green/non-green occupation
occ_green_nongreen = all_green_measures_df.groupby('occupation')['GREEN/NOT GREEN'].agg(
    lambda x: pd.Series.mode(x)[0])
all_green_measures_df_occ['occ_green_non_green'] = all_green_measures_df_occ.occupation.map(occ_green_nongreen)
all_green_measures_df_occ['occ_green_category'] = all_green_measures_df_occ.occupation.map(occ_green_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_green_measures_df_ents['GREEN_ENTS_COUNT'] = all_green_measures_df_ents['GREEN_ENTS'].apply(lambda x: len(x))


In [30]:
# industry greenness vs. mean # of green skills requested
ind_skills_greeness = alt.Chart(all_green_measures_df_occ).mark_circle(size=60).encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    y=alt.Y('industry_ghg_emissions_mean', title="Industry GHG Emissions", scale=alt.Scale(zero=False)),
    color=alt.Color('occ_green_non_green',
                   scale=alt.Scale(
            domain=['Non-green', 'Green'],
            range=['red', 'green']),
                   title='Green / Not Green'),
    tooltip=['occupation', 'green_skills_count_mean', 'green_skill_percentage_mean']
)


# occupational greenness vs. # of green skills requested
occ_skill_greeness = alt.Chart(all_green_measures_df_occ, title="").mark_circle(size=60).encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    y=alt.Y('occupation_green_timeshare_mean', title="Occupational Green Timeshare", scale=alt.Scale(zero=False)),
    color=alt.Color('occ_green_non_green',
                   scale=alt.Scale(
            domain=['Non-green', 'Green'],
            range=['red', 'green']),
                   title='Green / Not Green'),
    tooltip=['occupation', 'green_skills_count_mean', 'occupation_green_timeshare_mean']
)

# industry greenness vs. occupational greenness 

ind_occ_greeness = alt.Chart(all_green_measures_df_occ).mark_circle(size=60).encode(
    x=alt.X('occupation_green_timeshare_mean', title="Time Spent on Green Tasks", scale=alt.Scale(zero=False)),
    y=alt.Y('industry_ghg_emissions_mean', title= "Industry GHG Emissions", scale=alt.Scale(zero=False)),
    color=alt.Color('occ_green_non_green',
                   scale=alt.Scale(
            domain=['Non-green', 'Green'],
            range=['red', 'green']),
                   title="Green / Not Green"),
    tooltip=['occupation', 'occ_green_non_green', 'green_skills_count_mean', 'green_skill_percentage_mean']
)

occ_greeness = (ind_skills_greeness | occ_skill_greeness | ind_occ_greeness).properties(
    title={
      "text": ["Occupational Greenness"], 
      "subtitle": ["The graph on the left compares average GHG emissions vs. % of green skills per occupation.", 
                   "The graph on the right compares average time spent on green tasks vs. % of green skills per occupation.",
                   "The last graph compares average GHG emissions vs. average time spent on green tasks per occupation."],
      "color": "black",
      "subtitleColor": "black"
    }
)

In [31]:
occ_greeness

## Top Occupations

investigate "top" occupations by:

- low industry GHG emissions and highest green skill percentage ("green" occupations)
- high industry GHG emissions and highest green skill percentage ("greening" occupations)
- high industry GHG emissions per employee and lowest green skill percentage ("brown" occupations)

In [32]:
#low industry ghg emissions, high green skill percentage
low_ind_high_gskill = (all_green_measures_df_occ
                       .sort_values(['industry_ghg_emissions_mean', 'green_skill_percentage_mean'], ascending=[True, False])
                       .head(10)
                       [['occupation', 'industry_ghg_emissions_mean', 'green_skill_percentage_mean']])

green_occs = alt.Chart(low_ind_high_gskill).mark_bar().encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    #increase label limit
    y=alt.Y('occupation', title="", sort='-x', axis=alt.Axis(labelLimit=5000)),
    #set the color to green
    color=alt.value('green'))

#high industry ghg emissions, high green skill percentage
high_ind_high_gskill = (all_green_measures_df_occ
                       .sort_values(['industry_ghg_emissions_mean', 'green_skill_percentage_mean'], ascending=[False, False])
                       .head(10)
                       [['occupation', 'industry_ghg_emissions_mean', 'green_skill_percentage_mean']])

greening_occs = alt.Chart(high_ind_high_gskill).mark_bar().encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    #increase label limit
    y=alt.Y('occupation', title="", sort='-x', axis=alt.Axis(labelLimit=5000)),
    #set the color to green
    color=alt.value('#90EE90'))

green_greening_occs = (green_occs | greening_occs).properties(
    title={
      "text": ['"Green" and "Greening" Occupations'], 
      "subtitle": ["The graph on the left shows the top ten occupations by lowest GHG emissions and highest % of green skills.", 
                   "The graph on the right shows the top ten occupations by highest GHG emissions and highest % of green skills."],
      "color": "black",
      "subtitleColor": "black"
    }
)

(occ_greeness & green_greening_occs).save(f'{graph_dir}/occ_greenness.html')

In [33]:
(occ_greeness & green_greening_occs)

## New green skills

Investigate extracted skills defined as "non-green" by the green skills classifier.

In [53]:
# high_green_skills_low_occ_list = all_green_measures_df_occ.query('occ_green_non_green == "Green"').sort_values('green_skill_percentage_mean', ascending=True).head(10).occupation.to_list()
# green_skills_df_extractedskill = green_skills_df[green_skills_df['occupation'].isin(high_green_skills_low_occ_list)]

# new_green_skills = green_skills_df_extractedskill[green_skills_df_extractedskill['GREEN_ENTS'].isna()]['skill_label'].unique().tolist()

bert_model = BertVectorizer().fit()
# new_green_skills_embeds = bert_model.transform(new_green_skills)
# #reduce dimensionality
# reduced_embeds = reducer.fit_transform(new_green_skills_embeds)

# kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
# labels = kmeans.labels_

# new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 'cluster_number': labels, 'x': reduced_embeds[:, 0], 'y': reduced_embeds[:, 1]})

# new_green_skills_clust = alt.Chart(new_skill_cluster_df, title='new "green" skill groups based on high green occupation greenness and low % of green skills requested').mark_circle(size=60).encode(
#     x='x',
#     y='y',
#     #make these categorical
#     color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
#     tooltip=['skill', 'cluster_number']
# ).interactive()

In [54]:
green_occs = list(all_green_measures_df_occ[all_green_measures_df_occ['occ_green_non_green'] == 'Green'].occupation)


random.seed(51)
green_occ = random.choice(green_occs)
print('finding new skills for occupation:', green_occ)

new_green_skills = green_skills_df[(green_skills_df['occupation'] == green_occ) & (green_skills_df['GREEN_ENTS'].isna())]['skill_label'].unique().tolist()

new_green_skills_embeds = bert_model.transform(new_green_skills)

#reduce dimensionality
reduced_embeds = reducer.fit_transform(new_green_skills_embeds)

kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 
                                     'cluster_number': labels, 
                                     'x': reduced_embeds[:, 0], 
                                     'y': reduced_embeds[:, 1]})

occ1_new_skills = alt.Chart(new_skill_cluster_df, title=green_occ).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
    tooltip=['skill', 'cluster_number']
).interactive()

finding new skills for occupation: Delivery operatives


  new_green_skills = green_skills_df[(green_skills_df['occupation'] == green_occ) & (green_skills_df_extractedskill['GREEN_ENTS'].isna())]['skill_label'].unique().tolist()
Exception ignored in: <function tqdm.__del__ at 0x7f9ec1cca700>
Traceback (most recent call last):
  File "/Users/india.kerlenesta/opt/anaconda3/envs/dap_prinz_green_jobs/lib/python3.8/site-packages/tqdm/std.py", line 1161, in __del__
    def __del__(self):
KeyboardInterrupt: 


In [309]:
#reduce embeds 
random.seed(57)
green_occ = random.choice(green_occs)
print('finding new skills for occupation:', green_occ)

new_green_skills = green_skills_df[(green_skills_df['occupation'] == green_occ) & (green_skills_df['GREEN_ENTS'].isna())]['skill_label'].unique().tolist()

new_green_skills_embeds = bert_model.transform(new_green_skills)

reduced_embeds = reducer.fit_transform(new_green_skills_embeds)
kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 
                                     'cluster_number': labels, 
                                     'x': reduced_embeds[:, 0], 
                                     'y': reduced_embeds[:, 1]})

occ2_new_skills = alt.Chart(new_skill_cluster_df, title=green_occ).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
    tooltip=['skill', 'cluster_number']
).interactive()

finding new skills for occupation: Assemblers (electrical and electronic products)


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [310]:
#reduce embeds 
random.seed(14)
green_occ = random.choice(green_occs)
print('finding new skills for occupation:', green_occ)

new_green_skills = green_skills_df[(green_skills_df['occupation'] == green_occ) & (green_skills_df['GREEN_ENTS'].isna())]['skill_label'].unique().tolist()

new_green_skills_embeds = bert_model.transform(new_green_skills)

reduced_embeds = reducer.fit_transform(new_green_skills_embeds)

kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 
                                     'cluster_number': labels, 
                                     'x': reduced_embeds[:, 0], 
                                     'y': reduced_embeds[:, 1]})

occ3_new_skills = alt.Chart(new_skill_cluster_df, title=green_occ).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
    tooltip=['skill', 'cluster_number']
).interactive()

finding new skills for occupation: Business associate professionals n.e.c.


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [311]:
#save skill graphs

new_skills_charts = occ1_new_skills | occ2_new_skills | occ3_new_skills
#add title to the graphs
new_skills_charts.title = 'New "Green" Skills' 

skill_graphs = new_skills_charts
skill_graphs.title = "New Skill Graphs"

skill_graphs.save(f'{graph_dir}/new_green_skills.html')

skill_graphs