This notebook contains analysis _between_ measures of skills-, industries- and occupations on a sample of ~40K job adverts, engineered to contain both potentially "green" and "non-green" jobs.

In [2]:
from dap_prinz_green_jobs.getters.occupation_getters import load_job_title_soc
from dap_prinz_green_jobs import BUCKET_NAME, logger, PROJECT_DIR
from dap_prinz_green_jobs.utils.bert_vectorizer import BertVectorizer
from dap_prinz_green_jobs.getters.data_getters import load_s3_data

import pandas as pd
import numpy as np

import altair as alt

import umap
from sklearn.cluster import KMeans
import random

from datetime import datetime

import os

In [3]:
alt.data_transformers.disable_max_rows()

#save graphs
today = datetime.today().strftime('%y%m%d')
graph_dir = str(PROJECT_DIR / f"outputs/figures/between_measure_analysis/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

Creating /Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/between_measure_analysis/231013 directory


In [4]:
#instantiate reducers and functions for cleaning up skills

reducer = umap.UMAP(random_state=42)

#clean up skills
def merge_ents(ents):
    
    if not isinstance(ents, list):
        return None
    
    elif 'green' in ents[1]:
        return [ents[0]] + [ents[1][0]] + [ents[1][1]] + ents[1][2]
    else:
        return ents[0] + [ents[1]]

### 0. Load relevant data for analysis
Load extracted green measures at the skill-, occupation- and industry-level. Also load job titles to contextualise results.

In [5]:
#date stamps as defined in https://github.com/nestauk/dap_prinz_green_jobs/issues/75

production = "True"
config="base"

date_stamp = "20230914"

green_skills_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{date_stamp}/ojo_sample_skills_green_measures_production_{production}_{config}.json",
    )

date_stamp = "20231002"

green_occs_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{date_stamp}/ojo_sample_occupation_green_measures_production_{production}_{config}.json",
    )

date_stamp = "20231013"

green_inds_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{date_stamp}/ojo_sample_industry_green_measures_production_{production}_{config}.json",
    )

2023-10-13 14:17:36,941 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [6]:
# #step 0. Load extracted green measures from s3 

# download data

skill_measures_df = pd.DataFrame.from_dict(green_skills_outputs, orient='index').reset_index().rename(columns={'index':'id'})
occs_measures_df = pd.DataFrame.from_dict(green_occs_outputs, orient='index').reset_index().rename(columns={'index':'id'})
inds_measures_df = pd.DataFrame.from_dict(green_inds_outputs, orient='index').reset_index().rename(columns={'index':'id'})

# #step 1. load ojo related data - SOC names for additional context

soc_occ_dict = (load_job_title_soc()
                .set_index("SOC 2020")
                ['SOC 2020 UNIT GROUP DESCRIPTIONS']
                .to_dict())

2023-10-13 14:17:43,500 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


### 1. Merge and clean data so green measures are in a df
Clean up green measures and produce two dataframes:
1. numerical green measures;
2. extracted green skills

In [7]:
print(len(skill_measures_df))
print(skill_measures_df["id"].nunique())
print(len(occs_measures_df))
print(occs_measures_df["id"].nunique())
print(len(inds_measures_df))
print(inds_measures_df["id"].nunique())
all_green_measures_df = pd.merge(
     skill_measures_df, occs_measures_df, how="outer", on="id"
 )
all_green_measures_df = pd.merge(
    all_green_measures_df, inds_measures_df, how="outer", on="id"
)
#replace float with 0
all_green_measures_df = all_green_measures_df.fillna("")
all_green_measures_df["NUM_GREEN_ENTS"] = all_green_measures_df["GREEN_ENTS"].apply(len)
# Separate out the SOC columns
for soc_columns in ["SOC_2020_EXT", "SOC_2020", "SOC_2010", "name"]:
    all_green_measures_df[soc_columns] = all_green_measures_df["SOC"].apply(
        lambda x: x[soc_columns] if x else None
    )
all_green_measures_df.drop(columns=["SOC"], inplace=True)

all_green_measures_df.rename(
    columns={"name": "SOC_names", "id": "job_id"}, inplace=True
)
all_green_measures_df["SOC_2020_name"] = all_green_measures_df["SOC_2020"].map(
    soc_occ_dict
)

all_green_measures_df.rename(columns={})

print(len(all_green_measures_df))
print(all_green_measures_df["job_id"].nunique())
print(all_green_measures_df.columns)
all_green_measures_df.head(2)

39866
39866
39866
39866
39866
39866
39866
39866
Index(['job_id', 'NUM_ORIG_ENTS', 'NUM_SPLIT_ENTS', 'ENTS', 'GREEN_ENTS',
       'PROP_GREEN', 'BENEFITS', 'GREEN CATEGORY', 'GREEN/NOT GREEN',
       'GREEN TIMESHARE', 'GREEN TOPICS', 'SIC', 'SIC_name', 'SIC_confidence',
       'SIC_method', 'company_description', 'INDUSTRY TOTAL GHG EMISSIONS',
       'INDUSTRY GHG PER UNIT EMISSIONS', 'INDUSTRY PROP HOURS GREEN TASKS',
       'INDUSTRY PROP WORKERS GREEN TASKS',
       'INDUSTRY PROP WORKERS 20PERC GREEN TASKS',
       'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
       'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE', 'NUM_GREEN_ENTS',
       'SOC_2020_EXT', 'SOC_2020', 'SOC_2010', 'SOC_names', 'SOC_2020_name'],
      dtype='object')


Unnamed: 0,job_id,NUM_ORIG_ENTS,NUM_SPLIT_ENTS,ENTS,GREEN_ENTS,PROP_GREEN,BENEFITS,GREEN CATEGORY,GREEN/NOT GREEN,GREEN TIMESHARE,...,INDUSTRY PROP WORKERS GREEN TASKS,INDUSTRY PROP WORKERS 20PERC GREEN TASKS,INDUSTRY GHG EMISSIONS PER EMPLOYEE,INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE,NUM_GREEN_ENTS,SOC_2020_EXT,SOC_2020,SOC_2010,SOC_names,SOC_2020_name
0,41549950,16,16,"[[[good work-life balance], SKILL], [[managing...","[[MSc in EIA, Environmental Management, [green...",0.0625,"[Option to purchase up, pension scheme, Group ...",Green New & Emerging,Green,57.1,...,68.8,33.6,7.3,10196.5,1,2151/02,2151,2141,"[Conservation professionals, Biological scient...",Conservation professionals
1,41550510,9,10,"[[[Establish a productive relationship], SKILL...",[],0.0,,Non-Green,Non-green,0.0,...,39.7,17.4,4.2,14945.0,0,4143/99,4143,7220,[Database administrators and web content techn...,Customer service managers


In [8]:
# get occupations for which we have over 50 job adverts for
representative_occs = (
    all_green_measures_df.groupby("SOC_2020_name")
    .job_id.count()
    .sort_values(ascending=False)
    .where(lambda x: x >= 50)
    .dropna()
    .keys()
    .tolist()
)

print(len(all_green_measures_df))
all_green_measures_df = all_green_measures_df[
    all_green_measures_df["SOC_2020_name"].isin(representative_occs)
].reset_index(drop=True)
print(len(all_green_measures_df))
all_green_measures_df.replace('', np.nan, inplace=True)

39866
26830


In [9]:
all_green_measures_df["ENTS_GREEN_ENTS"] = all_green_measures_df.apply(
    lambda x: x["ENTS"] + x["GREEN_ENTS"], axis=1
)

green_skills_df = (
    all_green_measures_df[["job_id", "SOC_2020_name", "ENTS_GREEN_ENTS"]]
    .explode("ENTS_GREEN_ENTS")
    .reset_index(drop=True)
)
green_skills_df['ENTS_GREEN_ENTS'] = green_skills_df['ENTS_GREEN_ENTS'].apply(merge_ents)

green_skills_df["extracted_skill"] = green_skills_df["ENTS_GREEN_ENTS"].apply(
    lambda x: x[0] if isinstance(x, list) else None
)

green_skills_df['green_label'] = green_skills_df['ENTS_GREEN_ENTS'].apply(
    lambda x: x[1] if isinstance(x, list) and len(x) > 4 else 'not_green')

green_skills_df['green_label_prob'] = green_skills_df['ENTS_GREEN_ENTS'].apply(
    lambda x: x[2] if isinstance(x, list) and len(x) > 4 else None)


green_skills_df['skill_label'] = green_skills_df['ENTS_GREEN_ENTS'].apply(
    lambda x: x[3] if isinstance(x, list) and len(x) > 4 else None)


green_skills_df['skill_id'] = green_skills_df['ENTS_GREEN_ENTS'].apply(
    lambda x: x[4] if isinstance(x, list) and len(x) > 4 else None)

green_skills_df = green_skills_df[green_skills_df["skill_label"] != ""]


# # # Remove the duplicate green skills per job advert
print(len(green_skills_df))
green_skills_df.sort_values(by="extracted_skill", inplace=True)
green_skills_df.drop_duplicates(
    subset=["job_id", "skill_label"], keep="first", inplace=True
)
green_skills_df = green_skills_df[~green_skills_df['extracted_skill'].isna()]
print(len(green_skills_df))

green_skills_df.head(2)

510508
51169


Unnamed: 0,job_id,SOC_2020_name,ENTS_GREEN_ENTS,extracted_skill,green_label,green_label_prob,skill_label,skill_id
365929,46873256,Environment professionals,"["" attitude, SKILL]",""" attitude",not_green,,,
147791,44115424,"Book-keepers, payroll managers and wages clerks","[""ad hoc"" requests, SKILL]","""ad hoc"" requests",not_green,,,


## 2. **between analysis** of occupations, industries and skills 

In [10]:
# high occupation greenness (based on green share) and % of green skills
all_green_measures_df['occupation'] = all_green_measures_df['SOC_2020_name']

mean_green_timeshare_per_occ = (all_green_measures_df
                                .groupby('occupation')
                                ['GREEN TIMESHARE']
                                .agg(['mean'])
                                .reset_index()
                                .rename(columns={'mean': 'mean_occupation_green_timeshare'}))

green_skill_occ = (all_green_measures_df
.groupby('occupation')
['PROP_GREEN']
.agg(['mean'])
.reset_index()
.rename(columns={'mean': 'total_green_skills'})
.merge(mean_green_timeshare_per_occ, on='occupation', how='left'))

#is there a correlation between the # of green skills requested and the greenness of an occupation?
print('Correlation between the mean # of green skills requested per occupation and the "greenness" of an occupation: ', green_skill_occ['total_green_skills'].corr(green_skill_occ['mean_occupation_green_timeshare']))

Correlation between the mean # of green skills requested per occupation and the "greenness" of an occupation:  0.5222349660004748


In [11]:
# low occupation greenness and high total # of green skills

non_green_occ_green_skills = (all_green_measures_df[all_green_measures_df['GREEN CATEGORY'] == 'Non-Green']
.groupby('occupation')
['PROP_GREEN']
.agg(['sum'])
.reset_index()
.rename(columns={'sum': 'total_green_skills'})
.sort_values(by='total_green_skills', ascending=False)
.query('total_green_skills > 0'))

non_green_occ_green_skills

Unnamed: 0,occupation,total_green_skills
62,"Property, housing and estate managers",42.676571
68,Sales accounts and business development managers,27.928824
5,Business and financial project management prof...,27.239551
32,Health and safety managers and officers,19.885870
56,Other administrative occupations n.e.c.,16.868904
...,...,...
60,Programmers and software development professio...,0.260408
23,Design occupations n.e.c.,0.192857
44,Large goods vehicle drivers,0.192308
37,"IT business analysts, architects and systems d...",0.066667


In [12]:
non_green_occ_green_skills_list = non_green_occ_green_skills.occupation.to_list()
green_skills_df['occupation'] = green_skills_df['SOC_2020_name']

(green_skills_df
.query('occupation in @non_green_occ_green_skills_list')
.groupby(['occupation', 'extracted_skill'])
.job_id
.count()
.reset_index()
.rename(columns={'job_id': 'count'})
.query('count > 10'))

Unnamed: 0,occupation,extracted_skill,count
244,"Book-keepers, payroll managers and wages clerks",AAT,18
262,"Book-keepers, payroll managers and wages clerks",AAT qualified,15
271,"Book-keepers, payroll managers and wages clerks",ACA,77
278,"Book-keepers, payroll managers and wages clerks",ACCA,37
479,"Book-keepers, payroll managers and wages clerks",Excel,13
...,...,...,...
14758,Telecoms and related network installers and re...,Electrical fault finding and repairs,22
14770,Telecoms and related network installers and re...,Working on CHP and renewable energy systems,18
14788,Telecoms and related network installers and re...,provide low carbon solutions,25
14957,Water and sewerage plant operatives,"5 GCSEs or equivalent in Maths, English and a ...",16


In [17]:
#generate a dataframe with summed green measures per occupation

all_green_measures_df_ents = all_green_measures_df[~all_green_measures_df['GREEN_ENTS'].isna()]
all_green_measures_df_ents['GREEN_ENTS_COUNT'] = all_green_measures_df_ents['GREEN_ENTS'].apply(lambda x: len(x))

all_green_measures_df_occ = (all_green_measures_df_ents
                             .groupby('occupation')
                             .aggregate({'INDUSTRY TOTAL GHG EMISSIONS': ['mean'], 
                                         'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE': ['mean'],
                                         'GREEN TIMESHARE': ['mean'], 
                                         'GREEN_ENTS_COUNT': ['mean'], 
                                         'PROP_GREEN': ['mean']})
                             .reset_index())
all_green_measures_df_occ.columns = all_green_measures_df_occ.columns.levels[0]
all_green_measures_df_occ.columns  = ['occupation', 'industry_ghg_emissions_mean', 'industry_carbon_emissions_employee_mean', 'occupation_green_timeshare_mean', 'green_skills_count_mean', 'green_skill_percentage_mean']

#pick majority occupation greenness
occ_green_cat = all_green_measures_df.groupby('occupation')['GREEN CATEGORY'].agg(
    lambda x: pd.Series.mode(x)[0])
#pick majority green/non-green occupation
occ_green_nongreen = all_green_measures_df.groupby('occupation')['GREEN/NOT GREEN'].agg(
    lambda x: pd.Series.mode(x)[0])
all_green_measures_df_occ['occ_green_non_green'] = all_green_measures_df_occ.occupation.map(occ_green_nongreen)
all_green_measures_df_occ['occ_green_category'] = all_green_measures_df_occ.occupation.map(occ_green_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_green_measures_df_ents['GREEN_ENTS_COUNT'] = all_green_measures_df_ents['GREEN_ENTS'].apply(lambda x: len(x))


In [18]:
# industry greenness vs. mean # of green skills requested
ind_skills_greeness = alt.Chart(all_green_measures_df_occ).mark_circle(size=60).encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    y=alt.Y('industry_ghg_emissions_mean', title="Industry GHG Emissions", scale=alt.Scale(zero=False)),
    color=alt.Color('occ_green_non_green',
                   scale=alt.Scale(
            domain=['Non-green', 'Green'],
            range=['red', 'green']),
                   title='Green / Not Green'),
    tooltip=['occupation', 'green_skills_count_mean', 'green_skill_percentage_mean']
)


# occupational greenness vs. # of green skills requested
occ_skill_greeness = alt.Chart(all_green_measures_df_occ, title="").mark_circle(size=60).encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    y=alt.Y('occupation_green_timeshare_mean', title="Occupational Green Timeshare", scale=alt.Scale(zero=False)),
    color=alt.Color('occ_green_non_green',
                   scale=alt.Scale(
            domain=['Non-green', 'Green'],
            range=['red', 'green']),
                   title='Green / Not Green'),
    tooltip=['occupation', 'green_skills_count_mean', 'occupation_green_timeshare_mean']
)

# industry greenness vs. occupational greenness 

ind_occ_greeness = alt.Chart(all_green_measures_df_occ).mark_circle(size=60).encode(
    x=alt.X('occupation_green_timeshare_mean', title="Time Spent on Green Tasks", scale=alt.Scale(zero=False)),
    y=alt.Y('industry_ghg_emissions_mean', title= "Industry GHG Emissions", scale=alt.Scale(zero=False)),
    color=alt.Color('occ_green_non_green',
                   scale=alt.Scale(
            domain=['Non-green', 'Green'],
            range=['red', 'green']),
                   title="Green / Not Green"),
    tooltip=['occupation', 'occ_green_non_green', 'green_skills_count_mean', 'green_skill_percentage_mean']
)

occ_greeness = (ind_skills_greeness | occ_skill_greeness | ind_occ_greeness).properties(
    title={
      "text": ["Occupational Greenness"], 
      "subtitle": ["The graph on the left compares average GHG emissions vs. % of green skills per occupation.", 
                   "The graph on the right compares average time spent on green tasks vs. % of green skills per occupation.",
                   "The last graph compares average GHG emissions vs. average time spent on green tasks per occupation."],
      "color": "black",
      "subtitleColor": "black"
    }
)

## Top Occupations

investigate "top" occupations by:

- low industry GHG emissions and highest green skill percentage ("green" occupations)
- high industry GHG emissions and highest green skill percentage ("greening" occupations)
- high industry GHG emissions per employee and lowest green skill percentage ("brown" occupations)

In [19]:
#low industry ghg emissions, high green skill percentage
low_ind_high_gskill = (all_green_measures_df_occ
                       .sort_values(['industry_ghg_emissions_mean', 'green_skill_percentage_mean'], ascending=[True, False])
                       .head(10)
                       [['occupation', 'industry_ghg_emissions_mean', 'green_skill_percentage_mean']])

green_occs = alt.Chart(low_ind_high_gskill).mark_bar().encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    #increase label limit
    y=alt.Y('occupation', title="", sort='-x', axis=alt.Axis(labelLimit=5000)),
    #set the color to green
    color=alt.value('green'))

#high industry ghg emissions, high green skill percentage
high_ind_high_gskill = (all_green_measures_df_occ
                       .sort_values(['industry_ghg_emissions_mean', 'green_skill_percentage_mean'], ascending=[False, False])
                       .head(10)
                       [['occupation', 'industry_ghg_emissions_mean', 'green_skill_percentage_mean']])

greening_occs = alt.Chart(high_ind_high_gskill).mark_bar().encode(
    x=alt.X('green_skill_percentage_mean', title="Green Skill %", scale=alt.Scale(zero=False)),
    #increase label limit
    y=alt.Y('occupation', title="", sort='-x', axis=alt.Axis(labelLimit=5000)),
    #set the color to green
    color=alt.value('#90EE90'))

green_greening_occs = (green_occs | greening_occs).properties(
    title={
      "text": ['"Green" and "Greening" Occupations'], 
      "subtitle": ["The graph on the left shows the top ten occupations by lowest GHG emissions and highest % of green skills.", 
                   "The graph on the right shows the top ten occupations by highest GHG emissions and highest % of green skills."],
      "color": "black",
      "subtitleColor": "black"
    }
)

(occ_greeness & green_greening_occs).save(f'{graph_dir}/occ_greenness.html')

In [20]:
(occ_greeness & green_greening_occs)

## New green skills

Investigate extracted skills defined as "non-green" by the green skills classifier.

In [21]:
high_green_skills_low_occ_list = all_green_measures_df_occ.query('occ_green_non_green == "Green"').sort_values('green_skill_percentage_mean', ascending=True).head(10).occupation.to_list()

green_skills_df_extractedskill = green_skills_df.dropna(subset=['extracted_skill'])
new_green_skills = list(set(green_skills_df_extractedskill[(green_skills_df_extractedskill.occupation.isin(high_green_skills_low_occ_list)) & (green_skills_df['green_label'] == 'not_green')].extracted_skill))

bert_model = BertVectorizer().fit()
new_green_skills_embeds = bert_model.transform(new_green_skills)
#reduce dimensionality
reduced_embeds = reducer.fit_transform(new_green_skills_embeds)

kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 'cluster_number': labels, 'x': reduced_embeds[:, 0], 'y': reduced_embeds[:, 1]})

new_green_skills_clust = alt.Chart(new_skill_cluster_df, title='new "green" skill groups based on high green occupation greenness and low % of green skills requested').mark_circle(size=60).encode(
    x='x',
    y='y',
    #make these categorical
    color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
    tooltip=['skill', 'cluster_number']
).interactive()

2023-10-13 14:20:03,844 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2023-10-13 14:20:04,217 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [23]:
green_occs = list(all_green_measures_df_occ[all_green_measures_df_occ['occ_green_non_green'] == 'Green'].occupation)

random.seed(51)
green_occ = random.choice(green_occs)
print('finding new skills for occupation:', green_occ)

new_green_skills = list(set(green_skills_df_extractedskill[(green_skills_df.occupation == green_occ) & (green_skills_df['green_label'] == 'not_green')].extracted_skill))
new_green_skills_embeds = bert_model.transform(new_green_skills)

#reduce dimensionality
reduced_embeds = reducer.fit_transform(new_green_skills_embeds)

kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 
                                     'cluster_number': labels, 
                                     'x': reduced_embeds[:, 0], 
                                     'y': reduced_embeds[:, 1]})

occ1_new_skills = alt.Chart(new_skill_cluster_df, title=green_occ).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
    tooltip=['skill', 'cluster_number']
).interactive()

finding new skills for occupation: Data analysts


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [24]:
#reduce embeds 
random.seed(57)
green_occ = random.choice(green_occs)
print('finding new skills for occupation:', green_occ)

new_green_skills = list(set(green_skills_df_extractedskill[(green_skills_df.occupation == green_occ) & (green_skills_df['green_label'] == 'not_green')].extracted_skill))
new_green_skills_embeds = bert_model.transform(new_green_skills)
reduced_embeds = reducer.fit_transform(new_green_skills_embeds)
kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 
                                     'cluster_number': labels, 
                                     'x': reduced_embeds[:, 0], 
                                     'y': reduced_embeds[:, 1]})

occ2_new_skills = alt.Chart(new_skill_cluster_df, title=green_occ).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
    tooltip=['skill', 'cluster_number']
).interactive()

finding new skills for occupation: Architects


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [25]:
#reduce embeds 
random.seed(14)
green_occ = random.choice(green_occs)
print('finding new skills for occupation:', green_occ)

new_green_skills = list(set(green_skills_df_extractedskill[(green_skills_df.occupation == green_occ) & (green_skills_df['green_label'] == 'not_green')].extracted_skill))
new_green_skills_embeds = bert_model.transform(new_green_skills)
reduced_embeds = reducer.fit_transform(new_green_skills_embeds)

kmeans = KMeans(n_clusters=10, random_state=0).fit(reduced_embeds)
labels = kmeans.labels_

new_skill_cluster_df = pd.DataFrame({'skill': new_green_skills, 
                                     'cluster_number': labels, 
                                     'x': reduced_embeds[:, 0], 
                                     'y': reduced_embeds[:, 1]})

occ3_new_skills = alt.Chart(new_skill_cluster_df, title=green_occ).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('cluster_number:N', title="Cluster Number", legend=None),
    tooltip=['skill', 'cluster_number']
).interactive()

finding new skills for occupation: Buyers and procurement officers


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [26]:
#save skill graphs

new_skills_charts = occ1_new_skills | occ2_new_skills | occ3_new_skills
#add title to the graphs
new_skills_charts.title = 'New "Green" Skills' 

skill_graphs = new_green_skills_clust & new_skills_charts
skill_graphs.title = "New Skill Graphs"

skill_graphs.save(f'{graph_dir}/new_green_skills.html')

skill_graphs