## Create new skills plots 

This notebook creates graphs to explore new 'green' skills

In [1]:
#load imports 
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg
from dap_prinz_green_jobs import PROJECT_DIR, BUCKET_NAME, analysis_config
from dap_prinz_green_jobs.getters.data_getters import load_s3_data
import dap_prinz_green_jobs.utils.plotting as pt

import pandas as pd
import numpy as np


import os
from datetime import datetime
import altair as alt
import random

from sklearn.feature_extraction.text import TfidfTransformer

Load variables and functions

In [2]:
# save graphs
today = datetime.today().strftime("%y%m%d")
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

Creating /Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/231220 directory


In [27]:
#alt disable max rows

alt.data_transformers.disable_max_rows()
colordict = dict(zip(['low-mid', 'low', 'mid-high', 'high'], ['#C4A484', '#964B00', '#90EE90', '#023020']))

In [10]:
def get_skill_level(skill:str) -> int:
    """Get skill level from ESCO hierarchy

    Args:
        skill (str): skill

    Returns:
        int: skill level
    """
    if not isinstance(skill, str):
        return None
    
    if 'S' in skill:
        if len(skill) == 1:
            return 0 #S
        elif len(skill) == 2:
            return 1 #S1
        elif skill.count('.') == 1:
            return 2 #S1.1
        elif skill.count('.') == 2:
            return 3 #S1.1.1
    else:
        return None
    # elif 'K' in skill:
    #     if len(skill) == 1:
    #         return 0 #k
    #     elif len(skill) == 2:
    #         return 1

## 0. Load Data

load and clean up data

In [5]:
#download aggregated occupation data to get non-green/mid-green/high-green occupations 

occ_date = analysis_config['analysis_files']['agg_soc_date_stamp']
occ_agg = pd.read_csv(f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{occ_date}.csv')

occ_agg = occ_agg.query('num_job_ads > 100').reset_index(drop=True)

#download skills data

green_skills_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{analysis_config['skills_date_stamp']}/ojo_large_sample_skills_green_measures_production_{analysis_config['production']}.csv",
    )
green_skills_outputs["GREEN_ENTS"] = green_skills_outputs["GREEN_ENTS"].apply(
    pg.safe_literal_eval
)
green_skills_outputs["ENTS"] = green_skills_outputs["ENTS"].apply(pg.safe_literal_eval)

skills_df = pg.create_skill_df(green_skills_outputs)

#download occupations data for job id to soc code mapper

green_occs_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{analysis_config['occ_date_stamp']}/ojo_large_sample_occupation_green_measures_production_{analysis_config['production'].lower()}.csv",
    )
green_occs_outputs = pg.process_soc_columns(green_occs_outputs)

2023-12-20 10:55:49,265 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2023-12-20 10:55:50,484 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


100%|██████████| 13436987/13436987 [00:10<00:00, 1304159.96it/s]


In [6]:
#download ESCO skills tables
trans_skills = load_s3_data("open-jobs-lake", 'escoe_extension/inputs/data/esco/esco_transversal_mapper.json')
formatted_esco_skills = pd.read_csv('s3://open-jobs-lake/escoe_extension/outputs/data/skill_ner_mapping/esco_data_formatted.csv')
esco_hier = load_s3_data("open-jobs-lake", 'escoe_extension/outputs/data/skill_ner_mapping/esco_hier_mapper.json')

soc_name_dict = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{analysis_config['occ_date_stamp']}/soc_name_dict.json",
    )

green_occs_outputs['soc_name_6'] = green_occs_outputs['SOC_2020_EXT'].map(soc_name_dict['soc_2020_6'])
green_occs_outputs['soc_name_4'] = green_occs_outputs['SOC_2020'].map(soc_name_dict['soc_2020_4'])

In [7]:
esco_hier["K"] = "Knowledge"
esco_hier["S"] = "Skills"
esco_hier["T"] = "Transversal skills and competencies"
esco_hier["A"] = "Attitudes"
esco_hier["L"] = "Language skills and Knowledge"

In [11]:
formatted_esco_skills_skills = (formatted_esco_skills
                                .query('id.str.len() > 30')
                                .assign(hierarchy_levels = lambda x: x['hierarchy_levels'].apply(pg.safe_literal_eval))
                                .explode('hierarchy_levels'))
formatted_esco_skills_skills = formatted_esco_skills_skills[formatted_esco_skills_skills.hierarchy_levels.apply(lambda x: 'S' in x)]

formatted_esco_skills_hier = formatted_esco_skills_skills.explode('hierarchy_levels').reset_index(drop=True)
formatted_esco_skills_hier['hierarchy_name'] = formatted_esco_skills_hier['hierarchy_levels'].map(esco_hier)
formatted_esco_skills_hier['skill_level'] = formatted_esco_skills_hier['hierarchy_levels'].apply(get_skill_level)

## 2. method to identify 'new' green skills

- create tfidf matrix of skills and occupation greenness
- get rid of ESCO transversal skills

In [12]:
jobid2socdict = green_occs_outputs.set_index('job_id')['soc_name_4'].to_dict()
skillid2skilldict = skills_df.set_index('extracted_full_skill_id')['extracted_full_skill'].to_dict()

skills_df['occupation'] = skills_df['job_id'].map(jobid2socdict)

non_green_skills_df = skills_df[skills_df['GREEN_ENTS'].isna()]
#add esco transversal skills flag
#non_green_skills_df['transversal_skill_flag'] = non_green_skills_df['extracted_full_skill_id'].apply(lambda x: 'Y' if x in list(trans_skills.keys()) else 'N')
#only do nontransversal skills + only skill level skills
#non_green_skills_df = non_green_skills_df[(non_green_skills_df.extracted_full_skill_id.str.len() > 20) & (non_green_skills_df.transversal_skill_flag == "N")].reset_index(drop=True)

In [13]:
#make greeness score dict for soc 2020 4 digit
occ_agg['SOC_2020'] = occ_agg['SOC_2020'].astype(str)
occ_agg['soc_name_4'] = occ_agg['SOC_2020'].map(soc_name_dict['soc_2020_4'])
soc2score = occ_agg.set_index('soc_name_4')['greenness_score'].to_dict()

Create TFIDF matrix of skills and occupation greeness 

In [14]:
non_green_skills_df['occ_label'] = non_green_skills_df['occupation'].map(soc2score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_green_skills_df['occ_label'] = non_green_skills_df['occupation'].map(soc2score)


In [15]:
#create count matrix with occ_label as rows and skill_label_id as columns
count_matrix = pd.pivot_table(non_green_skills_df, 
                              index='occ_label', 
                              columns='extracted_full_skill_id', 
                              values='extracted_full_skill', 
                              aggfunc='count', 
                              fill_value=0)

# Initialize the TfidfTransformer
tfidf_transformer = TfidfTransformer()

# Fit and transform the count matrix to obtain the TF-IDF matrix
tfidf_matrix = tfidf_transformer.fit_transform(count_matrix)

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=count_matrix.columns)
tfidf_df['occ_label'] = count_matrix.index

## 3. Analyse skills with the highest tfidf scores
- for low/mid/high occupations based on greenness
- from liz: live thinking - remove the skills which are in the top 100 tfidf lists for green and not-green occupations

In [16]:
#first, filter the top 100 skills based on tfidf scores for each row

tfidf_df

extracted_full_skill_id,0007bdc2-dd15-4824-b7d6-416522c46f35,00090cc1-1f27-439e-a4e0-19a87a501bfc,0023e7a5-43da-4b68-bee3-726ef21f986d,00298d97-3dc3-4086-a902-bce0a2fba831,004017c9-0337-4f5a-8077-798de9ef12e3,00735755-adc6-4ea0-b034-b8caff339c9f,0085e6bd-6829-4e8e-b302-842a7fe57ed9,009673d9-e2fd-46ef-a64b-6027ac7fd613,00994812-ac9e-4856-954b-f71bcc6066bb,009db49b-fcf5-4409-b4a5-232d059f3597,...,ff9eeebf-d4c8-487f-8cd4-aa22cc471588,ff9f1770-2b94-4afa-bf82-1671d7f54b99,ffb30d85-b45e-45f8-b03b-e3ac16078daf,ffc1e455-ced2-4e67-bdb3-1c50f9683859,ffc69c38-ce70-4d08-85ac-54036ca80a87,ffc836d3-1897-4ec1-877e-a4f08f1e05c6,ffddfc7c-a9dd-449f-9e96-882dc447c8b6,fff0e2cd-d0bd-4b02-9daf-158b79d9688a,fff5bc45-b506-4466-8977-4869079c1cb2,occ_label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,high
1,0.0,0.002057,0.000279,0.000157,6.5e-05,0.0,0.0,0.0,0.0,1.7e-05,...,0.0,0.0,0.000366,0.000139,0.000418,0.0,0.002772,0.000593,0.000349,low
2,0.0,0.002688,0.000224,0.000106,0.0,3.5e-05,0.000104,9e-06,9e-06,0.000146,...,0.0,0.0,0.000224,0.000174,0.000801,9e-06,0.000722,0.000347,0.000319,low-mid
3,2.7e-05,0.002284,0.000481,0.000206,4.2e-05,0.000912,0.000721,0.0,0.0,0.000292,...,0.000108,5.4e-05,0.000343,0.000258,0.001357,0.0,0.000189,0.000223,0.000824,mid-high


In [85]:
label = 'low'
brown_tfidf = (tfidf_df
 .query(f'occ_label == "{label}"')
 .drop(columns='occ_label')
 .T
 .reset_index()
 .rename(columns={'extracted_full_skill_id': 'skill_id', 1: 'tfidf_score'})
 .sort_values('tfidf_score', ascending=False))
brown_tfidf['skill'] = brown_tfidf['skill_id'].map(skillid2skilldict)

low_tfidf_graph = alt.Chart(brown_tfidf[10:20]).mark_bar(opacity=0.8, color=colordict.get(label)).encode(
    x=alt.X('tfidf_score', title='TF-IDF Score'),
    y=alt.Y('skill', sort='-x', title='Skill'))

low_tfidf_graph

# low_tfidf_graph_config = pt.configure_plots(low_tfidf_graph,
#                                         chart_title=f'top 10 non-green skills for {label} occupations')
# low_tfidf_graph_config.save(f"{graph_dir}/top_non_green_skills_{label}.html")

# low_tfidf_graph_config


In [86]:
label = 'low-mid'
lowmid_tfidf = (tfidf_df
 .query(f'occ_label == "{label}"')
 .drop(columns='occ_label')
 .T
 .reset_index()
 .rename(columns={'extracted_full_skill_id': 'skill_id', 2: 'tfidf_score'})
 .sort_values('tfidf_score', ascending=False))

lowmid_tfidf['skill'] = lowmid_tfidf['skill_id'].map(skillid2skilldict)


lowmid_tfidf_graph = alt.Chart(lowmid_tfidf[10:20]).mark_bar(opacity=0.8, color=colordict.get(label)).encode(
    x=alt.X('tfidf_score', title='TF-IDF Score'),
    y=alt.Y('skill', sort='-x', title=''))
                        
lowmid_tfidf_graph

# lowmid_tfidf_graph_config = pt.configure_plots(lowmid_tfidf_graph,
#                                         chart_title=f'top 10 non-green skills for {label} occupations')

# lowmid_tfidf_graph_config.save(f"{graph_dir}/top_non_green_skills_{label}.html")

# lowmid_tfidf_graph_config

In [167]:
label = 'mid-high'
midhigh_tfidf = (tfidf_df
 .query(f'occ_label == "{label}"')
 .drop(columns='occ_label')
 .T
 .reset_index()
 .rename(columns={'extracted_full_skill_id': 'skill_id', 3: 'tfidf_score'})
 .sort_values('tfidf_score', ascending=False))

midhigh_tfidf['skill'] = midhigh_tfidf['skill_id'].map(skillid2skilldict)
midhigh_tfidf['greenness'] = label

midhigh_tfidf_graph = alt.Chart(midhigh_tfidf[10:20]).mark_bar(opacity=0.8, color=colordict.get(label)).encode(
    x=alt.X('tfidf_score', title='TF-IDF Score'),
    y=alt.Y('skill', sort='-x', title='Skill'))
                        
midhigh_tfidf_graph

# midhigh_tfidf_graph_config = pt.configure_plots(midhigh_tfidf_graph,
#                                         chart_title=f'top 10 non-green skills for {label} occupations')

# midhigh_tfidf_graph_config.save(f"{graph_dir}/top_non_green_skills_{label}.html")

# midhigh_tfidf_graph_config

In [168]:
label = 'high'
high_tfidf = (tfidf_df
 .query(f'occ_label == "{label}"')
 .drop(columns='occ_label')
 .T
 .reset_index()
 .rename(columns={'extracted_full_skill_id': 'skill_id', 0: 'tfidf_score'})
 .sort_values('tfidf_score', ascending=False))
high_tfidf['skill'] = high_tfidf['skill_id'].map(skillid2skilldict)


high_tfidf_graph = alt.Chart(high_tfidf[10:20]).mark_bar(opacity=0.8, color=colordict.get(label)).encode(
    x=alt.X('tfidf_score', title='TF-IDF Score'),
    y=alt.Y('skill', sort='-x', title=''))

high_tfidf_graph

# high_tfidf_graph_config = pt.configure_plots(high_tfidf_graph,
#                                         chart_title=f'top 10 non-green skills for {label} occupations')

# high_tfidf_graph_config.save(f"{graph_dir}/top_non_green_skills_{label}.html")

# high_tfidf_graph_config

In [175]:
all_skills_chart = (low_tfidf_graph | lowmid_tfidf_graph) & (midhigh_tfidf_graph | high_tfidf_graph)
all_skills_chart = all_skills_chart.properties(title=f'Top 10 non-green skills for each occupation group')


non_skills_colors = pd.DataFrame(
        {
            "x": [0, 0, 0, 0],
            "y": [0, 0, 0, 0],
            "greenness": list(colordict.keys()),
            "color": list(colordict.values())})

legend_chart = (
    alt.Chart(non_skills_colors)
    .mark_circle(size=0, 
                 opacity=0.8)
    .encode(
        x=alt.X("x", title="", axis=None),
        y=alt.Y("y", title="", axis=None),
        color=alt.Color(
            "greenness:N",
            scale=alt.Scale(
                domain=list(
                    dict(
                        zip(
                            non_skills_colors["greenness"],
                            non_skills_colors["color"],
                        )
                    ).keys()
                ),
                range=list(
                    dict(
                        zip(
                            non_skills_colors["greenness"],
                            non_skills_colors["color"],
                        )
                    ).values()
                ),
            ),
            legend=alt.Legend(title=""),
        ),
    )
).properties(width=-1, height=100)

all_skills_chart = (all_skills_chart | legend_chart).configure_view(stroke=None)

all_skills_chart.save(f"{graph_dir}/top_non_green_skills.html")