## Create new skills plots 

This notebook creates graphs to explore new 'green' skills.

In [118]:
#load imports 
from dap_prinz_green_jobs.getters.ojo_getters import (
    get_large_ojo_job_title_sample
    
)
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg
from dap_prinz_green_jobs import PROJECT_DIR
from dap_prinz_green_jobs.getters.data_getters import load_s3_data
import dap_prinz_green_jobs.utils.plotting as pt

import pandas as pd
import numpy as np


import os
from datetime import datetime
import altair as alt
import random

from sklearn.feature_extraction.text import TfidfTransformer

Load variables and functions

In [203]:
# Save graphs to a dated (yymmdd) folder under the project's outputs directory
today = datetime.today().strftime("%y%m%d")
graph_path = PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/"
graph_dir = str(graph_path)

# Only create the folder when it is missing; report which case applied
if os.path.exists(graph_dir):
    print(f"{graph_dir} directory already exists")
else:
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)

/Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/231215 directory already exists


In [204]:
# Disable altair's maximum-rows check for chart data
alt.data_transformers.disable_max_rows()

# Pair each greenness label with a colour, in the order given by pt.GREEN_MEASURES_COLORS
measure_labels = ['green', 'neutral',  'brown']
colordict = {measure: colour for measure, colour in zip(measure_labels, pt.GREEN_MEASURES_COLORS)}

In [217]:
def get_skill_level(skill:str) -> int:
    """Return the depth of a code within the ESCO skills ("S") hierarchy.

    Only codes containing "S" are handled; other branches (e.g. "K") are
    not levelled and yield None.

    Args:
        skill (str): ESCO hierarchy code, e.g. "S", "S1", "S1.1" or "S1.1.1"

    Returns:
        int: 0 for "S", 1 for a two-character code, 2 or 3 for codes with
            one or two dots respectively; None for non-string input, codes
            without "S", or any other shape of code
    """
    # Guard: non-strings (e.g. NaN) and codes outside the "S" branch have no level
    if not isinstance(skill, str) or 'S' not in skill:
        return None

    code_length = len(skill)
    if code_length == 1:
        return 0  # S
    if code_length == 2:
        return 1  # S1

    # Longer codes are levelled by their number of dot separators:
    # one dot -> S1.1 (level 2), two dots -> S1.1.1 (level 3), otherwise None
    level_by_dot_count = {1: 2, 2: 3}
    return level_by_dot_count.get(skill.count('.'))

## 0. Load Data

Load and clean up data

In [4]:
# Load the skill, occupation and industry green measures plus the SOC name lookup
(
    skill_measures_df,
    occs_measures_df,
    inds_measures_df,
    soc_name_dict,
) = pg.load_ojo_green_measures()

2023-12-15 11:16:10,665 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2023-12-15 11:16:11,097 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [5]:
# Merge the skill, occupation and industry measures into a single dataframe
all_green_measures_df = pg.merge_green_measures(
    skill_measures_df=skill_measures_df,
    occs_measures_df=occs_measures_df,
    inds_measures_df=inds_measures_df,
    soc_name_dict=soc_name_dict,
)

There are 1000000 rows in the merged data
There are 1000000 unique job ids


In [11]:
# Quick look at the merged measures: dimensions, column names, then the first two rows
for summary in (all_green_measures_df.shape, all_green_measures_df.columns):
    print(summary)
all_green_measures_df.head(2)

(1000000, 30)
Index(['job_id', 'NUM_ORIG_ENTS', 'NUM_SPLIT_ENTS', 'ENTS', 'GREEN_ENTS',
       'PROP_GREEN', 'BENEFITS', 'GREEN CATEGORY', 'GREEN/NOT GREEN',
       'GREEN TIMESHARE', 'GREEN TOPICS', 'SOC_2020_EXT', 'SOC_2020',
       'SOC_2010', 'SOC_names', 'SIC', 'SIC_name', 'SIC_confidence',
       'SIC_method', 'company_description', 'INDUSTRY TOTAL GHG EMISSIONS',
       'INDUSTRY GHG PER UNIT EMISSIONS', 'INDUSTRY PROP HOURS GREEN TASKS',
       'INDUSTRY PROP WORKERS GREEN TASKS',
       'INDUSTRY PROP WORKERS 20PERC GREEN TASKS',
       'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
       'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE', 'NUM_GREEN_ENTS',
       'SOC_2020_name', 'SOC_2020_EXT_name'],
      dtype='object')


Unnamed: 0,job_id,NUM_ORIG_ENTS,NUM_SPLIT_ENTS,ENTS,GREEN_ENTS,PROP_GREEN,BENEFITS,GREEN CATEGORY,GREEN/NOT GREEN,GREEN TIMESHARE,...,INDUSTRY TOTAL GHG EMISSIONS,INDUSTRY GHG PER UNIT EMISSIONS,INDUSTRY PROP HOURS GREEN TASKS,INDUSTRY PROP WORKERS GREEN TASKS,INDUSTRY PROP WORKERS 20PERC GREEN TASKS,INDUSTRY GHG EMISSIONS PER EMPLOYEE,INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE,NUM_GREEN_ENTS,SOC_2020_name,SOC_2020_EXT_name
0,41547517,22,23,"[[[passionate about], SKILL], [[maintaining st...",[],0.0,,,,,...,59.5,0.0,8.3,58.6,18.9,0.2,242.7,0,,
1,41547521,3,3,"[[[Porta Cabins on a construction site], SKILL...",[],0.0,,Non-Green,Non-green,0.0,...,,,,,,,,0,Vehicle valeters and cleaners,Vehicle valeters and cleaners


In [6]:
# Create the skills dataframe from the loaded skill measures.
# NOTE(review): presumably one row per extracted skill entity, since later cells
# read job_id, extracted_full_skill_id, extracted_full_skill and GREEN_ENTS from it — confirm.

skills_df = pg.create_skill_df(skill_measures_df)

100%|██████████| 13436987/13436987 [00:10<00:00, 1236868.84it/s]


In [68]:
# Download the ESCO skills tables from S3
ESCO_BUCKET = "open-jobs-lake"

# Transversal skills mapper (its keys are used later to flag transversal skills)
trans_skills = load_s3_data(
    ESCO_BUCKET,
    'escoe_extension/inputs/data/esco/esco_transversal_mapper.json',
)

# Formatted ESCO skills table
formatted_esco_skills = pd.read_csv(
    's3://open-jobs-lake/escoe_extension/outputs/data/skill_ner_mapping/esco_data_formatted.csv'
)

# Mapper from ESCO hierarchy code to hierarchy name
esco_hier = load_s3_data(
    ESCO_BUCKET,
    'escoe_extension/outputs/data/skill_ner_mapping/esco_hier_mapper.json',
)

In [181]:
# Add names for the single-letter, top-level ESCO hierarchy codes
esco_hier.update({
    "K": "Knowledge",
    "S": "Skills",
    "T": "Transversal skills and competencies",
    "A": "Attitudes",
    "L": "Language skills and Knowledge",
})


In [218]:
# Build a long table of ESCO skills with one row per (skill, hierarchy code).
# Steps in the chain below:
#   1. keep rows whose `id` is longer than 30 characters
#      (NOTE(review): presumably UUID-style concept ids rather than hierarchy codes — confirm)
#   2. parse the `hierarchy_levels` column with pg.safe_literal_eval
#   3. explode `hierarchy_levels` so each element gets its own row
formatted_esco_skills_skills = (formatted_esco_skills
                                .query('id.str.len() > 30')
                                .assign(hierarchy_levels = lambda x: x['hierarchy_levels'].apply(pg.safe_literal_eval))
                                .explode('hierarchy_levels'))
# Keep only rows whose exploded hierarchy entry contains 'S' (the skills branch).
# NOTE(review): this assumes every entry supports `in` (str or list); a NaN entry would raise — confirm.
formatted_esco_skills_skills = formatted_esco_skills_skills[formatted_esco_skills_skills.hierarchy_levels.apply(lambda x: 'S' in x)]

# Explode a second time (a no-op unless the entries are still list-like), then
# attach the hierarchy name from esco_hier and the numeric level from get_skill_level.
formatted_esco_skills_hier = formatted_esco_skills_skills.explode('hierarchy_levels').reset_index(drop=True)
formatted_esco_skills_hier['hierarchy_name'] = formatted_esco_skills_hier['hierarchy_levels'].map(esco_hier)
formatted_esco_skills_hier['skill_level'] = formatted_esco_skills_hier['hierarchy_levels'].apply(get_skill_level)

## 2. Method to identify 'new' green skills

- Create a TF-IDF matrix of skills against occupation greenness
- Remove ESCO transversal skills

In [74]:
# Lookups: job advert id -> SOC 2020 occupation name, and skill id -> skill name
jobid2socdict = all_green_measures_df.set_index('job_id')['SOC_2020_name'].to_dict()
skillid2skilldict = skills_df.set_index('extracted_full_skill_id')['extracted_full_skill'].to_dict()

# Tag every extracted skill with the occupation of the advert it came from
skills_df['occupation'] = skills_df['job_id'].map(jobid2socdict)

# Rows with no GREEN_ENTS value are treated as non-green skills.
# .copy() gives an independent frame, so adding the flag column below no longer
# writes to a slice of skills_df (this raised SettingWithCopyWarning before).
non_green_skills_df = skills_df[skills_df['GREEN_ENTS'].isna()].copy()

# Add ESCO transversal skills flag ('Y'/'N').
# A set is built once for the membership test; previously the key list was
# rebuilt and scanned linearly for every row.
transversal_skill_ids = set(trans_skills.keys())
non_green_skills_df['transversal_skill_flag'] = (
    non_green_skills_df['extracted_full_skill_id']
    .isin(transversal_skill_ids)
    .map({True: 'Y', False: 'N'})
)

# Only keep non-transversal skills whose id is longer than 20 characters.
# NOTE(review): the length test presumably separates skill-level ids from shorter
# hierarchy codes — confirm (the ESCO table above uses > 30 for a similar filter).
is_skill_level_id = non_green_skills_df['extracted_full_skill_id'].str.len() > 20
is_not_transversal = non_green_skills_df['transversal_skill_flag'] == "N"
non_green_skills_df = non_green_skills_df[is_skill_level_id & is_not_transversal].reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_green_skills_df['transversal_skill_flag'] = non_green_skills_df['extracted_full_skill_id'].apply(lambda x: 'Y' if x in list(trans_skills.keys()) else 'N')


In [103]:
# Aggregate the green measures by occupation, then bin occupations into
# brown / neutral / green groups

# Only adverts that were matched to a SOC 2020 occupation name
all_green_measures_df_soc = all_green_measures_df.dropna(subset=['SOC_2020_name'])

# How each measure is summarised per occupation (order sets the output column order)
occ_aggregations = {
    'job_id': 'count',
    'GREEN TOPICS': 'mean',
    'GREEN CATEGORY': pg.get_mode,
    'GREEN/NOT GREEN': pg.get_mode,
    'PROP_GREEN': 'mean',
    'GREEN TIMESHARE': 'mean',
    'INDUSTRY TOTAL GHG EMISSIONS': 'mean',
    'INDUSTRY GHG PER UNIT EMISSIONS': 'mean',
    'INDUSTRY PROP HOURS GREEN TASKS': 'mean',
    'INDUSTRY GHG EMISSIONS PER EMPLOYEE': 'mean',
    'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE': 'mean',
}

measures_by_occ_df = (
    all_green_measures_df_soc
    .groupby('SOC_2020_name')
    .agg(occ_aggregations)
    .reset_index()
    .rename(columns={'job_id': 'num_jobs', 'index': 'SOC_2020_name'})
    # keep occupations with more than 50 adverts
    .loc[lambda occ_df: occ_df['num_jobs'] > 50]
    .reset_index(drop=True)
)

# Measure used to decide how green an occupation is
cat = 'GREEN TOPICS'
cat_col = f'{cat.lower()}_cat'

mean_value = measures_by_occ_df[cat].mean()
print(f'the mean value is: {mean_value}')

std_dev = measures_by_occ_df[cat].std()
print(f'the standard deviation is: {std_dev}')

# Bin edges: up to the mean, mean to mean + 2 standard deviations, and above that
bins = [-np.inf, mean_value, mean_value + 2 * std_dev, np.inf]
print(bins)

# One label per bin, from lowest to highest
labels = ['brown', 'neutral', 'green']

# Categorical greenness column for each occupation
measures_by_occ_df[cat_col] = pd.cut(
    measures_by_occ_df[cat],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Carry the occupation-level label over to every non-green skill row
occ2label = dict(zip(measures_by_occ_df['SOC_2020_name'], measures_by_occ_df[cat_col]))

non_green_skills_df['occ_label'] = non_green_skills_df['occupation'].map(occ2label)

the mean value is: 2.943121801908339
the standard deviation is: 6.896936243803245
[-inf, 2.943121801908339, 16.73699428951483, inf]


In [104]:
# Number of non-green skill rows under each occupation greenness label
non_green_skills_df['occ_label'].value_counts()

brown      2274099
neutral     597925
green        84771
Name: occ_label, dtype: int64

Create TF-IDF matrix of skills and occupation greenness

In [105]:
# Count matrix: one row per occ_label, one column per extracted_full_skill_id,
# each cell the number of skill mentions (0 where a skill never appears)
count_matrix = non_green_skills_df.pivot_table(
    index='occ_label',
    columns='extracted_full_skill_id',
    values='extracted_full_skill',
    aggfunc='count',
    fill_value=0,
)

# Turn the raw counts into TF-IDF weights
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(count_matrix)

# Back to a dataframe: skill ids as columns, plus the label of each row
tfidf_df = (
    pd.DataFrame(tfidf_matrix.toarray(), columns=count_matrix.columns)
    .assign(occ_label=count_matrix.index)
)

In [110]:
# Display the skill counts per occupation greenness label (one row per label)
count_matrix

extracted_full_skill_id,0007bdc2-dd15-4824-b7d6-416522c46f35,00090cc1-1f27-439e-a4e0-19a87a501bfc,0023e7a5-43da-4b68-bee3-726ef21f986d,00298d97-3dc3-4086-a902-bce0a2fba831,004017c9-0337-4f5a-8077-798de9ef12e3,00735755-adc6-4ea0-b034-b8caff339c9f,0085e6bd-6829-4e8e-b302-842a7fe57ed9,009673d9-e2fd-46ef-a64b-6027ac7fd613,00994812-ac9e-4856-954b-f71bcc6066bb,009db49b-fcf5-4409-b4a5-232d059f3597,...,ff939984-b1a6-4d57-88bd-e453262433c4,ff9eeebf-d4c8-487f-8cd4-aa22cc471588,ff9f1770-2b94-4afa-bf82-1671d7f54b99,ffb30d85-b45e-45f8-b03b-e3ac16078daf,ffc1e455-ced2-4e67-bdb3-1c50f9683859,ffc69c38-ce70-4d08-85ac-54036ca80a87,ffc836d3-1897-4ec1-877e-a4f08f1e05c6,ffddfc7c-a9dd-449f-9e96-882dc447c8b6,fff0e2cd-d0bd-4b02-9daf-158b79d9688a,fff5bc45-b506-4466-8977-4869079c1cb2
occ_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
brown,1,539,65,29,3,40,38,1,1,26,...,17,1,2,54,26,180,0,284,68,81
green,0,22,0,2,0,0,0,0,0,0,...,2,0,0,11,5,11,0,11,35,4
neutral,0,172,19,9,2,8,11,0,0,18,...,5,3,0,21,23,62,1,4,6,41


## 3. Analyse skills with the highest TF-IDF scores
- For low/mid/high occupations based on greenness

In [152]:
def get_top_tfidf_skills(tfidf_scores: pd.DataFrame, occ_label: str, skill_names: dict) -> pd.DataFrame:
    """Rank skills by TF-IDF score for one occupation greenness label.

    The row is selected by its `occ_label` value rather than by its position in
    `tfidf_scores`, so the result does not depend on the row order of the matrix.

    Args:
        tfidf_scores (pd.DataFrame): one row per label, one column per skill id,
            plus an `occ_label` column
        occ_label (str): label of the row to rank, e.g. 'brown' or 'green'
        skill_names (dict): skill id -> skill name

    Returns:
        pd.DataFrame: columns `skill_id`, `tfidf_score` and `skill`, sorted by
            `tfidf_score` in descending order

    Raises:
        KeyError: if `occ_label` is not present in `tfidf_scores`
    """
    ranked_skills = (tfidf_scores
                     .set_index('occ_label')
                     .loc[occ_label]
                     .rename('tfidf_score')
                     .rename_axis('skill_id')
                     .reset_index()
                     .sort_values('tfidf_score', ascending=False))
    ranked_skills['skill'] = ranked_skills['skill_id'].map(skill_names)
    return ranked_skills


def plot_top_tfidf_skills(ranked_skills: pd.DataFrame, occ_label: str, n_skills: int = 10):
    """Plot, configure and save a bar chart of the top-ranked skills for a label.

    The configured chart is saved as `top_non_green_skills_{occ_label}.html`
    in `graph_dir`.

    Args:
        ranked_skills (pd.DataFrame): output of get_top_tfidf_skills
        occ_label (str): label used for the bar colour, chart title and file name
        n_skills (int): number of top rows to plot. Defaults to 10.

    Returns:
        tuple: (raw altair chart, chart returned by pt.configure_plots)
    """
    chart = alt.Chart(ranked_skills[:n_skills]).mark_bar(opacity=0.8,
                            color=colordict.get(occ_label)).encode(
        x=alt.X('tfidf_score', title='TF-IDF Score'),
        y=alt.Y('skill', sort='-x', title='Skill'))

    chart_config = pt.configure_plots(chart,
                                      chart_title=f'top {n_skills} non-green skills for {occ_label} occupations')
    chart_config.save(f"{graph_dir}/top_non_green_skills_{occ_label}.html")
    return chart, chart_config


label = 'brown'
brown_tfidf = get_top_tfidf_skills(tfidf_df, label, skillid2skilldict)
brown_chart, brown_chart_config = plot_top_tfidf_skills(brown_tfidf, label)

label = 'green'
green_tfidf_df = get_top_tfidf_skills(tfidf_df, label, skillid2skilldict)
green_chart, green_tfidf_config = plot_top_tfidf_skills(green_tfidf_df, label)
# `green_tfidf` previously ended this cell bound to the chart (after first holding
# the dataframe); the name is kept as an alias so existing references still work.
green_tfidf = green_chart

In [222]:
# Display the configured chart of top non-green skills for 'brown' occupations
brown_chart_config

In [223]:
# Display the configured chart of top non-green skills for 'green' occupations
green_tfidf_config