## Create new skills plots 

This notebook creates graphs to explore new 'green' skills. 

It relies on a network approach to identify the most commonly co-occuring non-green skills in job adverts that also contain green skills.

In [1]:
#load imports 
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg
from dap_prinz_green_jobs import PROJECT_DIR, BUCKET_NAME, analysis_config
from dap_prinz_green_jobs.getters.data_getters import load_s3_data
import dap_prinz_green_jobs.utils.plotting as pt

import os
from datetime import datetime
import random 
from tqdm import tqdm
from itertools import combinations, chain

import pandas as pd
import altair as alt

import networkx as nx
import nx_altair as nxa
from textwrap import wrap

## 0. Load variables and functions

In [2]:
# save graphs
today = datetime.today().strftime("%y%m%d")
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

/Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/240122 directory already exists


In [3]:
#alt disable max rows

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
#define variables here 
bad_coefs = analysis_config['bad_coef_threshold']

## 0. Load & Clean Data

- load:
    - aggregated occupation data to label job ids as low/mid/high green occupations
    - skills data to generate the non-green/green skills network
    - occupations data to map job ids to soc codes

- clean:
    - skills data to add additional occupational metadata
    - filter skills data to only include job adverts with at least 1 skill and at least 1 green skill

In [5]:
## LOAD RELEVANT DATASETS

#download aggregated occupation data to get non-green/mid-green/high-green occupations 

occ_date = analysis_config['analysis_files']['agg_soc_date_stamp']
print(occ_date)
occ_agg = pd.read_csv(f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{occ_date}.csv')

occ_agg = occ_agg.query('num_job_ads > 100').reset_index(drop=True)

#download skills data

green_skills_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{analysis_config['skills_date_stamp']}/ojo_large_sample_skills_green_measures_production_{analysis_config['production']}.csv",
    )
green_skills_outputs["GREEN_ENTS"] = green_skills_outputs["GREEN_ENTS"].apply(
    pg.safe_literal_eval
)
green_skills_outputs["ENTS"] = green_skills_outputs["ENTS"].apply(pg.safe_literal_eval)


full_skill_mapping =pg.load_full_skill_mapping(analysis_config)
skills_df = pg.create_skill_df(green_skills_outputs, full_skill_mapping=full_skill_mapping)

#download occupations data for job id to soc code mapper

green_occs_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{analysis_config['occ_date_stamp']}/ojo_large_sample_occupation_green_measures_production_{analysis_config['production'].lower()}.csv",
    )
green_occs_outputs = pg.process_soc_columns(green_occs_outputs)

soc_name_dict = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{analysis_config['occ_date_stamp']}/soc_name_dict.json",
    )

20240112
2024-01-22 09:22:14,369 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2024-01-22 09:22:14,912 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2024-01-22 09:24:47,664 - dap_prinz_green_jobs - INFO - Loading full skills mappings to ESCO from 100 S3 files


100%|██████████| 100/100 [01:32<00:00,  1.09it/s]
100%|██████████| 13436987/13436987 [00:24<00:00, 559381.97it/s]


In [6]:
## CLEAN RELEVANT DATASETS

#let's add soc occupation name to job id 
green_occs_outputs['soc_name_6'] = green_occs_outputs['SOC_2020_EXT'].map(soc_name_dict['soc_2020_6'])
green_occs_outputs['soc_name_4'] = green_occs_outputs['SOC_2020'].map(soc_name_dict['soc_2020_4'])

jobid2socdict = green_occs_outputs.set_index('job_id')['soc_name_4'].to_dict()
skills_df['occupation'] = skills_df['job_id'].map(jobid2socdict)

#then, lets add occupation greeness to job id
occ_agg['occupation_name'] = occ_agg['SOC_2020'].astype(str).map(soc_name_dict['soc_2020_4'])
soc42occgreeness = occ_agg.set_index('occupation_name')['occ_greenness'].to_dict()

skills_df['occupation_greenness'] = skills_df['occupation'].map(soc42occgreeness)

## finally, let's filter the skills df to:
# - only include job ids from green occupations
# - contain at least 1 skill
# - contain at least 1 green skill

nongreen_job_ids = skills_df.groupby('job_id')['extracted_green_skill'].count().reset_index().query('extracted_green_skill == 0').job_id.tolist()
skills_df = skills_df.query('job_id not in @nongreen_job_ids')

skills_df_skill = skills_df.query('~extracted_full_skill_id.isna() or ~extracted_green_skill_id.isna()')
skills_df_skill = skills_df_skill.query('occupation_greenness == "high"').reset_index(drop=True)

In [7]:
print(f"Number of job ids that are green occupations: {skills_df_skill.job_id.nunique()}")

Number of job ids that are green occupations: 34012


## 2. Method to identify 'new' green skills

- Goal: find the most commonly co-occurring green to non-green skills in job adverts from green occupations

### 2.1 Create skill co-occurence dataframe

In [8]:
## Firstly, let's create a dataframe that creates a column of all skills (green and non-green) per job advert 
all_skills_per_job_id = (skills_df_skill
                         .groupby('job_id')
                         .agg({'extracted_full_skill_id': lambda x: [i for i in list(x.unique()) if isinstance(i, str)],
                               'extracted_green_skill_id': lambda x:  [i for i in list(x.unique()) if isinstance(i, str)]})
                         .reset_index()
                         .assign(all_skills = lambda x: x.extracted_full_skill_id + x.extracted_green_skill_id))

## let's also create a dictionary of all skills per job id where the key is the job id and the value is a list of all skills extracted from the job advert
all_skills_per_job_id_dict = all_skills_per_job_id[['job_id', 'all_skills']].set_index('job_id')['all_skills'].T.to_dict()

In [9]:
## Now, let's count the number of times each skill combination appears
skill_combinations_count = {}
for job_id, skills_list in tqdm(all_skills_per_job_id_dict.items()):
    skill_combo_per_job_id = list(combinations(skills_list, 2))
    for combo in skill_combo_per_job_id:
        sorted_combo = tuple(sorted(combo))
        if sorted_combo not in skill_combinations_count:
            skill_combinations_count[sorted_combo] = set()
        skill_combinations_count[sorted_combo].add(job_id)
skill_combinations_count = {k: len(v) for k, v in skill_combinations_count.items()}

100%|██████████| 34012/34012 [00:02<00:00, 14635.06it/s]


In [10]:
#Finally, let's create a dataframe from the skill combination count dictionary
skill_combinations_count_df = pd.DataFrame(skill_combinations_count, index=['weight']).T.reset_index()
skill_combinations_count_df.columns = ['skill1', 'skill2', 'weight']
skill_combinations_count_df = skill_combinations_count_df.query('weight > 1').reset_index(drop=True)

print(f"the number of skill combinations is {len(skill_combinations_count_df)}")

the number of skill combinations is 282080


### 2.2 Create skill co-occurence network
- create an undirected, weighted network 
- add node attributes for whether the node is a green skill or not
- prune network to:
    - remove edges with weights of 1
    - remove notes with low clustering co-efficients i.e. highly transversal skills
- create subgraph of network with edges that connect green skills to non-green skills

In [11]:
#firstly, let's create a networkx graph from the skill combination count dataframe weight weights of 1 or more 

G = nx.from_pandas_edgelist(skill_combinations_count_df, 'skill1', 'skill2', 'weight')

In [12]:
#let's further prune the graph by removing nodes with a clustering coeffient of less than 0.3
clustering_coefs = nx.clustering(G)
nodes = list(clustering_coefs.keys())
clustering_coefs_to_get_rid_of = []
for node in nodes:
    if clustering_coefs[node] < bad_coefs: 
        clustering_coefs_to_get_rid_of.append(
            {node: clustering_coefs[node]})
clustering_coefs_to_get_rid_of = {
    k: v for el in clustering_coefs_to_get_rid_of for k, v in el.items()}
print(f'removing {len(clustering_coefs_to_get_rid_of)} nodes based on clustering coef')
# remove skill nodes based on clustering coef
bad_skills = list(clustering_coefs_to_get_rid_of.keys())
G.remove_nodes_from(bad_skills)

removing 439 nodes based on clustering coef


In [13]:
#let's add node metadata to the graph for whether the skill is green or not

#is the skill green or not?
green_skills = list(set(chain(*all_skills_per_job_id.extracted_green_skill_id.to_list())))
nongreen_skills = list(set(chain(*all_skills_per_job_id.extracted_full_skill_id.to_list())))

green_skills_dict = {k: 'Y' for k in green_skills}
nongreen_skills_dict = {k: 'N' for k in nongreen_skills}

binary_skill_dict = {**green_skills_dict, **nongreen_skills_dict}

nx.set_node_attributes(G, binary_skill_dict, "green skill?")

# add skill name as node attribute
skill_name_dict = {v[1]:v[0] for k, v in full_skill_mapping.items()}
nx.set_node_attributes(G, skill_name_dict, "skill_name")

In [14]:
#finally, let's filter graph based on whether the two nodes are green or not
filtered_edges = [(u, v) for u, v, data in G.edges(data=True) if G.nodes[u]["green skill?"] == "Y" and G.nodes[v]["green skill?"] == "N"]
filtered_G = G.edge_subgraph(filtered_edges)

#within the subgraph, only maintain the largest connected component to visualise
# Find connected components
connected_components = list(nx.connected_components(filtered_G))
# Keep only the nodes in the largest connected component
largest_component = max(connected_components, key=len)
subgraph = filtered_G.subgraph(largest_component)

# Print the result
print("Original Graph Nodes:", len(G.nodes))
print("Original Graph Edges:", len(G.edges))
print("Filtered Graph Nodes:", len(subgraph.nodes))
print("Filtered Graph Edges:", len(subgraph.edges))

Original Graph Nodes: 4899
Original Graph Edges: 185615
Filtered Graph Nodes: 1774
Filtered Graph Edges: 3528


### 2.3 Create skill co-occurence graphs
- analyse graph to produce altair graphs 

In [15]:
# graph 1: network - filter filter for the top 100 skills based on edge weight

filtered_graph = nx.Graph(filtered_edges)


pos = nx.spring_layout(subgraph)
chart = nxa.draw_networkx(
    G=subgraph,
    pos=pos,
    node_size=200,
    node_color='green skill?',
    width='weight',
    node_tooltip=['skill_name', 'green skill?']
).interactive().properties(width=800, height=800)

chart_clean = pt.configure_plots(chart, chart_title='Skill Co-occurence Network', chart_subtitle=['Green skills are in green and non-green skills are in yellow.','The width of the edges represents the number of times the two skills co-occur in a job advert.'])

chart_clean.save(f"{graph_dir}/skill_cooccurence_network.html")

In [227]:
chart_clean

In [228]:
#graph 2: top skill combinations by degree centrality

degree_centrality = nx.degree_centrality(filtered_G)
# Identify nodes that are not green
non_green_nodes = [node for node, green_skill in nx.get_node_attributes(filtered_G, 'green skill?').items() if green_skill == 'N']

# Sort non-green nodes based on degree centrality
sorted_non_green_nodes = sorted(non_green_nodes, key=lambda x: degree_centrality[x], reverse=True)[:10]
top_nongreen_skills = {k:v for k,v in degree_centrality.items() if k in sorted_non_green_nodes}


top_nongreen_skills_df = pd.DataFrame.from_dict(top_nongreen_skills, orient='index').reset_index()
top_nongreen_skills_df.columns = ['skill', 'degree centrality']
top_nongreen_skills_df['skill_name'] = top_nongreen_skills_df.skill.map(skill_name_dict)
top_nongreen_skills_df['skill_name_wrapped'] = top_nongreen_skills_df['skill_name'].apply(wrap, args=[50])

hex_code = pt.NESTA_COLOURS[0]
graph = alt.Chart(top_nongreen_skills_df).mark_bar().encode(
    x=alt.X('degree centrality:Q', title='Degree Centrality'),
    y=alt.Y('skill_name_wrapped:N', title='', sort='-x', axis=alt.Axis(labelLimit=500)),
    color=alt.value(hex_code)).properties(height=400, width=600)

graph_config = pt.configure_plots(graph, 
                                  chart_title='Top 10 Non-Green Skills in Green Occupations by Degree Centrality', 
                                  chart_subtitle=['Degree centrality is the number of connections a node has to other nodes in the network.'])

graph_config.save(f"{graph_dir}/top_nongreen_skills_by_degree_centrality.html")

In [229]:
graph_config