## Skill similarities across occupations

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg
from dap_prinz_green_jobs.getters.data_getters import save_to_s3, load_s3_data
from dap_prinz_green_jobs import BUCKET_NAME, logger, analysis_config
from dap_prinz_green_jobs import BUCKET_NAME, PROJECT_DIR, analysis_config
from dap_prinz_green_jobs.getters.ojo_getters import (
    get_mixed_ojo_location_sample,
    get_mixed_ojo_salaries_sample,
    get_large_ojo_location_sample,
    get_large_ojo_salaries_sample,
)
from dap_prinz_green_jobs.utils.plotting import configure_plots
from sklearn.metrics.pairwise import cosine_similarity
import altair as alt
from datetime import datetime
import yaml
import os

import pandas as pd
import numpy as np

from tqdm import tqdm
from textwrap import wrap

## Skills per occupation

In [3]:
# S3 folder holding the occupation similarity outputs for the date stamp
# configured under analysis_files -> occ_most_similar_date.
occ_sim_folder = (
    "outputs/data/ojo_application/extracted_green_measures/analysis/occupation_similarity/"
    f'{analysis_config["analysis_files"]["occ_most_similar_date"]}'
)

In [4]:
# Load the most-similar-occupations lookup (keyed by occupation name).
occ_most_similar = load_s3_data(
    BUCKET_NAME,
    f"{occ_sim_folder}/occ_most_similar.json",
)

# Exclude this occupation from the analysis. `pop` with a default is used
# rather than `del` so the cell still runs if a future version of the file no
# longer contains the key (mirrors the tolerant filter applied to `occ_agg`).
occ_most_similar.pop('Betting shop managers', None)

2024-03-20 16:29:46,245 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [5]:
# Load the occupation-level aggregated green measures table.
occ_agg = load_s3_data(
    BUCKET_NAME,
    f'outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{analysis_config["analysis_files"]["agg_soc_date_stamp"]}_all.csv',
)

# Drop the excluded occupation, and keep only occupations with more than 500
# job adverts - let's not include so many occupations, it makes the plot laggy.
keep_rows = (occ_agg['clean_soc_name'] != 'Betting shop managers') & (
    occ_agg['num_job_ads'] > 500
)

# reset_index() without drop=True keeps the old row labels in an "index" column.
occ_agg = occ_agg[keep_rows].reset_index()

2024-03-20 16:29:47,616 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [6]:
# Per-occupation skills information, stored alongside the similarity outputs.
occ_skills_info = load_s3_data(BUCKET_NAME, f"{occ_sim_folder}/occ_skills_info.json")

In [7]:
# Lookups keyed by extended SOC 2020 code, built from the filtered occ_agg table.
# SOC code -> number of job adverts
soc_2num = {
    soc_code: num_ads
    for soc_code, num_ads in zip(occ_agg['SOC_2020_EXT'], occ_agg['num_job_ads'])
}
# SOC code -> average proportion of green skills
soc_2_prop_green = {
    soc_code: prop_green
    for soc_code, prop_green in zip(
        occ_agg['SOC_2020_EXT'], occ_agg["average_prop_green_skills"]
    )
}

In [8]:
# Number of occupations with similarity results (after removing the excluded occupation)
len(occ_most_similar)

1329

## Format for plot

In [9]:
# Cleaned occupation name -> extended SOC 2020 code
soc_name_2_id = {
    pg.clean_soc_name(raw_name): skills_info['SOC_2020_EXT']
    for raw_name, skills_info in occ_skills_info.items()
}

In [10]:
# Build one long table with a row per (occupation, similar occupation) pair.
#
# The per-occupation frames are collected in a list and concatenated once at
# the end: calling pd.concat inside the loop re-copies every previously added
# row on each iteration, and seeding it with an empty DataFrame triggers a
# FutureWarning in recent pandas versions.
per_occupation_frames = []
for occ_name, occ_sim_details in occ_most_similar.items():
    occ_sim_details_df = pd.DataFrame(occ_sim_details)

    # Clean the names of the similar occupations so they match the keys of soc_name_2_id
    occ_sim_details_df["SOC_2020_EXT_name"] = occ_sim_details_df["SOC_2020_EXT_name"].apply(pg.clean_soc_name)

    # The occupation these rows are "similar to", and how many job adverts it has
    # (None if the occupation was filtered out of occ_agg)
    occ_sim_details_df['Occupation'] = occ_name
    occ_sim_details_df['Occupation_num_jobs_ads'] = soc_2num.get(soc_name_2_id[occ_name])

    # Average proportion of green skills of each *similar* occupation
    # (None if that occupation was filtered out of occ_agg)
    occ_sim_details_df["av_proportion_green_skills"] = occ_sim_details_df["SOC_2020_EXT_name"].apply(
        lambda similar_name: soc_2_prop_green.get(soc_name_2_id[similar_name])
    )
    occ_sim_details_df['similar_num_skills'] = occ_sim_details_df['number of skills']

    per_occupation_frames.append(occ_sim_details_df)

collated_df = pd.concat(per_occupation_frames)

# And only use data with an ok similarity.
# .copy() makes the filtered result an independent frame, since later cells add
# columns to it.
collated_df = collated_df[collated_df['similarity'] > 0.75].copy()

In [11]:
# Number of (occupation, similar occupation) rows left after the similarity filter
len(collated_df)

3799

In [12]:
# Add the green skills proportion expressed as a percentage, and the similar
# occupation name wrapped into a list of lines of at most 50 characters.
collated_df = collated_df.assign(
    av_perc_green_skills=lambda df: 100 * df["av_proportion_green_skills"],
    SOC_2020_EXT_name_wrapped=lambda df: df["SOC_2020_EXT_name"].apply(
        lambda name: wrap(name, 50)
    ),
)

## Plot

In [13]:
# save graphs
today = datetime.today().strftime("%y%m%d")
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

/Users/elizabethgallagher/Code/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/240320 directory already exists


In [14]:
# Only keep rows whose (selected) occupation has more than this many job adverts.
MIN_OCCUPATION_JOB_ADS = 2000

print(len(collated_df))

# Filter then sort by occupation name in one chained, non-inplace expression:
# an inplace sort on a boolean-filtered frame can raise SettingWithCopyWarning,
# whereas sort_values() here returns a new, independent frame.
collated_df = collated_df[
    collated_df['Occupation_num_jobs_ads'] > MIN_OCCUPATION_JOB_ADS
].sort_values(by='Occupation')

print(len(collated_df))

3799
2035


In [59]:
# Replace missing green skill percentages with the sentinel -1 so every bar has
# a value for the colour encoding. Uses the vectorised fillna rather than a
# per-element Python lambda.
collated_df['av_perc_green_skills_fixed'] = collated_df['av_perc_green_skills'].fillna(-1)

In [75]:
# Occupations offered in the dropdown (unique values in order of appearance)
occupation_options = list(collated_df["Occupation"].unique())

# Dropdown bound to a point selection on "Occupation"; the first option is
# selected when the chart loads.
select_box = alt.binding_select(options=occupation_options, name="Occupation ")
selection = alt.selection_point(
    value=occupation_options[0],
    fields=["Occupation"],
    bind=select_box,
)

# Colour scale for the percentage of green skills, centred on the median.
# The domain is taken from the column that still contains missing values; the
# encoded column uses -1 in their place.
green_skills_scale = alt.Scale(
    scheme='goldgreen',
    reverse=False,
    domain=[0, collated_df['av_perc_green_skills'].max()],
    domainMid=collated_df['av_perc_green_skills'].quantile(q=0.5),
)

# Details shown when hovering over a bar
bar_tooltips = [
    alt.Tooltip("SOC_2020_EXT_name", title="Occupation"),
    alt.Tooltip("similarity", title="Similarity score", format=".2"),
    alt.Tooltip("average number of skills per job advert", title="Average number of skills per job advert"),
    alt.Tooltip("av_proportion_green_skills", title="Average percentage of green skills", format=".2%"),
    alt.Tooltip("popular_overlap", title="Popular skills overlap"),
]

# Horizontal bars: one per similar occupation, ordered by similarity, and
# filtered to the occupation chosen in the dropdown.
# (optional chart padding: padding={"left": 10, "top": 100, "right": 10, "bottom": 10})
similar_skills_plot = (
    alt.Chart(collated_df)
    .mark_bar()
    .encode(
        x=alt.X('similarity', title='Skill similarity', scale=alt.Scale(domain=[0, 1])),
        y=alt.Y('SOC_2020_EXT_name', sort='-x', title='', axis=alt.Axis(labelLimit=300)),
        color=alt.Color(
            'av_perc_green_skills_fixed',
            title=["Average percentage", "of green skills"],
            scale=green_skills_scale,
            legend=None,
        ),
        tooltip=bar_tooltips,
    )
    .add_params(selection)
    .transform_filter(selection)
)



In [86]:
# Make the chart fill the width of its container at a fixed height
responsive_plot = similar_skills_plot.properties(
    width='container', height=300
).configure(autosize="fit-x")

# Apply the project's plot styling and title
styled_plot = configure_plots(
    responsive_plot,
    chart_title="Most similar occupations based off skill similarity",
    fontsize_normal=16,
    fontsize_title=18,
)

# Offset the title (leaving room for the dropdown, which is moved above the
# plot later in the html) and write the chart out as a standalone html file.
styled_plot.configure_title(offset=100).save(f"{graph_dir}/similar_skills_plot.html")

In [87]:
# configure_title will allow us to put the dropdown above the plot (done later in the html)