## Create violin plots

Create violin plots of the greenness measures for the Green Jobs Explorer.

In [1]:
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg

from dap_prinz_green_jobs import PROJECT_DIR, analysis_config
import dap_prinz_green_jobs.utils.plotting as pt
from datetime import datetime
import os
import pandas as pd

import altair as alt

In [3]:
# Figures are written to a folder stamped with today's date (YYMMDD)
date_stamp_format = '%y%m%d'
today = datetime.today().strftime(date_stamp_format)
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

# Report whether the folder is already there; create it only when missing
if os.path.exists(graph_dir):
    print(f"{graph_dir} directory already exists")
else:
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)

/Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/231215 directory already exists


In [4]:
# Lift Altair's default row limit so the whole table can be embedded in the charts
alt.data_transformers.disable_max_rows()

# First three entries of the Nesta palette, named by the colour they are used as below
yellow, green, purple = (pt.NESTA_COLOURS[idx] for idx in range(3))

# Pixel size shared by every violin panel
chart_width, chart_height = 200, 450

# soc_name: an example occupation name (not referenced in the cells visible here)
# x_value: horizontal pixel position at which the occupation dots are drawn
soc_name, x_value = "Aerospace engineers", 96

## 0. Load data

In [6]:
# The date stamp in the analysis config selects which aggregated occupation file is read
occ_date = analysis_config['analysis_files']['agg_soc_date_stamp']
occ_agg_path = f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{occ_date}.csv'

# Read the table, keep only rows backed by more than 100 job adverts, and renumber the index
occ_agg = (
    pd.read_csv(occ_agg_path)
    .query('num_job_ads > 100')
    .reset_index(drop=True)
)

2023-12-15 16:41:10,167 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


## 1. Generate violin plots

In [8]:
# Show the available columns; the measures plotted below are picked from this list
occ_agg.columns

Index(['SOC_2020_EXT', 'SOC_2020_EXT.1', 'num_job_ads', 'prop_job_ads',
       'top_5_socs', 'occ_timeshare', 'occ_topics',
       'average_occ_green_timeshare', 'average_num_skills',
       'average_prop_green_skills', 'top_5_green_skills',
       'top_5_not_green_skills', 'num_unique_SIC2', 'num_null_sic2',
       'num_top_sic2', 'num_other_sic2', 'average_ind_perunit_ghg',
       'average_ind_prop_hours', 'average_ind_prop_workers', 'top_5_sics',
       'median_min_annualised_salary', 'median_max_annualised_salary',
       'top_5_itl2_prop', 'SOC_2020_EXT_name', 'clean_soc_name',
       'soc_description', 'SOC_2020', 'SOC_2010', 'green_topics_lists',
       'occ_greenness', 'ind_greenness', 'skills_greenness',
       'greenness_score'],
      dtype='object')

In [9]:
def make_violin(data, field, colour, y_title, width, height):
    """Build a horizontal, centre-stacked density ("violin") chart for one measure.

    Args:
        data: DataFrame holding the column to plot.
        field: Name of the column whose distribution is drawn.
        colour: Fill colour of the density area.
        y_title: Title displayed on the measure (y) axis.
        width: Chart width in pixels.
        height: Chart height in pixels.

    Returns:
        An Altair chart showing the density estimate of ``field`` as an area
        stacked around the centre of the x axis.
    """
    # The density axis carries no information of its own, so labels, grid and
    # title are hidden and only a single tick at 0 is kept.
    density_axis = (
        alt.X('density:Q')
        .stack('center')  # mirror the area around the centre line -> violin shape
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True)
    )
    return (
        alt.Chart(data)
        # Overwrite `field` with the density sample positions and add a `density` column
        .transform_density(field, as_=[field, 'density'])
        .mark_area(orient='horizontal', color=colour)
        .encode(density_axis, alt.Y(f'{field}:Q', title=y_title))
        .properties(width=width, height=height)
    )


# One violin per greenness measure; all panels share the same pixel size.
skills_vio = make_violin(
    occ_agg,
    'average_prop_green_skills',
    green,
    "Average proportion of green skills",
    chart_width,
    chart_height,
)

ind_vio = make_violin(
    occ_agg,
    'average_ind_perunit_ghg',
    yellow,
    "Average GHG emissions per unit of economic activity",
    chart_width,
    chart_height,
)

# Both occupation-level measures use the same (purple) colour.
occ1_vio = make_violin(
    occ_agg,
    'average_occ_green_timeshare',
    purple,
    "Fraction of time spent doing green tasks",
    chart_width,
    chart_height,
)

occ2_vio = make_violin(
    occ_agg,
    'occ_topics',
    purple,
    "Number of O*NET green topics",
    chart_width,
    chart_height,
)

In [17]:
def make_selected_dot(data, field, selection, x_pixel):
    """Build a red dot layer marking the selected occupation's value of ``field``.

    Args:
        data: DataFrame holding one row per occupation.
        field: Name of the column plotted on the y axis.
        selection: Altair selection parameter used to filter the rows shown.
        x_pixel: Fixed horizontal pixel position of the dot.

    Returns:
        An Altair point chart filtered by ``selection``.
    """
    return (
        alt.Chart(data)
        .mark_point(color="red", filled=True, size=100, opacity=0.6)
        .encode(x=alt.value(x_pixel), y=f'{field}:Q')
        .transform_filter(selection)
    )


# Dropdown options: drop missing names (NaN is not a valid option value) and
# sort alphabetically so the occupation list is easy to scan.
occupation_options = sorted(occ_agg['SOC_2020_EXT_name'].dropna().unique())

# `selection_point` is the Altair 5 replacement for the deprecated `selection_single`.
occupation_dropdown = alt.selection_point(
    fields=['SOC_2020_EXT_name'],
    name='OccupationSelect',
    bind=alt.binding_select(options=occupation_options),
)

# One dot layer per violin, all driven by the same dropdown selection
skills_dots = make_selected_dot(occ_agg, 'average_prop_green_skills', occupation_dropdown, x_value)
ind_dots = make_selected_dot(occ_agg, 'average_ind_perunit_ghg', occupation_dropdown, x_value)
occ1_dots = make_selected_dot(occ_agg, 'average_occ_green_timeshare', occupation_dropdown, x_value)
occ2_dots = make_selected_dot(occ_agg, 'occ_topics', occupation_dropdown, x_value)


In [18]:
# Overlay each violin with its dot layer, then lay the four panels out side by side
vio_chart_with_dots = (occ1_vio + occ1_dots) | (occ2_vio + occ2_dots) | (skills_vio + skills_dots) | (ind_vio + ind_dots)

# Attach the dropdown once at the top level so a single widget drives all four panels.
# `add_params` is the Altair 5 replacement for the deprecated `add_selection`.
final_chart = alt.vconcat(vio_chart_with_dots).add_params(
    occupation_dropdown
)

# Apply the project's shared plot configuration.
# NOTE(review): presumably styling only (fonts/colours); no title is passed here - confirm in utils.plotting
vio_chart_with_dots_config = pt.configure_plots(final_chart)

In [19]:
# Display the configured interactive chart as this cell's output
vio_chart_with_dots_config

In [20]:
# Write the interactive chart as a standalone HTML file in today's figures folder
vio_chart_html_path = f"{graph_dir}/vio_chart_with_dots.html"
vio_chart_with_dots_config.save(vio_chart_html_path)