## Create violin plots

Create violin plots of the greenness measures for the Green Jobs Explorer.

In [1]:
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg

from dap_prinz_green_jobs import PROJECT_DIR, analysis_config
import dap_prinz_green_jobs.utils.plotting as pt
from datetime import datetime
import os
import pandas as pd
import numpy as np
import altair as alt

In [2]:
# Figures are written to a folder stamped with today's date (YYMMDD)
today = datetime.today().strftime('%y%m%d')
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

# Create the folder the first time the notebook runs on a given day;
# on later runs just report that it is already there
if os.path.exists(graph_dir):
    print(f"{graph_dir} directory already exists")
else:
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)

/Users/elizabethgallagher/Code/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/240205 directory already exists


In [3]:
# Remove Altair's max-rows limit so the full occupation table can be charted
alt.data_transformers.disable_max_rows()

# Palette entries used for the three greenness measures
yellow = pt.NESTA_COLOURS[0]
green = pt.NESTA_COLOURS[1]
purple = pt.NESTA_COLOURS[2]

# Size of each violin panel
chart_width, chart_height = 200, 450

# Fixed x position (passed to alt.value) of the highlighted-occupation dot on each violin
x_value = 96

## 0. Load data

In [4]:
# Date stamp from the analysis config selecting which aggregated file to read
occ_date = analysis_config['analysis_files']['agg_soc_date_stamp']
occ_agg_path = f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{occ_date}.csv'

# Keep only occupations with more than 100 job adverts, then renumber the rows
occ_agg = (
    pd.read_csv(occ_agg_path)
    .query('num_job_ads > 100')
    .reset_index(drop=True)
)

2024-02-05 17:42:12,839 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [5]:
# Add a second scale for two of the measures so plots and tooltips can pick the one they need:
#  - green skills: proportion column multiplied by 100 to give a percentage
#  - green timeshare: column divided by 100 to give a proportion
occ_agg['average_perc_green_skills'] = occ_agg['average_prop_green_skills'].mul(100)
occ_agg['average_prop_occ_green_timeshare'] = occ_agg['average_occ_green_timeshare'].div(100)

## 1. Generate violin plots

In [6]:
# Alphabetically sorted occupation names offered in the dropdown
occ_names = sorted(occ_agg['clean_soc_name'].unique())

# Dropdown-bound selection used to highlight a single occupation on the violins.
# `alt.selection_point` is used because `alt.selection_single` is deprecated in Altair 5.
occupation_dropdown = alt.selection_point(
    fields=['clean_soc_name'],
    name='OccupationSelect',
    value=occ_names[0],
    bind=alt.binding_select(options=occ_names))



In [7]:
def make_violin(data, field, colour, axis_title):
    """Build one horizontal, centre-stacked density ("violin") chart.

    Args:
        data: DataFrame holding one row per occupation.
        field: Name of the numeric column whose distribution is drawn.
        colour: Fill colour of the violin.
        axis_title: Title shown on the value (y) axis.

    Returns:
        An Altair chart sized chart_width x chart_height.
    """
    return (
        alt.Chart(data)
        # Replace the raw values with a kernel density estimate over `field`
        .transform_density(field, as_=[field, 'density'])
        .mark_area(orient='horizontal', color=colour)
        .encode(
            # Centre-stacking mirrors the density around the axis, giving the violin shape
            alt.X('density:Q')
                .stack('center')
                .impute(None)
                .title(None)
                .axis(labels=False, values=[0], grid=False, ticks=True),
            alt.Y(f'{field}:Q', title=axis_title),
        )
        .properties(width=chart_width, height=chart_height)
    )


# Axis titles state the unit actually plotted: the skills column was multiplied by 100,
# and the timeshare column is the one labelled "(%)" on the beeswarm plots.
skills_vio = make_violin(
    occ_agg,
    'average_perc_green_skills',
    green,
    "Average percentage of green skills (%)",
)

ind_vio = make_violin(
    occ_agg,
    'average_ind_perunit_ghg',
    yellow,
    "Average GHG emissions per unit of economic activity",
)

occ1_vio = make_violin(
    occ_agg,
    'average_occ_green_timeshare',
    purple,
    "Percentage of time spent on green tasks (%)",
)

In [8]:


def _selected_occupation_dot(measure):
    """Red marker at the dropdown-selected occupation's value of `measure`, drawn at the fixed x position."""
    marker = alt.Chart(occ_agg).mark_point(color="red", filled=True, size=100, opacity=0.6)
    positioned = marker.encode(
        x=alt.value(x_value),
        y=f'{measure}:Q',
    )
    # Only the row(s) matching the dropdown selection are kept
    return positioned.transform_filter(occupation_dropdown)


# One highlight marker per violin, each following the occupation dropdown
skills_dots = _selected_occupation_dot('average_perc_green_skills')
ind_dots = _selected_occupation_dot('average_ind_perunit_ghg')
occ1_dots = _selected_occupation_dot('average_occ_green_timeshare')

In [9]:
# Place the three violins side by side, each layered with its highlight dot
vio_chart_with_dots = (occ1_vio + occ1_dots) | (skills_vio + skills_dots) | (ind_vio + ind_dots)

# Attach the occupation dropdown to the combined chart.
# `add_params` is the Altair 5 replacement for the deprecated `add_selection`.
final_chart = alt.vconcat(vio_chart_with_dots).add_params(
    occupation_dropdown
)

# Apply the project's plot configuration (no chart_title is passed for this chart)
vio_chart_with_dots_config = pt.configure_plots(final_chart)



In [10]:
# Display the configured violin chart as the cell output
vio_chart_with_dots_config

In [11]:
# Write the violin chart to an HTML file inside the dated figure directory
vio_chart_with_dots_config.save(f"{graph_dir}/vio_chart_with_dots.html")

## Beeswarm plot
- Use the number of occupations falling in each bin of a measure to set the amount of jitter added to each point: the more occupations share a bin, the wider the points in that bin are spread.

In [12]:
# Remove occupations that have no green-timeshare value before binning.
# NOTE(review): this mutates occ_agg in place, so anything already holding a reference
# to it (presumably the violin charts built above) sees the reduced frame too — confirm intended.
occ_agg.dropna(subset=['average_occ_green_timeshare'], inplace=True)

In [13]:
n_cuts = 100

# Seeded generator so the jitter, and therefore the saved figure, is identical on every run
jitter_seed = 42
jitter_rng = np.random.default_rng(jitter_seed)

# For each measure, add three columns:
#   <measure>_cut       left edge of the equal-width interval the value falls in
#   <measure>_cut_n     number of occupations sharing that interval
#   <measure>_x_jitter  random offset drawn uniformly from [-cut_n, cut_n], so points in
#                       crowded intervals are spread further apart than points in sparse ones
for measure in (
    'average_prop_green_skills',
    'average_ind_perunit_ghg',
    'average_occ_green_timeshare',
):
    occ_agg[f'{measure}_cut'] = pd.cut(occ_agg[measure], n_cuts).apply(lambda x: float(x.left))

    num_each_interval = occ_agg[f'{measure}_cut'].value_counts().to_dict()
    occ_agg[f'{measure}_cut_n'] = occ_agg[f'{measure}_cut'].map(num_each_interval)

    occ_agg[f'{measure}_x_jitter'] = occ_agg[f'{measure}_cut_n'].apply(
        lambda n_in_bin: jitter_rng.uniform(-n_in_bin, n_in_bin)
    )



In [14]:
# Recompute the option list from the current occ_agg, in alphabetical order
occ_names = sorted(occ_agg['clean_soc_name'].unique())

# Labelled dropdown widget; the selection starts on the first occupation
highlight_binding = alt.binding_select(options=occ_names, name='Highlight occupation: ')
occupation_dropdown = alt.selection_point(
    fields=['clean_soc_name'],
    value=occ_names[0],
    bind=highlight_binding,
)

In [15]:
circle_size = 50
select_dot_size = 100
circle_alpha = 0.3


def make_beeswarm_layers(value_field, jitter_field, tooltip_field, axis_title, tooltip_format, colour):
    """Build the two layers of one beeswarm strip from occ_agg.

    Args:
        value_field: Column plotted along the x axis.
        jitter_field: Column holding the random offset plotted along the (unlabelled) y axis.
        tooltip_field: Column shown in the tooltip; may differ from value_field when the
            tooltip uses a different scale (e.g. a proportion formatted as a percentage).
        axis_title: Title used for both the x axis and the tooltip value.
        tooltip_format: Number format string for the tooltip value.
        colour: Colour of the circles.

    Returns:
        (swarm, highlight): the chart of all occupations, and a red marker restricted to
        the occupation chosen in occupation_dropdown.
    """
    swarm = alt.Chart(
        occ_agg,
    ).mark_circle(size=circle_size, color=colour, opacity=circle_alpha).encode(
        # The jitter axis carries no meaning, so its labels and grid are hidden
        y=alt.Y(jitter_field, title="").axis(labels=False, values=[0], grid=False, ticks=True),
        x=alt.X(f'{value_field}:Q', title=axis_title).axis(grid=False, ticks=True),
        tooltip=[
            alt.Tooltip("clean_soc_name", title="Occupation"),
            alt.Tooltip(tooltip_field, title=axis_title, format=tooltip_format),
        ],
    )

    # Same coordinates as the swarm point, so the marker sits on top of the selected occupation
    highlight = alt.Chart(occ_agg).mark_point(
        color="red", filled=True, size=select_dot_size, opacity=0.6
    ).encode(
        y=jitter_field,
        x=f'{value_field}:Q',
    ).transform_filter(occupation_dropdown)

    return swarm, highlight


# Green skills: plotted as a percentage, tooltip formats the proportion column as a percentage
skills_beeswarm, skills_beeswarm_dots = make_beeswarm_layers(
    value_field='average_perc_green_skills',
    jitter_field='average_prop_green_skills_x_jitter',
    tooltip_field='average_prop_green_skills',
    axis_title="Average percentage of green skills (%)",
    tooltip_format='.2%',
    colour=green,
)

# Industry GHG emissions
inds_beeswarm, inds_beeswarm_dots = make_beeswarm_layers(
    value_field='average_ind_perunit_ghg',
    jitter_field='average_ind_perunit_ghg_x_jitter',
    tooltip_field='average_ind_perunit_ghg',
    axis_title="Average GHG emissions per unit of economic output",
    tooltip_format='.3',
    colour=yellow,
)

# Occupation green timeshare: tooltip formats the proportion column as a percentage
occs_beeswarm, occs_beeswarm_dots = make_beeswarm_layers(
    value_field='average_occ_green_timeshare',
    jitter_field='average_occ_green_timeshare_x_jitter',
    tooltip_field='average_prop_occ_green_timeshare',
    axis_title="Percentage of time spent on green tasks (%)",
    tooltip_format='.2%',
    colour=purple,
)



In [16]:
# Stack the three beeswarm strips vertically, each layered with its highlight marker.
# width='container' lets every strip stretch to the width of the element it is embedded in.
beeswarm_chart_with_dots = alt.vconcat(
    (occs_beeswarm + occs_beeswarm_dots).properties(width='container', height=100),
    (skills_beeswarm + skills_beeswarm_dots).properties(width='container', height=100),
    (inds_beeswarm + inds_beeswarm_dots).properties(width='container', height=100),
    spacing=50
)

# Attach the occupation dropdown.
# `add_params` is the Altair 5 replacement for the deprecated `add_selection`.
beeswarm_final_chart = beeswarm_chart_with_dots.add_params(
    occupation_dropdown
)

# Apply the project's plot configuration with a fixed chart title and larger fonts
beeswarm_final_chart_config = pt.configure_plots(
    beeswarm_final_chart.configure(autosize="fit-x"),
    fontsize_normal=16,
    fontsize_title=18,
    chart_title="Green measure values across occupations"
)

In [17]:
# Write the beeswarm chart to an HTML file inside the dated figure directory
beeswarm_final_chart_config.save(f"{graph_dir}/beeswarm_chart_with_dots.html")

In [18]:
# Display the configured beeswarm chart as the cell output
beeswarm_final_chart_config