## Create violin plots

Create violin plots of the greenness measures for the Green Jobs Explorer.

In [50]:
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg

from dap_prinz_green_jobs import PROJECT_DIR
import dap_prinz_green_jobs.utils.plotting as pt
from datetime import datetime
import os
import pandas as pd

import altair as alt

In [2]:
# Figures are saved to a folder named after today's date (YYMMDD), one folder per day.
today = datetime.today().strftime('%y%m%d')
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

# Report whether the folder is already there; create it only when it is missing.
if os.path.exists(graph_dir):
    print(f"{graph_dir} directory already exists")
else:
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)

/Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/231215 directory already exists


In [272]:
# Lift Altair's default cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

# Violin fill colours: the first three entries of the project palette.
yellow, green, purple = (pt.NESTA_COLOURS[i] for i in range(3))

# Width and height applied to every individual violin panel.
chart_width, chart_height = 200, 450

# Occupation of interest.
# NOTE(review): soc_name is not referenced by any other cell visible in this notebook.
soc_name = "Aerospace engineers"

# Fixed x position passed to alt.value() when drawing the selected-occupation dot
# on top of each violin.
x_value = 96

## 0. Load data

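Load the skill, occupation and industry greenness measures and merge them into a single table with one row per job advert.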

In [4]:
# Load the skill-, occupation- and industry-level green measure tables, plus
# soc_name_dict (presumably a SOC code -> name lookup; TODO confirm against pg).
(
    skill_measures_df,
    occs_measures_df,
    inds_measures_df,
    soc_name_dict,
) = pg.load_ojo_green_measures()

2023-12-15 13:54:16,283 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2023-12-15 13:54:16,592 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [5]:
# Merge the three measure tables (and the SOC name lookup) into one DataFrame.
all_green_measures_df = pg.merge_green_measures(
    skill_measures_df=skill_measures_df,
    occs_measures_df=occs_measures_df,
    inds_measures_df=inds_measures_df,
    soc_name_dict=soc_name_dict,
)

There are 1000000 rows in the merged data
There are 1000000 unique job ids


In [6]:
# Sanity check of the merged table: dimensions, column names, then the first two rows.
print(all_green_measures_df.shape, all_green_measures_df.columns, sep="\n")
all_green_measures_df.head(2)

(1000000, 30)
Index(['job_id', 'NUM_ORIG_ENTS', 'NUM_SPLIT_ENTS', 'ENTS', 'GREEN_ENTS',
       'PROP_GREEN', 'BENEFITS', 'GREEN CATEGORY', 'GREEN/NOT GREEN',
       'GREEN TIMESHARE', 'GREEN TOPICS', 'SOC_2020_EXT', 'SOC_2020',
       'SOC_2010', 'SOC_names', 'SIC', 'SIC_name', 'SIC_confidence',
       'SIC_method', 'company_description', 'INDUSTRY TOTAL GHG EMISSIONS',
       'INDUSTRY GHG PER UNIT EMISSIONS', 'INDUSTRY PROP HOURS GREEN TASKS',
       'INDUSTRY PROP WORKERS GREEN TASKS',
       'INDUSTRY PROP WORKERS 20PERC GREEN TASKS',
       'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
       'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE', 'NUM_GREEN_ENTS',
       'SOC_2020_name', 'SOC_2020_EXT_name'],
      dtype='object')


Unnamed: 0,job_id,NUM_ORIG_ENTS,NUM_SPLIT_ENTS,ENTS,GREEN_ENTS,PROP_GREEN,BENEFITS,GREEN CATEGORY,GREEN/NOT GREEN,GREEN TIMESHARE,...,INDUSTRY TOTAL GHG EMISSIONS,INDUSTRY GHG PER UNIT EMISSIONS,INDUSTRY PROP HOURS GREEN TASKS,INDUSTRY PROP WORKERS GREEN TASKS,INDUSTRY PROP WORKERS 20PERC GREEN TASKS,INDUSTRY GHG EMISSIONS PER EMPLOYEE,INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE,NUM_GREEN_ENTS,SOC_2020_name,SOC_2020_EXT_name
0,41547517,22,23,"[[[passionate about], SKILL], [[maintaining st...",[],0.0,,,,,...,59.5,0.0,8.3,58.6,18.9,0.2,242.7,0,,
1,41547521,3,3,"[[[Porta Cabins on a construction site], SKILL...",[],0.0,,Non-Green,Non-green,0.0,...,,,,,,,,0,Vehicle valeters and cleaners,Vehicle valeters and cleaners


## 1. Generate violin plots

In [10]:
# List the available columns as a reference for the aggregation in the next cell.
all_green_measures_df.columns

Index(['job_id', 'NUM_ORIG_ENTS', 'NUM_SPLIT_ENTS', 'ENTS', 'GREEN_ENTS',
       'PROP_GREEN', 'BENEFITS', 'GREEN CATEGORY', 'GREEN/NOT GREEN',
       'GREEN TIMESHARE', 'GREEN TOPICS', 'SOC_2020_EXT', 'SOC_2020',
       'SOC_2010', 'SOC_names', 'SIC', 'SIC_name', 'SIC_confidence',
       'SIC_method', 'company_description', 'INDUSTRY TOTAL GHG EMISSIONS',
       'INDUSTRY GHG PER UNIT EMISSIONS', 'INDUSTRY PROP HOURS GREEN TASKS',
       'INDUSTRY PROP WORKERS GREEN TASKS',
       'INDUSTRY PROP WORKERS 20PERC GREEN TASKS',
       'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
       'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE', 'NUM_GREEN_ENTS',
       'SOC_2020_name', 'SOC_2020_EXT_name'],
      dtype='object')

In [261]:
# Keep only rows whose SIC is not the empty string.
# NOTE(review): rows with a missing (NaN) SIC also pass this comparison - confirm that is intended.
all_green_measures_df_occ = (
    all_green_measures_df
    .loc[lambda jobs: jobs['SIC'] != ""]
    .reset_index(drop=True)
)

# One row per SOC 2020 occupation name: the number of job adverts ('job_id' becomes a
# count) and the mean of each green measure. Occupations with 50 or fewer adverts are dropped.
measures_by_occ_df = (
    all_green_measures_df_occ
    .groupby('SOC_2020_name')
    .agg({
        'job_id': 'count',
        **dict.fromkeys(
            [
                'PROP_GREEN',
                'NUM_GREEN_ENTS',
                'GREEN TIMESHARE',
                'GREEN TOPICS',
                'INDUSTRY TOTAL GHG EMISSIONS',
                'INDUSTRY GHG PER UNIT EMISSIONS',
                'INDUSTRY PROP HOURS GREEN TASKS',
                'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
                'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE',
            ],
            'mean',
        ),
    })
    .reset_index()
    .loc[lambda occs: occs['job_id'] > 50]
    .reset_index(drop=True)
)

In [344]:
def make_violin(data, column, axis_title, colour, width, height):
    """Build a violin plot of one column.

    The violin is a kernel density estimate of ``column`` drawn as a horizontal
    area and stacked around the centre so that it is mirrored left/right.

    Args:
        data: DataFrame holding the values to plot.
        column: Name of the column whose distribution is drawn.
        axis_title: Title shown on the y axis.
        colour: Fill colour of the violin.
        width: Chart width.
        height: Chart height.

    Returns:
        The Altair chart for the violin.
    """
    return (
        alt.Chart(data)
        # Replace the raw values with a density estimate over `column`.
        .transform_density(column, as_=[column, 'density'])
        .mark_area(orient='horizontal', color=colour)
        .encode(
            # Centre-stacking the density mirrors the area into a violin shape;
            # the density axis itself carries no labels or grid.
            alt.X('density:Q')
            .stack('center')
            .impute(None)
            .title(None)
            .axis(labels=False, values=[0], grid=False, ticks=True),
            alt.Y(f'{column}:Q', title=axis_title),
        )
        .properties(width=width, height=height)
    )


# All four violins share the same panel size. The former `width=100` passed to
# alt.Chart was always overridden by .properties(), so it has been dropped.
violin_size = dict(width=chart_width, height=chart_height)

skills_vio = make_violin(
    measures_by_occ_df, 'PROP_GREEN',
    "Average proportion of green skills", green, **violin_size)

ind_vio = make_violin(
    measures_by_occ_df, 'INDUSTRY GHG PER UNIT EMISSIONS',
    "Average GHG emissions per unit of economic activity", yellow, **violin_size)

occ1_vio = make_violin(
    measures_by_occ_df, 'GREEN TIMESHARE',
    "Fraction of time spent doing green tasks", purple, **violin_size)

occ2_vio = make_violin(
    measures_by_occ_df, 'GREEN TOPICS',
    "Number of O*NET green topics", purple, **violin_size)

In [346]:
# Dropdown listing every occupation in the aggregated table; choosing one selects
# the rows whose SOC_2020_name matches.
# alt.selection_point replaces alt.selection_single, which Altair 5 deprecates.
occupation_dropdown = alt.selection_point(
    fields=['SOC_2020_name'],
    name='OccupationSelect',
    bind=alt.binding_select(
        options=measures_by_occ_df['SOC_2020_name'].unique().tolist()
    ),
)


def make_occupation_dot(data, column, selection, x_position):
    """Build the red marker showing where the selected occupation sits on a violin.

    Args:
        data: DataFrame with one row per occupation.
        column: Measure plotted on the y axis (the same column as the violin it overlays).
        selection: Altair selection used to filter ``data`` to the chosen occupation.
        x_position: Fixed x position of the marker, passed to ``alt.value``.

    Returns:
        The Altair point chart, filtered by ``selection``.
    """
    # NOTE(review): before an occupation is picked the selection is empty, and the
    # filter presumably lets every row through (one dot per occupation) - confirm.
    return (
        alt.Chart(data)
        .mark_point(color="red", filled=True, size=100, opacity=0.6)
        .encode(x=alt.value(x_position), y=f'{column}:Q')
        .transform_filter(selection)
    )


# One dot layer per violin, all driven by the same dropdown.
skills_dots = make_occupation_dot(
    measures_by_occ_df, 'PROP_GREEN', occupation_dropdown, x_value)
ind_dots = make_occupation_dot(
    measures_by_occ_df, 'INDUSTRY GHG PER UNIT EMISSIONS', occupation_dropdown, x_value)
occ1_dots = make_occupation_dot(
    measures_by_occ_df, 'GREEN TIMESHARE', occupation_dropdown, x_value)
occ2_dots = make_occupation_dot(
    measures_by_occ_df, 'GREEN TOPICS', occupation_dropdown, x_value)


In [347]:
# Overlay each violin with its occupation dot, then place the four panels side by side.
vio_chart_with_dots = (
    (occ1_vio + occ1_dots)
    | (occ2_vio + occ2_dots)
    | (skills_vio + skills_dots)
    | (ind_vio + ind_dots)
)

# Register the dropdown once on the top-level chart; every dot layer filters on it.
# add_params replaces add_selection, which Altair 5 deprecates.
final_chart = alt.vconcat(vio_chart_with_dots).add_params(
    occupation_dropdown
)

# Pass the chart through the project's plotting helper.
# NOTE(review): an earlier comment here promised a chart title driven by the
# dropdown, but nothing in this cell sets one.
vio_chart_with_dots_config = pt.configure_plots(final_chart)

In [348]:
# Display the configured interactive chart inline.
vio_chart_with_dots_config

In [349]:
# Write the interactive chart to an HTML file inside today's figure folder.
html_path = f"{graph_dir}/vio_chart_with_dots.html"
vio_chart_with_dots_config.save(html_path)