## Create 2x2 typologies

Create occupation-level and industry-level 2x2 typologies for the UK labour market.

In [9]:
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg

from dap_prinz_green_jobs import PROJECT_DIR, analysis_config
from dap_prinz_green_jobs.utils.plotting import configure_plots, GREEN_MEASURES_COLORS
from datetime import datetime
import os
import pandas as pd

import altair as alt
import ast

In [3]:
# Figures are saved under a date-stamped (yymmdd) folder inside the project outputs
today = datetime.today().strftime("%y%m%d")
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

# Create the folder only when it is missing, and report which case applied
if os.path.exists(graph_dir):
    print(f"{graph_dir} directory already exists")
else:
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)

Creating /Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/240122 directory


In [4]:
# Disable Altair's max-rows limit on the data passed to charts

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## 0. Load data

In [17]:
def _first_sic_name(top_sics_repr):
    """Literal-evaluate the cell value and return 'sic_name' of its first element.

    Each element is read with ``.get('sic_name', None)``, so a missing key
    yields None rather than an error.
    """
    sic_names = [sic.get('sic_name', None) for sic in ast.literal_eval(top_sics_repr)]
    return sic_names[0]


# Occupation aggregates: the date stamp in the file name comes from the analysis config
occ_date = analysis_config['analysis_files']['agg_soc_date_stamp']
print(occ_date)
occ_agg = (
    pd.read_csv(f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{occ_date}.csv')
    .query('num_job_ads > 100')  # keep only rows where num_job_ads exceeds 100
    .reset_index(drop=True)
)

# Industry aggregates: same filter, plus a readable name taken from the first entry of top_5_sics
ind_date = analysis_config['analysis_files']['agg_sic_date_stamp']
print(ind_date)
ind_agg = (
    pd.read_csv(f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/industry_aggregated_data_{ind_date}.csv')
    .query('num_job_ads > 100')
    .reset_index(drop=True)
)
ind_agg['SIC_name'] = ind_agg['top_5_sics'].apply(_first_sic_name)


20240112
20240112


## 1. Generate 2x2 typology graphs

### 1.1 Industry 2x2 typology

In [18]:
# Industry 2x2: green timeshare (x, log scale) against proportion of green skills (y),
# one circle per row of ind_agg.

# First three colours of the project palette (also used as a colour range further down the notebook)
custom_color_scheme = [GREEN_MEASURES_COLORS[idx] for idx in range(3)]

x_timeshare = alt.X(
    'average_occ_green_timeshare',
    axis=alt.Axis(title='mean occupational green timeshare'),
).scale(type="log")

y_green_skills = alt.Y(
    'average_prop_green_skills',
    axis=alt.Axis(title='mean proportion of green skills'),
)

# Brown where average_ind_perunit_ghg is above 1, green otherwise
ghg_color = alt.condition(
    alt.datum.average_ind_perunit_ghg > 1,
    alt.value('brown'),
    alt.value('green'),
)

ind_tooltip = [
    'SIC_name',
    'num_job_ads',
    'average_prop_green_skills',
    'average_occ_green_timeshare',
    'average_ind_perunit_ghg',
]

chart = (
    alt.Chart(ind_agg)
    .mark_circle(size=100)
    .encode(x=x_timeshare, y=y_green_skills, color=ghg_color, tooltip=ind_tooltip)
    .properties(width=600, height=400)
)

# Dashed rules at the means over ind_agg split the plot into four quadrants
ind_mean_timeshare = pd.DataFrame({'mean_value': [ind_agg['average_occ_green_timeshare'].mean()]})
mean_line = (
    alt.Chart(ind_mean_timeshare)
    .mark_rule(strokeDash=[5, 5], color='black')
    .encode(x='mean_value:Q', size=alt.value(1))  # size sets the rule thickness
)

# Despite its name, this rule sits at the mean proportion of green skills, not at zero
ind_mean_green_skills = pd.DataFrame({'zero_line': [ind_agg['average_prop_green_skills'].mean()]})
zero_line = (
    alt.Chart(ind_mean_green_skills)
    .mark_rule(strokeDash=[5, 5], color='black')
    .encode(y='zero_line:Q', size=alt.value(1))
)

ind_chart = chart + mean_line + zero_line

# Apply the project plot styling, write the chart to HTML, then display it
ind_graph = configure_plots(
    ind_chart,
    chart_title='Industry greenness',
    chart_subtitle='Mean occupational green timeshare and average proportion of green skills by industry.',
)

ind_graph.save(f'{graph_dir}/2x2_ind_graph.html')
ind_graph

### 1.2 Occupation 2x2 typology

In [20]:
# Occupation 2x2: a selectable industry measure (x, log scale) against the
# proportion of green skills (y), coloured by occupational green timeshare.

# Measures the reader can switch between on the x-axis
industry_columns = ['average_ind_perunit_ghg', 'average_ind_prop_hours', 'average_ind_prop_workers']

# Dropdown bound to a point selection on the folded 'column' field; starts on the first measure
select_box = alt.binding_select(options=industry_columns, name='Select Industry Measure: ')
selection = alt.selection_point(value=industry_columns[0], fields=['column'], bind=select_box)

# Fold the measure columns into long form ('column', 'value') and keep only the selected measure.
# The selection parameter is registered exactly once, via add_params: add_selection is
# deprecated in Altair 5, and registering the same parameter twice is redundant.
base = alt.Chart(occ_agg).transform_fold(
    industry_columns,
    as_=['column', 'value']
).transform_filter(
    selection
).mark_circle(size=100).encode(
    x=alt.X('value:Q', axis=alt.Axis(title='')).scale(type="log"),
    y=alt.Y('average_prop_green_skills', axis=alt.Axis(title='mean proportion of green skills')),
    color=alt.Color('average_occ_green_timeshare', scale=alt.Scale(range=custom_color_scheme), legend=alt.Legend(title='green timeshare')),
    tooltip=['SOC_2020_EXT_name', 'num_job_ads', 'average_prop_green_skills', 'average_occ_green_timeshare', 'average_ind_perunit_ghg']
).add_params(
    selection
)

# Chart with dynamic column selection
chart = base.properties(width=600, height=400)

# Vertical dashed rule at the mean of the currently selected measure; it reuses the same
# fold + filter so it follows the dropdown
mean_line = alt.Chart(occ_agg).transform_fold(
    industry_columns,
    as_=['column', 'value']
).transform_filter(
    selection
).mark_rule(strokeDash=[5, 5], color='black').encode(
    x=alt.X('mean(value):Q', axis=alt.Axis(title='')),
    size=alt.value(1)
)

# Horizontal dashed rule; despite its name it sits at the mean proportion of green skills, not at zero
zero_line = alt.Chart(pd.DataFrame({'zero_line': [occ_agg['average_prop_green_skills'].mean()]})).mark_rule(strokeDash=[5, 5], color='black').encode(
    y='zero_line:Q',
    size=alt.value(1)  # rule thickness
)

occ_chart = chart + mean_line + zero_line

# Apply the project plot styling, write the chart to HTML, then display it
occ_graph = configure_plots(occ_chart,
                chart_title='Occupational greenness',
                chart_subtitle='Select an industry measure and compare it against the occupational average proportion of green skills.').properties(width=600, height=400)

occ_graph.save(f'{graph_dir}/2x2_occ_graph.html')
occ_graph

