## Create 2x2 typologies

Create occupational- and industry- 2x2 typologies for the UK labour market.

In [1]:
import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg

from dap_prinz_green_jobs import PROJECT_DIR, analysis_config
from dap_prinz_green_jobs.utils.plotting import configure_plots, GREEN_MEASURES_COLORS, NESTA_COLOURS_DICT
from datetime import datetime
import os
import pandas as pd

import altair as alt
import ast
from textwrap import wrap

In [2]:
# save graphs
today = datetime.today().strftime("%y%m%d")
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

Creating /Users/elizabethgallagher/Code/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/240301 directory


In [3]:
#alt disable max rows

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## 0. Load data

In [4]:
gje_occ_agg = pd.read_csv(f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{occ_date}_extra_gjeformat.csv')
len(gje_occ_agg)

NameError: name 'occ_date' is not defined

In [5]:
occ_date = analysis_config['analysis_files']['agg_soc_date_stamp']
print(occ_date)
occ_agg = pd.read_csv(f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/occupation_aggregated_data_{occ_date}.csv')
occ_agg = occ_agg[occ_agg['clean_soc_name']!='Betting shop managers']
occ_agg.reset_index(inplace=True)

occ_agg = occ_agg.query('num_job_ads > 100').reset_index(drop=True)

ind_date = analysis_config['analysis_files']['agg_sic_date_stamp']
print(ind_date)
ind_agg = pd.read_csv(f's3://prinz-green-jobs/outputs/data/ojo_application/extracted_green_measures/analysis/industry_aggregated_data_{ind_date}.csv')

ind_agg = ind_agg.query('num_job_ads > 100').reset_index(drop=True)
ind_agg['SIC_name'] = ind_agg['top_5_sics'].apply(lambda x: [i.get('sic_name', None) for i in ast.literal_eval(x)][0])

20240223
2024-03-01 11:42:23,997 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
20240226


In [6]:
# For plotting
occ_agg['average_perc_green_skills'] = occ_agg['average_prop_green_skills']*100
occ_agg['average_prop_occ_green_timeshare'] = occ_agg['average_occ_green_timeshare']/100

ind_agg['average_perc_green_skills'] = ind_agg['average_prop_green_skills']*100
ind_agg['average_prop_occ_green_timeshare'] = ind_agg['average_occ_green_timeshare']/100

In [7]:
occ_agg["top_3_sics_names"] = occ_agg["top_5_sics"
].apply(lambda x: ", ".join(['"' + s["sic_name"]+ '"' + f"({round(s['prop_job_ads']*100,1)}%)" for s in ast.literal_eval(x)[0:3]]))
occ_agg["top_3_green_skills_names"] = occ_agg[
    "top_5_green_skills"
].apply(lambda x: ", ".join(['"' + s["skill_name"]+ '"' + f"({round(s['prop_job_ads']*100,1)}%)" for s in ast.literal_eval(x)[0:3]]))

In [8]:
ind_agg["top_3_green_skills_names"] = ind_agg[
    "top_5_green_skills"
].apply(lambda x: ", ".join(['"' + s["skill_name"]+ '"' + f"({round(s['prop_job_ads']*100,1)}%)" for s in ast.literal_eval(x)[0:3]]))
ind_agg["top_3_socs_names"] = ind_agg[
    "top_5_socs"
].apply(lambda x: ", ".join(['"' + pg.clean_soc_name(s["soc_name"])+ '"' + f"({round(s['prop_job_ads']*100,1)}%)" for s in ast.literal_eval(x)[0:3]]))

In [13]:
## Render the text used in tooltips to be over multiple lines so they dont cause the tooltip to be too wide
my_str = "@jakevdp Any updates? \n Even if passing a list of strings in place of a single string, the elements of the list DONOT be rendered as separate lines in tooltip."

df = pd.DataFrame(data={"exam-ple": [my_str]})

alt.Chart(df).mark_rect().encode(
    tooltip=[alt.Tooltip("exam-ple")]
)

## 1. Generate 2x2 typology graphs

### 1.1 industry 2x2 typology

In [8]:
av_color_ind = ind_agg['average_ind_perunit_ghg'].mean()
av_color_ind

0.4441553133514986

In [35]:
#create a scatter plot
custom_color_scheme = [
    GREEN_MEASURES_COLORS[0],
    GREEN_MEASURES_COLORS[1],
    GREEN_MEASURES_COLORS[2]
]
# Assuming measures_by_ind_df is your DataFrame
chart = alt.Chart(ind_agg).mark_circle(size=100).encode(
    x=alt.X('average_occ_green_timeshare', axis=alt.Axis(title=['Average percentage of time spent', 'on green tasks by industry (%)'])).scale(type="log"),
    y=alt.Y('average_perc_green_skills', axis=alt.Axis(title=['Average percentage of', 'green skills by industry (%)'])),
    #add condition  
    color=alt.Color('average_ind_perunit_ghg',
                    scale=alt.Scale(
                        domain=[
                        ind_agg['average_ind_perunit_ghg'].min(),
                        ind_agg['average_ind_perunit_ghg'].quantile(q=0.33),
                        ind_agg['average_ind_perunit_ghg'].quantile(q=0.66),
                        ind_agg['average_ind_perunit_ghg'].max()], 
                    range=["green", NESTA_COLOURS_DICT['yellow'], 'saddlebrown', 'black'], 
                    # domain=[ind_agg['average_ind_perunit_ghg'].min(),av_color_ind,ind_agg['average_ind_perunit_ghg'].max()], 
                    # range=['green', NESTA_COLOURS_DICT['yellow'], 'black'], 
                    ),
                    legend=alt.Legend(title=wrap('GHG emissions per unit of economic output', 20))),
    tooltip=[
                alt.Tooltip("SIC_name", title="Industry"),
                alt.Tooltip("num_job_ads", title="Number of job adverts", format='.4'),
        alt.Tooltip("average_prop_green_skills", title="Average percentage of green skills", format='.2%'),
        alt.Tooltip("average_prop_occ_green_timeshare", title="Average percentage of time spent on green tasks", format='.2%'),
        alt.Tooltip("average_ind_perunit_ghg", title="GHG emissions per unit of economic output", format='.3'),
        alt.Tooltip("top_3_green_skills_names", title="Top 3 skills"),
        alt.Tooltip("top_3_socs_names", title="Top 3 occupations")
    ]
).properties(width=600, height=400)

mean_line = alt.Chart(pd.DataFrame({'mean_value': [ind_agg['average_occ_green_timeshare'].mean()]})).mark_rule(strokeDash=[5, 5], color='black').encode(
    x='mean_value:Q',
    size=alt.value(1)  # Adjust line thickness as needed
)

zero_line = alt.Chart(pd.DataFrame({'zero_line': [ind_agg['average_perc_green_skills'].mean()]})).mark_rule(strokeDash=[5, 5], color='black').encode(
    y='zero_line:Q',
    size=alt.value(1)  # Adjust line thickness as needed
)

ind_chart = chart + mean_line + zero_line 

ind_graph = configure_plots(ind_chart, 
                chart_title='Green measure values for each industry',
                fontsize_normal=16,
                fontsize_title=18,
                           ).properties(width='container').interactive()

ind_graph.save(f'{graph_dir}/2x2_ind_graph.html')

ind_graph

### 1.2 occupation 2x2 typology

In [10]:
av_color = occ_agg['average_occ_green_timeshare'].mean()

In [12]:
base = alt.Chart(occ_agg).mark_circle(size=100, opacity=0.8).encode(
    x=alt.X('average_ind_perunit_ghg',
            axis=alt.Axis(title=["Average GHG emissions", "per unit of economic output"])).scale(type="log"),
    y=alt.Y('average_perc_green_skills', axis=alt.Axis(title='Average percentage of green skills (%)')),
    color=alt.Color('average_occ_green_timeshare',
                    scale=alt.Scale(
                    domain=[
                        occ_agg['average_occ_green_timeshare'].min(),
                        occ_agg['average_occ_green_timeshare'].quantile(q=0.33),
                        occ_agg['average_occ_green_timeshare'].quantile(q=0.66),
                        occ_agg['average_occ_green_timeshare'].max()], 
                    range=['black', 'saddlebrown', NESTA_COLOURS_DICT['yellow'], "green"], 
                    ),
                    # scale=alt.Scale(scheme='goldgreen',reverse=False, domainMid=av_color),
                    legend=alt.Legend(title=wrap('Time spent on green tasks (%)',20))),
    tooltip=[
                alt.Tooltip("SOC_2020_EXT_name", title="Occupation"),
                alt.Tooltip("num_job_ads", title="Number of job adverts", format='.4'),
        alt.Tooltip("average_prop_green_skills", title="Average percentage of green skills", format='.2%'),
        alt.Tooltip("average_prop_occ_green_timeshare", title="Percentage of time spent on green tasks", format='.2%'),
        alt.Tooltip("average_ind_perunit_ghg", title="Average GHG emissions per unit of economic output", format='.3'),
        alt.Tooltip("top_3_green_skills_names", title="Top 3 skills"),
        alt.Tooltip("top_3_sics_names", title="Top 3 industries")
    ]
).properties(width=600, height=400)

mean_line = alt.Chart(occ_agg).mark_rule(
    strokeDash=[5, 5], color='black').encode(
    x=alt.X('mean(average_ind_perunit_ghg):Q', axis=alt.Axis(title='')),
    size=alt.value(1)
)

zero_line = alt.Chart(
    pd.DataFrame({'zero_line': [occ_agg['average_perc_green_skills'].mean()]})).mark_rule(strokeDash=[5, 5], color='black').encode(
    y='zero_line:Q',
    size=alt.value(1)  # Adjust line thickness as needed
)

occ_chart = base + mean_line + zero_line

occ_graph = configure_plots(occ_chart, 
                chart_title='Green measure values for each occupation',
                fontsize_normal=16,
                fontsize_title=18,
                           ).properties(width='container').interactive()

occ_graph.save(f'{graph_dir}/2x2_occ_graph.html')

occ_graph

In [19]:
ind_agg[ind_agg['SIC_name'].str.contains('extile')]['SIC_name'].tolist()

['Manufacture of machinery for textile, apparel and leather production',
 'Retail sale via stalls and markets of textiles, clothing and footwear',
 'Manufacture of textiles',
 'Wholesale of machinery for the textile industry and of sewing and knitting machines',
 'Manufacture of other textiles',
 'Manufacture of made-up textile articles, except apparel',
 'Agents involved in the sale of textiles, clothing, fur, footwear and leather goods',
 'Manufacture of other technical and industrial textiles']

In [23]:
occ_agg[occ_agg['SOC_2020_EXT_name'].str.contains('extile')]['SOC_2020_EXT_name'].tolist()

['Textile process operatives n.e.c.',
 'Textiles, garments and related trades n.e.c.',
 'Textile machine technicians']

In [26]:
ind_agg[ind_agg['SIC_name'].str.contains('extile')]['top_3_green_skills_names'].tolist()

['"health and safety regulations"(47.5%), "lean manufacturing"(14.2%), "implement environmental protection measures"(12.5%)',
 '"health and safety regulations"(32.3%), "energy efficiency"(15.6%), "ensure efficient utilisation of warehouse space"(7.2%)',
 '"health and safety regulations"(48.3%), "lean manufacturing"(6.8%), "develop management plans"(5.9%)',
 '"health and safety regulations"(44.4%), "challenging issues in the textile industry"(11.1%), "manage environmental management system"(7.4%)',
 '"health and safety regulations"(53.8%), "lean manufacturing"(5.1%), "implement environmental protection measures"(5.1%)',
 '"lean manufacturing"(20.6%), "health and safety regulations"(17.6%), "environmental engineering"(11.8%)',
 '"health and safety regulations"(33.3%), "ensure efficient utilisation of warehouse space"(16.7%), "follow procedures to control substances hazardous to health"(11.1%)',
 '"health and safety regulations"(60.5%), "lean manufacturing"(8.6%), "implement environmental

In [27]:
occ_agg[occ_agg['SOC_2020_EXT_name'].str.contains('extile')]['top_3_green_skills_names'].tolist()

['"health and safety regulations"(49.9%), "implement environmental protection measures"(4.6%), "safety engineering"(3.7%)',
 '"health and safety regulations"(23.8%), "monitor nature conservation"(19.0%), "sustainable footwear materials and components"(14.3%)',
 '"health and safety regulations"(47.8%), "challenging issues in the textile industry"(13.0%), "safety engineering"(8.7%)']