## Create 2x2 typologies

Create occupation-level and industry-level 2x2 typologies for the UK labour market.

In [339]:
import ast
import os
import re
from datetime import datetime

import altair as alt
import numpy as np
import pandas as pd

import dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures as pg
from dap_prinz_green_jobs import BUCKET_NAME, logger, PROJECT_DIR
from dap_prinz_green_jobs.getters.data_getters import load_s3_data, save_to_s3
from dap_prinz_green_jobs.getters.ojo_getters import (
    get_large_ojo_location_sample,
    get_large_ojo_salaries_sample,
)
from dap_prinz_green_jobs.utils.plotting import configure_plots


In [340]:
# Figures are written to a per-day folder (date stamp formatted as yymmdd).
today = datetime.today().strftime("%y%m%d")
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

# Create the folder on first use and report which case applied.
if os.path.exists(graph_dir):
    print(f"{graph_dir} directory already exists")
else:
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)

/Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/231212 directory already exists


In [341]:
# Turn off Altair's row limit so that charts can be built from data frames
# of any length without raising a max-rows error.

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## 0. Load data

Load the green measures and clean them up.

In [342]:
# The loader returns four objects: the skill-, occupation- and industry-level
# measures, plus a SOC name lookup (indexed below by the 'soc_2020_4' key).
(
    skill_measures_df,
    occs_measures_df,
    inds_measures_df,
    soc_name_dict,
) = pg.load_ojo_green_measures()

In [343]:
# Clean up SOC codes in occs_measures_df: the 'SOC' column holds string
# representations of dicts, which are parsed and expanded into columns.


def _parse_soc(raw_soc):
    """Parse one raw 'SOC' value.

    Strings are evaluated as Python literals after bare `nan` tokens are
    turned into `None` (`ast.literal_eval` cannot evaluate `nan`). Only the
    whole word `nan` is replaced, so text that merely contains those letters
    (e.g. "Finance") is left intact. Non-string values are returned unchanged.
    """
    if not isinstance(raw_soc, str):
        return raw_soc
    soc_clean = re.sub(r"\bnan\b", "None", raw_soc)
    return ast.literal_eval(soc_clean)


socs = occs_measures_df['SOC'].tolist()
socs_dict = [_parse_soc(soc) for soc in socs]

# One column per dict key, appended side by side to the occupation measures.
soc_df = pd.json_normalize(socs_dict)
occs_measures_df = pd.concat([occs_measures_df, soc_df], axis=1)

In [344]:
# Keep a copy of the job advert identifier under `id` on each input frame.
skill_measures_df['id'] = skill_measures_df['job_id']
occs_measures_df['id'] = occs_measures_df['job_id']
inds_measures_df['id'] = inds_measures_df['job_id']

# Row count and number of distinct ids for each input frame.
print(len(skill_measures_df))
print(skill_measures_df["id"].nunique())
print(len(occs_measures_df))
print(occs_measures_df["id"].nunique())
print(len(inds_measures_df))
print(inds_measures_df["id"].nunique())

# Outer-join the three sets of measures on the job advert id.
all_green_measures_df = (
    skill_measures_df
    .merge(occs_measures_df, how="outer", on="job_id")
    .merge(inds_measures_df, how="outer", on="job_id")
    # Missing values become empty strings (not zeros).
    .fillna("")
    # `SOC` is the unparsed source column; the `id*` columns are the helper
    # copies of `job_id` created above and are redundant after the merge.
    .drop(columns=["SOC", "id_x", "id_y", "id"])
    .rename(columns={"name": "SOC_names"})
)

# Look up the occupation name for each SOC 2020 code; codes absent from the
# lookup (including the "" placeholder) map to NaN.
all_green_measures_df["SOC_2020_name"] = all_green_measures_df["SOC_2020"].map(
    soc_name_dict['soc_2020_4']
)

# Keep only the first occurrence of any repeated column name.
all_green_measures_df = all_green_measures_df.loc[:, ~all_green_measures_df.columns.duplicated()]

print(len(all_green_measures_df))
print(all_green_measures_df["job_id"].nunique())
print(all_green_measures_df.columns)
all_green_measures_df.head(2)

1000000
1000000
1000000
1000000
1000000
1000000
1000000
1000000
Index(['job_id', 'NUM_ORIG_ENTS', 'NUM_SPLIT_ENTS', 'ENTS', 'GREEN_ENTS',
       'PROP_GREEN', 'BENEFITS', 'GREEN CATEGORY', 'GREEN/NOT GREEN',
       'GREEN TIMESHARE', 'GREEN TOPICS', 'SOC_2020_EXT', 'SOC_2020',
       'SOC_2010', 'SOC_names', 'SIC', 'SIC_name', 'SIC_confidence',
       'SIC_method', 'company_description', 'INDUSTRY TOTAL GHG EMISSIONS',
       'INDUSTRY GHG PER UNIT EMISSIONS', 'INDUSTRY PROP HOURS GREEN TASKS',
       'INDUSTRY PROP WORKERS GREEN TASKS',
       'INDUSTRY PROP WORKERS 20PERC GREEN TASKS',
       'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
       'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE', 'SOC_2020_name'],
      dtype='object')


Unnamed: 0,job_id,NUM_ORIG_ENTS,NUM_SPLIT_ENTS,ENTS,GREEN_ENTS,PROP_GREEN,BENEFITS,GREEN CATEGORY,GREEN/NOT GREEN,GREEN TIMESHARE,...,SIC_method,company_description,INDUSTRY TOTAL GHG EMISSIONS,INDUSTRY GHG PER UNIT EMISSIONS,INDUSTRY PROP HOURS GREEN TASKS,INDUSTRY PROP WORKERS GREEN TASKS,INDUSTRY PROP WORKERS 20PERC GREEN TASKS,INDUSTRY GHG EMISSIONS PER EMPLOYEE,INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE,SOC_2020_name
0,41547517,22,23,"[[['passionate about'], 'SKILL'], [['maintaini...",[],0.0,,,,,...,closest distance,Zachary Daniels specialises in Retail and Fina...,59.5,0.0,8.3,58.6,18.9,0.2,242.7,
1,41547521,3,3,"[[['Porta Cabins on a construction site'], 'SK...",[],0.0,,Non-Green,Non-green,0.0,...,,,,,,,,,,Vehicle valeters and cleaners


In [345]:
# Drop rows whose per-unit emissions value is the ":" placeholder
# (presumably marking an unavailable figure - TODO confirm against the source data).
has_unit_emissions = all_green_measures_df['INDUSTRY GHG PER UNIT EMISSIONS'] != ":"
all_green_measures_df = all_green_measures_df[has_unit_emissions].reset_index(drop=True)

# Parse the entity columns with the project's safe literal parser.
for ents_column in ['GREEN_ENTS', 'ENTS']:
    all_green_measures_df[ents_column] = all_green_measures_df[ents_column].apply(pg.safe_literal_eval)


def _list_length(value):
    """Return the length of `value` if it is a list, otherwise 0."""
    if isinstance(value, list):
        return len(value)
    return 0


# Number of green entities and of all entities per job advert.
all_green_measures_df['NUM_GREEN_ENTS'] = all_green_measures_df['GREEN_ENTS'].apply(_list_length)
all_green_measures_df['NUM_ENTS'] = all_green_measures_df['ENTS'].apply(_list_length)

In [346]:
# Measures that must be numeric for the aggregations below. Empty strings
# become NaN; every other value is passed through float().
numeric_measure_columns = [
    'INDUSTRY TOTAL GHG EMISSIONS',
    'INDUSTRY GHG PER UNIT EMISSIONS',
    'GREEN TIMESHARE',
    'INDUSTRY PROP HOURS GREEN TASKS',
    'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
    'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE',
]

for measure_column in numeric_measure_columns:
    all_green_measures_df[measure_column] = all_green_measures_df[measure_column].apply(
        lambda raw_value: np.nan if raw_value == '' else float(raw_value)
    )

## 1. Generate 2x2 typology graphs

### 1.1 industry 2x2 typology

In [None]:
# Industry 2x2 typology: average the green measures per industry (SIC name),
# using only job adverts that have a SIC value.
has_sic = all_green_measures_df['SIC'] != ""
all_green_measures_df_ind = all_green_measures_df[has_sic].reset_index(drop=True)

# Count adverts per industry and take the mean of every measure.
industry_mean_columns = [
    'PROP_GREEN',
    'NUM_GREEN_ENTS',
    'GREEN TIMESHARE',
    'INDUSTRY TOTAL GHG EMISSIONS',
    'INDUSTRY GHG PER UNIT EMISSIONS',
    'INDUSTRY PROP HOURS GREEN TASKS',
    'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
    'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE',
]
industry_aggregations = {'job_id': 'count', **{column: 'mean' for column in industry_mean_columns}}

# Industries with fewer adverts than this are left out.
min_job_ads = 50

measures_by_ind_df = (
    all_green_measures_df_ind
    .groupby('SIC_name')
    .agg(industry_aggregations)
    .reset_index()
    .rename(columns={'job_id': 'number of job ads'})
    .loc[lambda frame: frame['number of job ads'] >= min_job_ads]
    .reset_index(drop=True)
)

In [None]:
# Categorise industries by their mean total GHG emissions:
#   green   - at or below the mean
#   neutral - above the mean, up to two standard deviations above it
#   brown   - more than two standard deviations above the mean
total_ghg_emissions = measures_by_ind_df['INDUSTRY TOTAL GHG EMISSIONS']

mean_value = total_ghg_emissions.mean()
std_dev = total_ghg_emissions.std()

infinity = float('inf')
bins = [-infinity, mean_value, mean_value + (2 * std_dev), infinity]
labels = ['green', 'neutral', 'brown']

measures_by_ind_df['ghg_emissions_cat'] = pd.cut(
    total_ghg_emissions,
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Number of industries in each category.
measures_by_ind_df['ghg_emissions_cat'].value_counts()

green      403
neutral    128
brown       14
Name: ghg_emissions_cat, dtype: int64

In [None]:
# Scatter plot of industries: mean occupational green timeshare (x) against
# mean proportion of green skills (y), sized and coloured by GHG emissions.

# Colours and the emissions categories they belong to, in matching order:
# green, neutral (tan), brown.
custom_color_scheme = ['#1a9641', '#d8b365', '#8c510a']
ghg_category_order = ['green', 'neutral', 'brown']

# Industries without an emissions figure cannot be sized or categorised.
measures_by_ind_df = measures_by_ind_df.dropna(subset=['INDUSTRY TOTAL GHG EMISSIONS'])

chart = alt.Chart(measures_by_ind_df).mark_circle(size=60).encode(
    x=alt.X('GREEN TIMESHARE', scale=alt.Scale(domain=[0, 30]), axis=alt.Axis(title='mean occupational green timeshare')),
    y=alt.Y('PROP_GREEN', axis=alt.Axis(title='mean proportion of green skills')),
    size=alt.Size('INDUSTRY TOTAL GHG EMISSIONS:Q', legend=alt.Legend(title='GHG emissions')),
    # The domain is stated explicitly so that each category is tied to its
    # colour rather than depending on how the categories happen to be sorted
    # (the legend is hidden, so a wrong mapping would go unnoticed).
    color=alt.Color(
        'ghg_emissions_cat',
        scale=alt.Scale(domain=ghg_category_order, range=custom_color_scheme),
        legend=None,
    ),
    tooltip=['SIC_name', 'number of job ads', 'PROP_GREEN', 'GREEN TIMESHARE', 'INDUSTRY TOTAL GHG EMISSIONS']
).properties(width=600, height=400)

# Dashed vertical rule at the mean green timeshare across industries.
mean_line = alt.Chart(pd.DataFrame({'mean_value': [measures_by_ind_df['GREEN TIMESHARE'].mean()]})).mark_rule(strokeDash=[5, 5], color='black').encode(
    x='mean_value:Q',
    size=alt.value(1)  # line thickness
)

# Dashed horizontal rule at the mean proportion of green skills across
# industries (despite its name, this line is not drawn at zero).
zero_line = alt.Chart(pd.DataFrame({'zero_line': [measures_by_ind_df['PROP_GREEN'].mean()]})).mark_rule(strokeDash=[5, 5], color='black').encode(
    y='zero_line:Q',
    size=alt.value(1)  # line thickness
)

# The two rules split the scatter into the four quadrants of the typology.
ind_chart = chart + mean_line + zero_line

ind_graph = configure_plots(ind_chart,
                chart_title='Industry greenness',
                chart_subtitle='Mean occupational green timeshare and average proportion of green skills by industry.')

ind_graph.save(f'{graph_dir}/2x2_ind_graph.html')
ind_graph

### 1.2 occupation 2x2 typology

In [None]:
# Occupation 2x2 typology: average the green measures per SOC 2020 occupation,
# using only job adverts that were matched to an occupation name.
has_occupation_name = all_green_measures_df['SOC_2020_name'].notna()
all_green_measures_df_soc = all_green_measures_df[has_occupation_name]

# Count adverts per occupation and take the mean of every measure.
occupation_mean_columns = [
    'PROP_GREEN',
    'GREEN TIMESHARE',
    'INDUSTRY TOTAL GHG EMISSIONS',
    'INDUSTRY GHG PER UNIT EMISSIONS',
    'INDUSTRY PROP HOURS GREEN TASKS',
    'INDUSTRY GHG EMISSIONS PER EMPLOYEE',
    'INDUSTRY CARBON DIOXIDE EMISSIONS PER EMPLOYEE',
]
occupation_aggregations = {'job_id': 'count', **{column: 'mean' for column in occupation_mean_columns}}

# Occupations with fewer adverts than this are left out.
min_job_ads = 50

measures_by_occ_df = (
    all_green_measures_df_soc
    .groupby('SOC_2020_name')
    .agg(occupation_aggregations)
    .reset_index()
    .rename(columns={'job_id': 'number of job ads'})
    .loc[lambda frame: frame['number of job ads'] >= min_job_ads]
    .reset_index(drop=True)
)

In [None]:
# Categorise occupations by their mean green timeshare:
#   brown   - up to one standard deviation above the mean
#   neutral - between one and two standard deviations above the mean
#   green   - more than two standard deviations above the mean
green_timeshare = measures_by_occ_df['GREEN TIMESHARE']

mean_value = green_timeshare.mean()
std_dev = green_timeshare.std()

infinity = float('inf')
bins = [-infinity, mean_value + std_dev, mean_value + (2 * std_dev), infinity]
labels = ['brown', 'neutral', 'green']

measures_by_occ_df['green_timeshare_cat'] = pd.cut(
    green_timeshare,
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Number of occupations in each category.
measures_by_occ_df['green_timeshare_cat'].value_counts()

brown      328
green       31
neutral     27
Name: green_timeshare_cat, dtype: int64

In [None]:
# Scatter plot of occupations: a user-selected industry measure (x) against
# the mean proportion of green skills (y), sized and coloured by green timeshare.

# Occupations without a timeshare category cannot be coloured.
measures_by_occ_df = measures_by_occ_df[~measures_by_occ_df['green_timeshare_cat'].isna()]

# Colours and the timeshare categories they belong to, in matching order:
# brown, neutral (tan), green.
custom_color_scheme = ['#8c510a', '#d8b365', '#1a9641']
timeshare_category_order = ['brown', 'neutral', 'green']

# Every aggregated industry measure is offered in the dropdown.
industry_columns = [i for i in measures_by_occ_df.columns if 'INDUSTRY' in i]

select_box = alt.binding_select(options=industry_columns, name='Select Industry Measure: ')
selection = alt.selection_point(fields=['column'], bind=select_box)

# Fold the industry measures into long form ('column' = measure name,
# 'value' = its value) and keep the rows matching the dropdown selection.
# NOTE(review): no initial value is set on the selection, so the view shown
# before a measure is picked should be checked.
base = alt.Chart(measures_by_occ_df).transform_fold(
    industry_columns,
    as_=['column', 'value']
).transform_filter(
    selection
).mark_circle().encode(
    x=alt.X('value:Q', axis=alt.Axis(title='')),
    y=alt.Y('PROP_GREEN', axis=alt.Axis(title='mean proportion of green skills')),
    size=alt.Size('GREEN TIMESHARE:Q', legend=alt.Legend(title='green time share')),
    # The domain is stated explicitly so that each category is tied to its
    # colour rather than depending on how the categories happen to be sorted.
    color=alt.Color(
        'green_timeshare_cat',
        scale=alt.Scale(domain=timeshare_category_order, range=custom_color_scheme),
        legend=alt.Legend(title=''),
    ),
    tooltip=['SOC_2020_name', 'number of job ads', 'PROP_GREEN', 'GREEN TIMESHARE', 'INDUSTRY TOTAL GHG EMISSIONS']
).add_params(
    # Register the dropdown parameter exactly once.
    selection
)

# Chart with dynamic column selection
chart = base.properties(width=600, height=400)

# Dashed vertical rule at the mean of the currently selected measure.
mean_line = alt.Chart(measures_by_occ_df).transform_fold(
    industry_columns,
    as_=['column', 'value']
).transform_filter(
    selection
).mark_rule(strokeDash=[5, 5], color='black').encode(
    x=alt.X('mean(value):Q', axis=alt.Axis(title='')),
    size=alt.value(1)
)

# Dashed horizontal rule at the mean proportion of green skills across
# occupations (despite its name, this line is not drawn at zero).
zero_line = alt.Chart(pd.DataFrame({'zero_line': [measures_by_occ_df['PROP_GREEN'].mean()]})).mark_rule(strokeDash=[5, 5], color='black').encode(
    y='zero_line:Q',
    size=alt.value(1)  # line thickness
)

# The two rules split the scatter into the four quadrants of the typology.
occ_chart = chart + mean_line + zero_line

occ_graph = configure_plots(occ_chart,
                chart_title='Occupational greenness',
                chart_subtitle='Select an industry measure and compare it against the occupational average proportion of green skills.').properties(width=600, height=400)

occ_graph.save(f'{graph_dir}/2x2_occ_graph.html')
occ_graph

