## Measures analysis

High level analysis of the measures

In [1]:
from dap_prinz_green_jobs.getters.ojo_getters import (
    get_mixed_ojo_job_title_sample
    
)

from dap_prinz_green_jobs.getters.occupation_getters import load_job_title_soc
from dap_prinz_green_jobs import BUCKET_NAME, logger, PROJECT_DIR
from dap_prinz_green_jobs.getters.data_getters import load_s3_data
import pandas as pd
import numpy as np

import altair as alt

import random
import os

In [2]:
#save graphs

graph_dir = str(PROJECT_DIR / "outputs/figures/between_measure_analysis/140923/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

Creating /Users/elizabethgallagher/Code/dap_prinz_green_jobs/outputs/figures/between_measure_analysis/140923 directory


In [4]:
date_stamp = "20230914"
production = "True"
config="base"

green_skills_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{date_stamp}/ojo_sample_skills_green_measures_production_{production}_{config}.json",
    )

date_stamp = "20230816"

green_occs_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{date_stamp}/ojo_sample_occupation_green_measures_production_{production}_{config}.json",
    )

green_inds_outputs = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/{date_stamp}/ojo_sample_industry_green_measures_production_{production}_{config}.json",
    )

In [5]:
skill_measures_df = pd.DataFrame.from_dict(green_skills_outputs, orient='index').reset_index().rename(columns={'index':'id'})
occs_measures_df = pd.DataFrame.from_dict(green_occs_outputs, orient='index').reset_index().rename(columns={'index':'id'})
inds_measures_df = pd.DataFrame.from_dict(green_inds_outputs, orient='index').reset_index().rename(columns={'index':'id'})


In [23]:
skill_measures_df["NUM_ENTS"] = skill_measures_df["NUM_SPLIT_ENTS"]

In [24]:
len(inds_measures_df)

39866

In [25]:
# Job metadata
ojo_job_title_raw = get_mixed_ojo_job_title_sample()
ojo_job_title_raw['id'] = ojo_job_title_raw['id'].astype(str) # Just to be consistant

In [26]:
len(ojo_job_title_raw)

79259

In [27]:
sd = load_job_title_soc()
soc_occ_dict = (sd.set_index("SOC 2020")
                ['SOC 2020 UNIT GROUP DESCRIPTIONS']
                .to_dict())

In [28]:
all_green_measures_df = pd.merge(skill_measures_df, occs_measures_df, how="outer", on="id")
all_green_measures_df = pd.merge(all_green_measures_df, inds_measures_df, how="outer", on="id")
all_green_measures_df= pd.merge(all_green_measures_df, ojo_job_title_raw, how="left", on="id")

all_green_measures_df['NUM_GREEN_ENTS'] = all_green_measures_df['GREEN_ENTS'].apply(lambda x: len(x) if x else 0)
# Separate out the SOC columns
for soc_columns in ['SOC_2020_EXT', 'SOC_2020', 'SOC_2010', 'name']:
    all_green_measures_df[soc_columns] = all_green_measures_df['SOC'].apply(lambda x: x[soc_columns] if x else None)
all_green_measures_df.drop(columns=['SOC'], inplace=True)

all_green_measures_df.rename(columns={"name":"SOC_names", "id": "job_id"}, inplace=True)
all_green_measures_df["SOC_2020_name"] = all_green_measures_df["SOC_2020"].map(soc_occ_dict)
    
all_green_measures_df = all_green_measures_df.replace({np.nan:None})


# weird thing in industry measures. 3 times
all_green_measures_df = all_green_measures_df[all_green_measures_df['INDUSTRY GHG PER UNIT EMISSIONS']!=":"]

all_green_measures_df.head(2)

Unnamed: 0,job_id,NUM_ORIG_ENTS,NUM_SPLIT_ENTS,ENTS,GREEN_ENTS,PROP_GREEN,BENEFITS,NUM_ENTS,GREEN CATEGORY,GREEN/NOT GREEN,...,sector,parent_sector,knowledge_domain,occupation,NUM_GREEN_ENTS,SOC_2020_EXT,SOC_2020,SOC_2010,SOC_names,SOC_2020_name
0,41549950,16,16,"[[[good work-life balance], SKILL], [[managing...","[[MSc in EIA, Environmental Management, [green...",0.0625,"[Option to purchase up, pension scheme, Group ...",16,Green New & Emerging,Green,...,Environmental Science,Scientific,Marketing And Media,Director Consultancy,1,2151/02,2151,2141,"[Biological scientists, Ecologists, Conservati...",Conservation professionals
1,41550510,9,10,"[[[Establish a productive relationship], SKILL...",[],0.0,,10,Non-Green,Non-green,...,Area Manager,Retail,Sales,Retail Manager,0,4143/99,4143,7220,[Database administrators and web content techn...,Customer service managers


In [123]:
# Set industry info to None if its a recruiter

all_green_measures_df.loc[all_green_measures_df["type"] == "Recruitment consultancy", [
    'SIC', 'SIC_name','INDUSTRY TOTAL GHG EMISSIONS', 'INDUSTRY GHG PER UNIT EMISSIONS',
       'INDUSTRY PROP HOURS GREEN TASKS', 'INDUSTRY PROP WORKERS GREEN TASKS',
       'INDUSTRY PROP WORKERS 20PERC GREEN TASKS']] = None

## High level information

In [124]:
coverage_df = pd.DataFrame([
    {"type": "SIC found", "value_type": "True", "num_job_advs":pd.notnull(all_green_measures_df["SIC"]).sum()},
    {"type": "SIC found", "value_type": "False", "num_job_advs":pd.isnull(all_green_measures_df["SIC"]).sum()},
    {"type": "SOC 2020 found", "value_type": "True", "num_job_advs":pd.notnull(all_green_measures_df["SOC_2020"]).sum()},
    {"type": "SOC 2020 found", "value_type": "False", "num_job_advs":pd.isnull(all_green_measures_df["SOC_2020"]).sum()},
    {"type": "Has at least 5 skills", "value_type": "True", "num_job_advs":(all_green_measures_df["NUM_ENTS"]>5).sum()},
    {"type": "Has at least 5 skills", "value_type": "False", "num_job_advs":(all_green_measures_df["NUM_ENTS"]<=5).sum()},
    {"type": "Has a company name", "value_type": "True", "num_job_advs":pd.notnull(all_green_measures_df["company_raw"]).sum()},
    {"type": "Has a company name", "value_type": "False", "num_job_advs":pd.isnull(all_green_measures_df["company_raw"]).sum()},
    {"type": "Has a job title", "value_type": "True", "num_job_advs":pd.notnull(all_green_measures_df["job_title_raw"]).sum()},
    {"type": "Has a job title", "value_type": "False", "num_job_advs":pd.isnull(all_green_measures_df["job_title_raw"]).sum()},
])

In [125]:
widths = 100
heights = 100

top_socs = all_green_measures_df["SOC_2020_name"].value_counts()[0:10].reset_index()

top_socs_chart = alt.Chart(top_socs, title="Most common SOCs").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="SOC_2020", sort=None),
    x=alt.X("SOC_2020_name", title="Number of job adverts"),
    tooltip=["index",'SOC_2020_name']
).properties(height=heights, width=widths)

top_job_title_raw_without_soc = all_green_measures_df[pd.isnull(all_green_measures_df["SOC_2020_name"])]["job_title_raw"].value_counts()[0:10].reset_index()

top_job_title_raw_without_soc_chart = alt.Chart(top_job_title_raw_without_soc, title="Most common job titles without SOC info").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="job_title_raw", sort=None),
    x=alt.X("job_title_raw", title="Number of job adverts"),
    tooltip=["index",'job_title_raw']
).properties(height=heights, width=widths)

top_job_title_raw_with_soc = all_green_measures_df[pd.notnull(all_green_measures_df["SOC_2020_name"])]["job_title_raw"].value_counts()[0:10].reset_index()

top_job_title_raw_with_soc_chart = alt.Chart(top_job_title_raw_with_soc, title="Most common job titles with SOC info").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="job_title_raw", sort=None),
    x=alt.X("job_title_raw", title="Number of job adverts"),
    tooltip=["index",'job_title_raw']
).properties(height=heights, width=widths)

top_job_title_raw = all_green_measures_df["job_title_raw"].value_counts()[0:10].reset_index()

top_job_title_raw_chart = alt.Chart(top_job_title_raw, title="Most common job titles").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="job_title_raw", sort=None),
    x=alt.X("job_title_raw", title="Number of job adverts"),
    tooltip=["index",'job_title_raw']
).properties(height=heights, width=widths)


top_sics = all_green_measures_df["SIC_name"].value_counts()[0:10].reset_index()

top_sics_chart = alt.Chart(top_sics, title="Most common SICs").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="SIC_name", sort=None),
    x=alt.X("SIC_name", title="Number of job adverts"),
    tooltip=["index",'SIC_name']
).properties(height=heights, width=widths)

top_company_raw = all_green_measures_df[pd.isnull(all_green_measures_df["SIC_name"])]["company_raw"].value_counts()[0:10].reset_index()

top_company_raw_chart = alt.Chart(top_company_raw, title="Most common company names without SIC info").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="company_raw", sort=None),
    x=alt.X("company_raw", title="Number of job adverts"),
    tooltip=["index",'company_raw']
).properties(height=heights, width=widths)


top_company_raw_withsic = all_green_measures_df[pd.notnull(all_green_measures_df["SIC_name"])]["company_raw"].value_counts()[0:10].reset_index()

top_company_raw_withsic_chart = alt.Chart(top_company_raw_withsic, title="Most common company names with SIC info").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="company_raw", sort=None),
    x=alt.X("company_raw", title="Number of job adverts"),
    tooltip=["index",'company_raw']
).properties(height=heights, width=widths)


coverage_full = alt.Chart(coverage_df, title=f"Coverage from dataset of {len(all_green_measures_df)} job adverts").mark_bar().encode(
    y=alt.X('type', title=""),
    x=alt.Y('num_job_advs', title="Number of job adverts", ),
    color=alt.Color('value_type',
                    scale=alt.Scale(domain=['True', 'False'], range= ['#97D9E3', '#646363']),
                    title="",
                   ),
    tooltip=["num_job_advs", "value_type"]
)

soc_sic_chart = (top_socs_chart | top_job_title_raw_without_soc_chart | top_job_title_raw_with_soc_chart) & (top_sics_chart | top_company_raw_chart | top_company_raw_withsic_chart) & coverage_full

soc_sic_chart.save(f'{graph_dir}/soc_sic_chart.html')
soc_sic_chart

## Occupation measures
Group by occupation.

Some 2020 SOCs have different measures, since their 2010 codes were different, and sometimes matched to different green scores

In [126]:
soc_column = "SOC_2020"
occ_columns = ['GREEN CATEGORY', 'GREEN/NOT GREEN', 'GREEN TIMESHARE', 'GREEN TOPICS']

soc2020_2_job_ad_count = all_green_measures_df.groupby(soc_column)['job_id'].count().to_dict()

occ_measures_per_occ = all_green_measures_df.drop_duplicates(
    subset=[soc_column] + occ_columns)[[soc_column, 'SOC_2010', 'SOC_2020_name'] + occ_columns]
occ_measures_per_occ = occ_measures_per_occ[pd.notnull(occ_measures_per_occ[soc_column])]

occ_measures_per_occ['num_job_advs'] = occ_measures_per_occ[soc_column].map(soc2020_2_job_ad_count)

In [127]:
occ_measures_per_occ[occ_measures_per_occ[soc_column]=='8160']

Unnamed: 0,SOC_2020,SOC_2010,SOC_2020_name,GREEN CATEGORY,GREEN/NOT GREEN,GREEN TIMESHARE,GREEN TOPICS,num_job_advs
7507,8160,8131,"Production, factory and assembly supervisors",Green Increased Demand,Green,11.1,6,17
19665,8160,8114,"Production, factory and assembly supervisors",Green Increased Demand,Green,3.3,9,17
33619,8160,8121,"Production, factory and assembly supervisors",Non-Green,Non-green,0.0,5,17
39713,8160,8133,"Production, factory and assembly supervisors",Green Enhanced Skills,Green,0.9,6,17


In [128]:
# 2 occupational greenness measures sized by number job adverts 
occ_scatter_greenness = alt.Chart(
    occ_measures_per_occ[((pd.notnull(occ_measures_per_occ['GREEN TIMESHARE'])) & (pd.notnull(occ_measures_per_occ['GREEN TOPICS'])))],
    title=f'Occupation measures grouped by {soc_column}'
).mark_circle().encode(
    x=alt.X('GREEN TIMESHARE', title="% of time spend on green tasks", scale=alt.Scale(type='symlog'),),
    y=alt.Y('GREEN TOPICS', title="Number of green topics", scale=alt.Scale(type='symlog'),),
    size=alt.StrokeWidth('num_job_advs', title="Number of job adverts"),
    color=alt.Color('GREEN CATEGORY',
                   scale=alt.Scale(
            domain=['Non-Green', 'Green Enhanced Skills', 'Green New & Emerging', 'Green Increased Demand'],
            range=['#EB003B', 'green', 'blue', '#18A48C']), title="Green category"),
    tooltip=['SOC_2020_name', 'SOC_2020', 'GREEN TIMESHARE', 'GREEN TOPICS','GREEN CATEGORY', 'num_job_advs']
).properties(height=400, width=400).interactive()

top_socs = all_green_measures_df["SOC_2020_name"].value_counts()[0:10].reset_index()

top_socs_chart = alt.Chart(top_socs, title="Most common SOCs").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="SOC_2020", sort=None),
    x=alt.X("SOC_2020_name", title="Number of job adverts"),
    tooltip=["index",'SOC_2020_name']
).properties(height=80, width=300)


num_bins = 20

col_name = 'GREEN TIMESHARE'
occ_timeshare_hist = occ_measures_per_occ[pd.notnull(occ_measures_per_occ[col_name])][col_name].value_counts(bins=num_bins).reset_index()
occ_timeshare_hist['x'] = occ_timeshare_hist['index'].apply(lambda x: x.mid)
occ_timeshare_hist.drop(columns='index', inplace=True)

occ_timeshare_hist_chart = alt.Chart(occ_timeshare_hist).mark_bar(size=15, color= "#0F294A").encode(
    x= alt.X("x", title="% of time spend on green tasks"),
    y=alt.Y(col_name, title="Number of job adverts"),
    tooltip=[col_name]
).properties(height=80, width=300)

col_name = 'GREEN TOPICS'
occ_topics_hist = occ_measures_per_occ[pd.notnull(occ_measures_per_occ[col_name])][col_name].value_counts(bins=num_bins).reset_index()
occ_topics_hist['x'] = occ_topics_hist['index'].apply(lambda x: x.mid)
occ_topics_hist.drop(columns='index', inplace=True)

occ_topics_hist_chart = alt.Chart(occ_topics_hist).mark_bar(size=15, color= "#0F294A").encode(
    x= alt.X("x", title="Number of green topics"),
    y=alt.Y(col_name, title="Number of job adverts"),
    tooltip=[col_name]
).properties(height=80, width=300)

green_cats = occ_measures_per_occ['GREEN CATEGORY'].value_counts(dropna=False).reset_index()

green_cats_chart = alt.Chart(green_cats).mark_bar(size=15, color= "#0F294A").encode(
    y= alt.Y("index", title="Green category"),
    x=alt.X("GREEN CATEGORY", title="Number of job adverts"),
    tooltip=["GREEN CATEGORY"]
).properties(height=80, width=300)

occ_greenness = (top_socs_chart & occ_timeshare_hist_chart & occ_topics_hist_chart & green_cats_chart) | occ_scatter_greenness 

#save graph
occ_greenness.save(f'{graph_dir}/occ_greenness.html')
occ_greenness

In [129]:
widths = 200

dataset_has_greentimes = occ_measures_per_occ[pd.notnull(occ_measures_per_occ["GREEN TIMESHARE"])]
most_green_timeshare_occ_chart = alt.Chart(
    dataset_has_greentimes.sort_values(by="GREEN TIMESHARE", ascending=False).head(10),
    title=f"SOC with the highest green timeshares (out of {len(occ_measures_per_occ)})"
).mark_bar(size=6, color= "green").encode(
    x= alt.X("GREEN TIMESHARE", title="% of time spend on green tasks"),
    y=alt.Y("SOC_2020_name", sort=None),
    tooltip=["GREEN TIMESHARE", "SOC_2020_name"]
).properties(height=100, width=widths)

least_green_timeshare_occ_chart = alt.Chart(
    dataset_has_greentimes.sort_values(by="GREEN TIMESHARE", ascending=True).head(10),
    title="SOC with the lowest green timeshares"
).mark_bar(size=6, color= "#EB003B").encode(
    x= alt.X("GREEN TIMESHARE", title="% of time spend on green tasks"),
    y=alt.Y("SOC_2020_name", sort=None),
    tooltip=["GREEN TIMESHARE", "SOC_2020_name"]
).properties(height=100, width=widths)

measure_col_name = 'GREEN TOPICS'
dataset_has_greentopics = occ_measures_per_occ[pd.notnull(occ_measures_per_occ[measure_col_name])]
most_green_topics_occ_chart = alt.Chart(
    dataset_has_greentopics.sort_values(by=measure_col_name, ascending=False).head(10),
    title="SOC with the highest numbers of green topics"
).mark_bar(size=6, color= "green").encode(
    x= alt.X(measure_col_name, title="Number of green topics"),
    y=alt.Y("SOC_2020_name", sort=None),
    tooltip=[measure_col_name, "SOC_2020_name"]
).properties(height=100, width=widths)

least_green_topics_occ_chart = alt.Chart(
    dataset_has_greentopics.sort_values(by=measure_col_name, ascending=True).head(10),
    title="SOC with the lowest numbers of green topics"
).mark_bar(size=6, color= "#EB003B").encode(
    x= alt.X(measure_col_name, title="Number of green topics"),
    y=alt.Y("SOC_2020_name", sort=None),
    tooltip=[measure_col_name, "SOC_2020_name"]
).properties(height=100, width=widths)

print(f"{len(occ_measures_per_occ)} unique SOCs")
occ_greenness_most_least = (most_green_timeshare_occ_chart | least_green_timeshare_occ_chart) & (
    most_green_topics_occ_chart | least_green_topics_occ_chart
)

occ_greenness_most_least.save(f'{graph_dir}/occ_greenness_most_least.html')
occ_greenness_most_least

463 unique SOCs


## Skills
- Most common green skills

In [130]:
num_bins = 20

prop_skills_hist = all_green_measures_df['PROP_GREEN'].value_counts(bins=num_bins).reset_index()
prop_skills_hist['x'] = prop_skills_hist['index'].apply(lambda x: x.mid)
prop_skills_hist.drop(columns='index', inplace=True)

prob_green_chart = alt.Chart(prop_skills_hist).mark_bar(size=15, color="green").encode(
    x= alt.X("x", title="Proportion of green skills"),
    y=alt.Y("PROP_GREEN", title="Number of job adverts"),
    tooltip=['PROP_GREEN']
)

num_skills_hist = all_green_measures_df['NUM_ENTS'].value_counts(bins=num_bins).reset_index()
num_skills_hist['x'] = num_skills_hist['index'].apply(lambda x: x.mid)
num_skills_hist.drop(columns='index', inplace=True)

num_skills_chart = alt.Chart(num_skills_hist).mark_bar(size=15, color='#0F294A').encode(
    x= alt.X("x", title="Number of all skills"),
    y=alt.Y("NUM_ENTS", title="Number of job adverts"),
    tooltip=['NUM_ENTS']
)

skill_greenness = num_skills_chart | prob_green_chart

skill_greenness.save(f'{graph_dir}/skill_greenness.html')
skill_greenness

#### Separate out all the skills and the green info (if it's green)

In [131]:
print(all_green_measures_df['occupation'].nunique())
print(all_green_measures_df['sector'].nunique())
print(all_green_measures_df['parent_sector'].nunique())
print(all_green_measures_df['knowledge_domain'].nunique())

2104
669
37
13


In [134]:
ents_explode = all_green_measures_df[['job_id', "SOC_2020_name", 'job_title_raw', 'occupation', 'ENTS']].explode("ENTS").reset_index(drop=True)
ents_explode["skill_label"] = ents_explode["ENTS"].apply(lambda x: x[0] if x else [])
print(len(ents_explode))
ents_explode = ents_explode.explode("skill_label").reset_index(drop=True)
print(len(ents_explode))
ents_explode.head(2)

724945
789202


Unnamed: 0,job_id,SOC_2020_name,job_title_raw,occupation,ENTS,skill_label
0,41549950,Conservation professionals,Principal Consultant/Associate Director - Ecol...,Director Consultancy,"[[good work-life balance], SKILL]",good work-life balance
1,41549950,Conservation professionals,Principal Consultant/Associate Director - Ecol...,Director Consultancy,"[[managing client projects], SKILL]",managing client projects


In [135]:
green_ents_explode = all_green_measures_df[['job_id', "SOC_2020_name", 'job_title_raw', 'occupation', 'GREEN_ENTS']].explode("GREEN_ENTS").reset_index(drop=True)
green_ents_explode["skill_label"] = green_ents_explode["GREEN_ENTS"].apply(lambda x: x[0] if isinstance(x, list) else None)
green_ents_explode["extracted_skill"] = green_ents_explode["GREEN_ENTS"].apply(
    lambda x: x[1][2][0] if isinstance(x, list) else None)
green_ents_explode["extracted_skill_id"] = green_ents_explode["GREEN_ENTS"].apply(
    lambda x: x[1][2][1] if isinstance(x, list) else None)

green_ents_explode.head(2)

Unnamed: 0,job_id,SOC_2020_name,job_title_raw,occupation,GREEN_ENTS,skill_label,extracted_skill,extracted_skill_id
0,41549950,Conservation professionals,Principal Consultant/Associate Director - Ecol...,Director Consultancy,"[MSc in EIA, Environmental Management, [green,...","MSc in EIA, Environmental Management",complete training in environmental matters,2cb27e30-2be7-4a09-9502-fdd2102c046b
1,41550510,Customer service managers,Retail Liaison Manager - Greater London,Retail Manager,,,,


In [136]:
green_skills_df = pd.concat([ents_explode, green_ents_explode])
green_skills_df = green_skills_df[(
    (green_skills_df["skill_label"]!="") & (pd.notnull(green_skills_df["skill_label"]))
)]

# Remove the duplicate green skills per job advert
print(len(green_skills_df))
green_skills_df.sort_values(by="extracted_skill", inplace=True)
green_skills_df.drop_duplicates(subset=["job_id", "skill_label"], keep='first', inplace=True)
print(len(green_skills_df))

green_skills_df.head(2)

822332
775893


Unnamed: 0,job_id,SOC_2020_name,job_title_raw,occupation,ENTS,skill_label,GREEN_ENTS,extracted_skill,extracted_skill_id
49068,47646218,Purchasing managers and directors,Category Manager Logistics,Category Manager,,Drive CSR,"[Drive CSR, [green, 0.6834325674325674, [CSR, ...",CSR,66db424f-2abe-420d-8e5b-186607266b61
48358,47567621,,CSR Co-Ordinator,,,CSR topics,"[CSR topics, [green, 0.7734325674325675, [CSR,...",CSR,66db424f-2abe-420d-8e5b-186607266b61


In [220]:
green_skills_df[green_skills_df['extracted_skill']=='promote sustainability']['extracted_skill_id'].unique()

array(['469e19ed-a0bd-445a-ae2d-4ba9430e296b'], dtype=object)

In [221]:
green_skills_df[green_skills_df['extracted_skill_id']=='469e19ed-a0bd-445a-ae2d-4ba9430e296b']['extracted_skill'].unique()


array(['advocate sustainability', 'encourage sustainability',
       'promote sustainability', 'promoting sustainability'], dtype=object)

In [137]:
top_n = 10

top_ents = green_skills_df["skill_label"].value_counts()[0:10].reset_index()

top_ents_chart = alt.Chart(top_ents, title="Most common extracted entities").mark_bar(size=15, color='#0F294A').encode(
    y= alt.Y("index", title="Entity extracted", sort=None),
    x=alt.X("skill_label", title="Number of job adverts"),
    tooltip=['skill_label'],
).properties(width=100)

top_green_ents = green_skills_df[pd.notnull(green_skills_df["extracted_skill"])]["skill_label"].value_counts()[0:10].reset_index()

top_green_ents_chart = alt.Chart(top_green_ents, title="Most common extracted green entities").mark_bar(size=15, color="green").encode(
    y= alt.Y("index", title="Entity extracted", sort=None),
    x=alt.X("skill_label", title="Number of job adverts"),
    tooltip=['skill_label']
).properties(width=100)

top_esco_green_skills = green_skills_df["extracted_skill"].value_counts()[0:10].reset_index()

top_esco_green_skills_chart = alt.Chart(top_esco_green_skills, title="Most common mapped green ESCO skills").mark_bar(size=15, color="green").encode(
    y= alt.Y("index", title="ESCO green skill", sort=None),
    x=alt.X("extracted_skill", title="Number of job adverts"),
    tooltip=['extracted_skill']
).properties(width=100)

common_skills_chart = top_ents_chart | top_green_ents_chart | top_esco_green_skills_chart

common_skills_chart.save(f'{graph_dir}/common_skills_chart.html')
common_skills_chart


In [138]:
# put them all together
all_skill_greenness = (num_skills_chart.properties(height=200) | prob_green_chart.properties(height=200)) & (top_ents_chart | top_green_ents_chart | top_esco_green_skills_chart)
all_skill_greenness.save(f'{graph_dir}/all_skill_greenness.html')
all_skill_greenness

###  Most and least proportion of green skills


In [170]:
widths = 200

min_num_ents = 10

dataset_min_num_ents = all_green_measures_df[all_green_measures_df["NUM_ENTS"]>=min_num_ents]

most_prop_green_skills_chart = alt.Chart(
    dataset_min_num_ents.sort_values(by="PROP_GREEN", ascending=False).head(10)[["PROP_GREEN", "job_title_raw", "company_raw"]],
    title=f"Jobs with the highest proportion of green skills (out of {len(dataset_min_num_ents)})"
).mark_bar(size=6, color= "green").encode(
    x= alt.X("PROP_GREEN", title="Proportion of green skills"),
    y=alt.Y("job_title_raw", sort=None),
    tooltip=["PROP_GREEN", "job_title_raw", "company_raw"]
).properties(height=100, width=widths)

least_prop_green_skills_chart = alt.Chart(
    dataset_min_num_ents.sort_values(by="PROP_GREEN", ascending=True).head(10)[["PROP_GREEN", "job_title_raw", "company_raw"]],
    title=f"Jobs with the highest proportion of green skills (out of {len(dataset_min_num_ents)})"
).mark_bar(size=6, color= "green").encode(
    x= alt.X("PROP_GREEN", title="Proportion of green skills"),
    y=alt.Y("job_title_raw", sort=None),
    tooltip=["PROP_GREEN", "job_title_raw", "company_raw"]
).properties(height=100, width=widths)

skills_greenness_most_least = (most_prop_green_skills_chart | least_prop_green_skills_chart)


skills_greenness_most_least.save(f'{graph_dir}/skills_greenness_most_least.html')
skills_greenness_most_least

In [178]:
all_green_measures_df['green_ents_list'] = all_green_measures_df["GREEN_ENTS"].apply(
    lambda x: [skill[0] for skill in x] if x else None)
all_green_measures_df['PROP_GREEN_JITTER'] = all_green_measures_df["PROP_GREEN"].apply(
    lambda x: x+random.uniform(-0.01, 0.01))
all_green_measures_df['NUM_ENTS_JITTER'] = all_green_measures_df["NUM_ENTS"].apply(
    lambda x: x+random.uniform(-0.1, 0.1))

In [179]:
all_green_measures_df["GREEN_ENTS"].iloc[0]

[['MSc in EIA, Environmental Management',
  ['green',
   0.938,
   ['complete training in environmental matters',
    '2cb27e30-2be7-4a09-9502-fdd2102c046b',
    0.5971940121657241]]]]

In [180]:
plot_n = 500
high_prop_green_skills = all_green_measures_df.sort_values(
    by="PROP_GREEN", ascending=False).head(
    plot_n
)[["NUM_ENTS_JITTER", "PROP_GREEN_JITTER", "job_id",'NUM_ENTS', 'PROP_GREEN', "job_title_raw", "company_raw", "SIC_name", "SOC_2020_name", "green_ents_list"]]


high_prop_green_skills_scatter_greenness = alt.Chart(
    high_prop_green_skills,
    title=f'Highest {plot_n} job adverts with highest proportion of green skills'
).mark_circle(color="green").encode(
    x=alt.X('NUM_ENTS_JITTER', title="Total number of entities"), #, scale=alt.Scale(type='symlog'),),
    y=alt.Y('PROP_GREEN_JITTER', title="Proportion of green skills"),
    tooltip=["job_id",'NUM_ENTS', 'PROP_GREEN', "job_title_raw", "company_raw", "SIC_name", "SOC_2020_name", "green_ents_list"]
).properties(height=400, width=400).interactive()

low_prop_green_skills = all_green_measures_df.sort_values(
    by="PROP_GREEN", ascending=True).head(plot_n
                                         )[["NUM_ENTS_JITTER", "PROP_GREEN_JITTER", "job_id",'NUM_ENTS', 'PROP_GREEN', "job_title_raw", "company_raw", "SIC_name", "SOC_2020_name", "green_ents_list"]]



low_prop_green_skills_scatter_greenness = alt.Chart(
    low_prop_green_skills,
    title=f'Lowest {plot_n} job adverts with lowest proportion of green skills'
).mark_circle(color= "#EB003B").encode(
    x=alt.X('NUM_ENTS_JITTER', title="Total number of entities"), #, scale=alt.Scale(type='symlog'),),
    y=alt.Y('PROP_GREEN_JITTER', title="Proportion of green skills"),
    tooltip=["job_id",'NUM_ENTS', 'PROP_GREEN', "job_title_raw", "company_raw", "SIC_name", "SOC_2020_name", "green_ents_list"]
).properties(height=400, width=400).interactive()

skills_greenness_most_least_scatter = high_prop_green_skills_scatter_greenness | low_prop_green_skills_scatter_greenness

skills_greenness_most_least_scatter.save(f'{graph_dir}/skills_greenness_most_least_scatter.html')
skills_greenness_most_least_scatter

In [187]:
# Most common green ESCO skill matched to and what the entities were

In [211]:
most_common_escos = list(green_skills_df['extracted_skill'].value_counts()[0:5].to_dict().keys())
most_common_escos

['sustainability',
 'health and safety regulations',
 'promote sustainability',
 'environmental sustainability',
 'encourage sustainability']

In [213]:
top_skills_sample = green_skills_df[green_skills_df['extracted_skill'].isin(most_common_escos)
               ].groupby('extracted_skill').apply(
    lambda x: x.sample(n=min(len(x), 3), random_state=42) if len(x)!=0 else None).reset_index(drop=True)
top_skills_sample[['job_title_raw', 'skill_label', 'extracted_skill']].to_csv("skills_mapped_sample_sept.csv")
                  

In [217]:
most_common_esco_ids = list(green_skills_df['extracted_skill_id'].value_counts()[0:5].to_dict().keys())

top_skills_ids_sample = green_skills_df[green_skills_df['extracted_skill_id'].isin(most_common_esco_ids)
               ].groupby('extracted_skill_id').apply(
    lambda x: x.sample(n=min(len(x), 3), random_state=42) if len(x)!=0 else None).reset_index(drop=True)
top_skills_ids_sample[['job_title_raw', 'skill_label', 'extracted_skill', 'extracted_skill_id']].to_csv("skill_ids_mapped_sample_sept.csv")
   

## Industry measures

In [None]:
ind_columns = ['INDUSTRY TOTAL GHG EMISSIONS',
       'INDUSTRY GHG PER UNIT EMISSIONS', 'INDUSTRY PROP HOURS GREEN TASKS',
       'INDUSTRY PROP WORKERS GREEN TASKS',
       'INDUSTRY PROP WORKERS 20PERC GREEN TASKS']

ind_measures_per_occ = all_green_measures_df.drop_duplicates(
    subset=["SIC"] + ind_columns)[['SIC', 'SIC_name'] + ind_columns]
ind_measures_per_occ = ind_measures_per_occ[pd.notnull(ind_measures_per_occ["SIC"])]

sic_2_job_ad_count = all_green_measures_df.groupby("SIC")['job_id'].count().to_dict()

ind_measures_per_occ['num_job_advs'] = ind_measures_per_occ["SIC"].map(sic_2_job_ad_count)

In [None]:
ind_measures_per_occ.head(2)

In [None]:
def get_binned_data(dataset, col_name, num_bins=20):
    
    data_hist = dataset[pd.notnull(dataset[col_name])][col_name].value_counts(bins=num_bins).reset_index()
    data_hist['x'] = data_hist['index'].apply(lambda x: x.mid)
    data_hist.drop(columns='index', inplace=True)
    return data_hist


In [None]:
top_sics = all_green_measures_df["SIC_name"].value_counts()[0:10].reset_index()

top_sics_chart = alt.Chart(top_sics, title="Most common SICs").mark_bar(size=15, color="#0F294A").encode(
    y= alt.Y("index", title="SIC", sort=None),
    x=alt.X("SIC_name", title="Number of job adverts"),
    tooltip=["index",'SIC_name']
).properties(height=100, width=100)

col_name = 'INDUSTRY TOTAL GHG EMISSIONS'
ind_total_ghg_hist = get_binned_data(ind_measures_per_occ, col_name, num_bins=20)
ind_total_ghg_hist_chart = alt.Chart(ind_total_ghg_hist).mark_bar(size=15, color= "#0F294A").encode(
    x= alt.X("x", title="Total GHG emissions"),
    y=alt.Y(col_name, title="Number of job adverts"),
    tooltip=[col_name]
).properties(height=100, width=200)

col_name = 'INDUSTRY GHG PER UNIT EMISSIONS'
ind_norm_ghg_hist = get_binned_data(ind_measures_per_occ, col_name, num_bins=20)

ind_norm_ghg_hist_chart = alt.Chart(ind_norm_ghg_hist).mark_bar(size=15, color= "#0F294A").encode(
    x= alt.X("x", title="GHG emissions per unit"),
    y=alt.Y(col_name, title="Number of job adverts"),
    tooltip=[col_name]
).properties(height=100, width=200)

col_name = 'INDUSTRY PROP HOURS GREEN TASKS'
ind_greentasks_1_hist = get_binned_data(ind_measures_per_occ, col_name, num_bins=20)
ind_greentasks_1_hist_chart = alt.Chart(ind_greentasks_1_hist).mark_bar(size=15, color= "#0F294A").encode(
    x= alt.X("x", title=col_name),
    y=alt.Y(col_name, title="Number of job adverts"),
    tooltip=[col_name]
).properties(height=100, width=200)

col_name = 'INDUSTRY PROP WORKERS GREEN TASKS'
ind_greentasks_2_hist = get_binned_data(ind_measures_per_occ, col_name, num_bins=20)
ind_greentasks_2_hist_chart = alt.Chart(ind_greentasks_2_hist).mark_bar(size=15, color= "#0F294A").encode(
    x= alt.X("x", title=col_name),
    y=alt.Y(col_name, title="Number of job adverts"),
    tooltip=[col_name]
).properties(height=100, width=200)

col_name = 'INDUSTRY PROP WORKERS 20PERC GREEN TASKS'
ind_greentasks_3_hist = get_binned_data(ind_measures_per_occ, col_name, num_bins=20)
ind_greentasks_3_hist_chart = alt.Chart(ind_greentasks_3_hist).mark_bar(size=15, color= "#0F294A").encode(
    x= alt.X("x", title=col_name),
    y=alt.Y(col_name, title="Number of job adverts"),
    tooltip=[col_name]
).properties(height=100, width=200)

In [140]:
# 2 industry greenness measures sized by number job adverts 
ind_scatter_greenness = alt.Chart(
    ind_measures_per_occ[(
        (pd.notnull(ind_measures_per_occ['INDUSTRY GHG PER UNIT EMISSIONS'])) & (
            pd.notnull(ind_measures_per_occ['INDUSTRY PROP HOURS GREEN TASKS'])) & (
            pd.notnull(ind_measures_per_occ['INDUSTRY TOTAL GHG EMISSIONS']))
    )],
    title=f'Industry measures grouped by SIC'
).mark_circle().encode(
    x=alt.X('INDUSTRY GHG PER UNIT EMISSIONS', scale=alt.Scale(type='symlog'), title="GHG emissions per unit"),
    y=alt.Y('INDUSTRY PROP HOURS GREEN TASKS', title="Proportion of hours on green tasks"),
    size=alt.Size('num_job_advs', title="Number of job adverts", scale=alt.Scale(range=[50, 1000])),
    color=alt.Color('INDUSTRY TOTAL GHG EMISSIONS', scale=alt.Scale(scheme='redyellowgreen', reverse=True, domainMid=1000)),
    tooltip=['SIC_name', 'SIC', 'INDUSTRY GHG PER UNIT EMISSIONS', 'INDUSTRY PROP HOURS GREEN TASKS','INDUSTRY TOTAL GHG EMISSIONS', 'num_job_advs']
).properties(height=400,width=400).interactive()

ind_greenness = (
    top_sics_chart & ind_total_ghg_hist_chart & ind_norm_ghg_hist_chart
) | (
    ind_greentasks_1_hist_chart & ind_greentasks_2_hist_chart & ind_greentasks_3_hist_chart
) | ind_scatter_greenness
#save graph
ind_greenness.save(f'{graph_dir}/ind_greenness.html')
ind_greenness


In [141]:
widths = 200

measure_col_name = 'INDUSTRY GHG PER UNIT EMISSIONS'
dataset_has_ghg = ind_measures_per_occ[(
    (pd.notnull(ind_measures_per_occ[measure_col_name])) & (pd.notnull(ind_measures_per_occ['SIC_name']))
)]
most_ghg_emissions_chart = alt.Chart(
    dataset_has_ghg.sort_values(by=measure_col_name, ascending=False).head(10),
    title=f"SIC with the highest GHG emissions per unit (out of {len(ind_measures_per_occ)})"
).mark_bar(size=6, color= "#EB003B").encode(
    x= alt.X(measure_col_name, title="GHG emissions per unit"),
    y=alt.Y("SIC_name", sort=None),
    tooltip=[measure_col_name, "SIC_name"]
).properties(height=100, width=widths)

least_ghg_emissions_chart = alt.Chart(
    dataset_has_ghg.sort_values(by=measure_col_name, ascending=True).head(10),
    title="SIC with the lowest GHG emissions per unit "
).mark_bar(size=6, color= "green").encode(
    x= alt.X(measure_col_name, title="GHG emissions per unit"),
    y=alt.Y("SIC_name", sort=None),
    tooltip=[measure_col_name, "SIC_name"]
).properties(height=100, width=widths)

measure_col_name = 'INDUSTRY PROP HOURS GREEN TASKS'
dataset_has_proptasks = ind_measures_per_occ[pd.notnull(ind_measures_per_occ[measure_col_name])]
most_prop_tasks_chart = alt.Chart(
    dataset_has_proptasks.sort_values(by=measure_col_name, ascending=False).head(10),
    title="SIC with the highest proportion of green tasks"
).mark_bar(size=6, color= "green").encode(
    x= alt.X(measure_col_name, title="Proportion of green tasks"),
    y=alt.Y("SIC_name", sort=None),
    tooltip=[measure_col_name, "SIC_name"]
).properties(height=100, width=widths)

least_prop_tasks_chart = alt.Chart(
    dataset_has_proptasks.sort_values(by=measure_col_name, ascending=True).head(10),
    title="SIC with the lowest proportion of green tasks"
).mark_bar(size=6, color= "#EB003B").encode(
    x= alt.X(measure_col_name, title="Proportion of green tasks"),
    y=alt.Y("SIC_name", sort=None),
    tooltip=[measure_col_name, "SIC_name"]
).properties(height=100, width=widths)

print(f"{len(ind_measures_per_occ)} unique SICs")
ind_greenness_most_least = (least_ghg_emissions_chart | most_ghg_emissions_chart) & (
    most_prop_tasks_chart | least_prop_tasks_chart
)

ind_greenness_most_least.save(f'{graph_dir}/ind_greenness_most_least.html')
ind_greenness_most_least

290 unique SICs


## Everything together!

In [181]:
section_font_size = 20

all_1 = alt.hconcat(
    top_socs_chart,
    top_job_title_raw_without_soc_chart,
    top_job_title_raw_with_soc_chart,
    title="SOCs & job titles"
)

all_2 = alt.hconcat(
    top_sics_chart,
    top_company_raw_chart,
    top_company_raw_withsic_chart,
    title="SICs & company names"
)
all_section = alt.vconcat(
    all_1,
    all_2,
    coverage_full,
    title=alt.TitleParams(f"Full dataset - {len(all_green_measures_df)} job adverts", fontSize=section_font_size),)


occ_1 = alt.hconcat(
    alt.vconcat(
        occ_timeshare_hist_chart,
        occ_topics_hist_chart,
        green_cats_chart
    ),
        occ_scatter_greenness)


occ_2 = alt.vconcat(
    alt.hconcat(
        most_green_timeshare_occ_chart,
        least_green_timeshare_occ_chart
    ),
    alt.hconcat(
        most_green_topics_occ_chart,
        least_green_topics_occ_chart
    )
)

occ_section = alt.vconcat(
    occ_1,
    occ_2,
    title=alt.TitleParams(f"Occupation measures - {len(occ_measures_per_occ)} occupations", fontSize=section_font_size),
)

# Industry measures

ind_1 = alt.hconcat(
    alt.vconcat(ind_total_ghg_hist_chart, ind_norm_ghg_hist_chart),
    alt.vconcat(ind_greentasks_1_hist_chart, ind_greentasks_2_hist_chart, ind_greentasks_3_hist_chart),
    ind_scatter_greenness
)
ind_2 = alt.vconcat(
    alt.hconcat(least_ghg_emissions_chart, most_ghg_emissions_chart),
    alt.hconcat(most_prop_tasks_chart, least_prop_tasks_chart)
)

ind_section = alt.vconcat(
    ind_1,
    ind_2,
    title=alt.TitleParams(f"Industry measures - {len(ind_measures_per_occ)} industries", fontSize=section_font_size),
)

# Skills measures

skills_1 = alt.vconcat(
    alt.hconcat(
    num_skills_chart.properties(height=200),
    prob_green_chart.properties(height=200)
),
    alt.hconcat(
        top_ents_chart,
        top_green_ents_chart,
        top_esco_green_skills_chart
    )
)
    
                       

skills_2 = alt.hconcat(high_prop_green_skills_scatter_greenness, low_prop_green_skills_scatter_greenness)

skills_section = alt.vconcat(
    skills_1,
    skills_2,
    title=alt.TitleParams(f"Skills measures - {len(all_green_measures_df)} job adverts", fontSize=section_font_size),
)



all_chart  = alt.vconcat(all_section, occ_section, ind_section, skills_section)

In [182]:
all_chart.save(f'{graph_dir}/all_individual_measures.html')


### The three individual scatters

In [183]:
all_green_measures_df['PROP_GREEN_BIN'] = pd.cut(all_green_measures_df['PROP_GREEN'], bins = 10)

In [184]:
plot_n = 100
rand_data = all_green_measures_df.sample(1000, random_state=42)
rand_data.drop(columns='PROP_GREEN_BIN', inplace=True)
random_skills_scatter_greenness = alt.Chart(
    rand_data,
    title=f'{len(rand_data)} random job adverts'
).mark_circle(color="green").encode(
    x=alt.X('NUM_ENTS_JITTER', title="Total number of entities"), #, scale=alt.Scale(type='symlog'),),
    y=alt.Y('PROP_GREEN_JITTER', title="Proportion of green skills"),
#     tooltip=["job_id",'NUM_ENTS', 'PROP_GREEN', "job_title_raw", "company_raw", "SIC_name", "SOC_2020_name", "green_ents_list"]
).properties(height=400, width=400).interactive()

random_skills_scatter_greenness

In [186]:
all_individual_measures_scatters = alt.hconcat(occ_scatter_greenness, ind_scatter_greenness, random_skills_scatter_greenness)
all_individual_measures_scatters.save(f'{graph_dir}/all_individual_measures_scatters.html')


## Between measures
- Taking a sample of job adverts with a range of different proportions of green skills
- Dont include those with < 5 entities

In [147]:

strat_sample = all_green_measures_df[all_green_measures_df["NUM_ENTS"]>5]
strat_sample = strat_sample.groupby('PROP_GREEN_BIN').apply(
    lambda x: x.sample(n=min(len(x), 300), random_state=42) if len(x)!=0 else None).reset_index(drop=True)
strat_sample.drop(columns="PROP_GREEN_BIN", inplace=True)
len(strat_sample)

1594

In [148]:
column_renamer = {
    'INDUSTRY PROP HOURS GREEN TASKS':"Industry: Proportion of green tasks",
    'INDUSTRY GHG PER UNIT EMISSIONS': "Industry: GHG emission per unit",
    'GREEN TIMESHARE': "Occupation: % time spent on green tasks",
    'GREEN TOPICS': "Occupation: Number of green topics",
    "PROP_GREEN": "Proportion of skills which are green"
}

In [149]:
x_col = 'INDUSTRY PROP HOURS GREEN TASKS'
y_col = 'GREEN TIMESHARE'
col_col = "PROP_GREEN"
ind_occ_1_chart = alt.Chart(
    strat_sample[(
        (pd.notnull(strat_sample[x_col])) & (
            pd.notnull(strat_sample[y_col])) & (
            pd.notnull(strat_sample[col_col]))
    )],
#     title=f"Occupation vs industry measures, with a stratified (by prop green skills) sample of {len(strat_sample)} job adverts"
).mark_circle(size=50).encode(
    x=alt.X(
        x_col,
        title=column_renamer[x_col]
#         scale=alt.Scale(type='symlog')
    ),
    y=alt.Y(y_col, title=column_renamer[y_col]),
    color=alt.Color(col_col, scale=alt.Scale(scheme='redyellowgreen', domainMid=0.2), title=column_renamer[col_col]),
    tooltip=['job_id', x_col, y_col, col_col, 'company_raw', "SIC_name", 'job_title_raw', "SOC_2020_name"]
).properties(height=400,width=400).interactive()

In [150]:
x_col = 'INDUSTRY GHG PER UNIT EMISSIONS'
y_col = 'GREEN TIMESHARE'
col_col = "PROP_GREEN"
ind_occ_2_chart = alt.Chart(
    strat_sample[(
        (pd.notnull(strat_sample[x_col])) & (
            pd.notnull(strat_sample[y_col])) & (
            pd.notnull(strat_sample[col_col]))
    )],
#     title=f"Occupation vs industry measures, with a stratified (by prop green skills) sample of {len(strat_sample)} job adverts"
).mark_circle(size=50).encode(
    x=alt.X(
        x_col,
        title=column_renamer[x_col],
        scale=alt.Scale(type='symlog')
    ),
    y=alt.Y(y_col, title=column_renamer[y_col]),
    color=alt.Color(col_col, scale=alt.Scale(scheme='redyellowgreen', domainMid=0.2), title=column_renamer[col_col]),
    tooltip=['job_id', x_col, y_col, col_col, 'company_raw', "SIC_name", 'job_title_raw', "SOC_2020_name"]
).properties(height=400,width=400).interactive()

In [151]:
x_col = 'INDUSTRY PROP HOURS GREEN TASKS'
y_col = 'GREEN TOPICS'
col_col = "PROP_GREEN"
ind_occ_3_chart = alt.Chart(
    strat_sample[(
        (pd.notnull(strat_sample[x_col])) & (
            pd.notnull(strat_sample[y_col])) & (
            pd.notnull(strat_sample[col_col]))
    )],
#     title=f"Occupation vs industry measures, with a stratified (by prop green skills) sample of {len(strat_sample)} job adverts"
).mark_circle(size=50).encode(
    x=alt.X(
        x_col,
        title=column_renamer[x_col]
    ),
    y=alt.Y(y_col, title=column_renamer[y_col]),
    color=alt.Color(col_col, scale=alt.Scale(scheme='redyellowgreen', domainMid=0.2), title=column_renamer[col_col]),
    tooltip=['job_id', x_col, y_col, col_col, 'company_raw', "SIC_name", 'job_title_raw', "SOC_2020_name"]
).properties(height=400,width=400).interactive()

In [152]:
x_col = 'INDUSTRY GHG PER UNIT EMISSIONS'
y_col = 'GREEN TOPICS'
col_col = "PROP_GREEN"
ind_occ_4_chart = alt.Chart(
    strat_sample[(
        (pd.notnull(strat_sample[x_col])) & (
            pd.notnull(strat_sample[y_col])) & (
            pd.notnull(strat_sample[col_col]))
    )],
#     title=f"Occupation vs industry measures, with a stratified (by prop green skills) sample of {len(strat_sample)} job adverts"
).mark_circle(size=50).encode(
    x=alt.X(
        x_col,
        title=column_renamer[x_col],
        scale=alt.Scale(type='symlog')
    ),
    y=alt.Y(y_col, title=column_renamer[y_col]),
    color=alt.Color(col_col, scale=alt.Scale(scheme='redyellowgreen', domainMid=0.2), title=column_renamer[col_col]),
    tooltip=['job_id', x_col, y_col, col_col, 'company_raw', "SIC_name", 'job_title_raw', "SOC_2020_name"]
).properties(height=400,width=400).interactive()

In [153]:
prop_green_jobs_dist_sample  = alt.Chart(
    strat_sample,
    title=f"Sample of {len(strat_sample)} job adverts which have at least 5 skills entities"
).mark_bar(
    color="#0F294A",
).encode(
    x=alt.X("PROP_GREEN", bin=True, title="Proportion of skills which are green"),
    y=alt.Y("count()", title="Number of job adverts"),
)


#### What are the industries and occupations with the most and least avergae prop of green skills

In [154]:
all_green_measures_df_5ents = all_green_measures_df[all_green_measures_df['NUM_ENTS']>5]

In [155]:
widths = 80

In [156]:
prop_green_skills_per_occ = all_green_measures_df_5ents.groupby("SOC_2020_name")['PROP_GREEN'].mean().reset_index().sort_values(
    by='PROP_GREEN', ascending=False).rename(columns={"PROP_GREEN": "Average proportion of green skills"})

most_prop_skills_by_occ = alt.Chart(
    prop_green_skills_per_occ.head(10),
    title="SOC with the highest proportion of green skills"
).mark_bar(size=6, color= "green").encode(
    x= alt.X("Average proportion of green skills"),
    y=alt.Y("SOC_2020_name", sort=None),
    tooltip=["Average proportion of green skills", "SOC_2020_name"]
).properties(height=100, width=widths)

least_prop_skills_by_occ = alt.Chart(
    prop_green_skills_per_occ.tail(10),
    title="SOC with the lowest proportion of green skills"
).mark_bar(size=6, color= "green").encode(
    x= alt.X("Average proportion of green skills"),
    y=alt.Y("SOC_2020_name", sort=None),
    tooltip=["Average proportion of green skills", "SOC_2020_name"]
).properties(height=100, width=widths)

In [157]:
prop_green_skills_per_ind = all_green_measures_df_5ents.groupby("SIC_name")['PROP_GREEN'].mean().reset_index().sort_values(
    by='PROP_GREEN', ascending=False).rename(columns={"PROP_GREEN": "Average proportion of green skills"})

most_prop_skills_by_ind = alt.Chart(
    prop_green_skills_per_ind.head(10),
    title="SIC with the highest proportion of green skills"
).mark_bar(size=6, color= "green").encode(
    x= alt.X("Average proportion of green skills"),
    y=alt.Y("SIC_name", sort=None),
    tooltip=["Average proportion of green skills", "SIC_name"]
).properties(height=100, width=widths)

least_prop_skills_by_ind = alt.Chart(
    prop_green_skills_per_ind.tail(10),
    title="SIC with the lowest proportion of green skills"
).mark_bar(size=6, color= "green").encode(
    x= alt.X("Average proportion of green skills"),
    y=alt.Y("SIC_name", sort=None),
    tooltip=["Average proportion of green skills", "SIC_name"]
).properties(height=100, width=widths)

## All between measures together

In [158]:
ind_occ_charts = alt.vconcat(
    alt.hconcat(
        ind_occ_1_chart.properties(height=200,width=200),
        ind_occ_2_chart.properties(height=200,width=200),
    ),
    alt.hconcat(
        ind_occ_3_chart.properties(height=200,width=200),
        ind_occ_4_chart.properties(height=200,width=200),
    ),
    prop_green_jobs_dist_sample.properties(height=100,width=100),
    alt.vconcat(
    alt.hconcat(most_prop_skills_by_occ, least_prop_skills_by_occ),
    alt.hconcat(most_prop_skills_by_ind, least_prop_skills_by_ind),
        title="All job adverts with more than 5 entities"
),
    title=f"Occupation vs industry measures, with a stratified (by prop green skills) sample of {len(strat_sample)} job adverts")

ind_occ_charts.save(f'{graph_dir}/ind_occ_charts.html')
