# Chloropleth plot
Average greenness by region


In [1]:
from dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures import *

from dap_prinz_green_jobs.getters.industry_getters import load_sic
from dap_prinz_green_jobs import BUCKET_NAME, logger, PROJECT_DIR, analysis_config
from dap_prinz_green_jobs.getters.data_getters import load_s3_data, save_to_s3
from dap_prinz_green_jobs.utils.chloropleth_utils import get_nuts2polygons_dict, get_nuts1polygons_dict, get_nuts3polygons_dict
from dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures import clean_soc_name
from dap_prinz_green_jobs.utils.plotting import configure_plots, NESTA_COLOURS_DICT

import altair as alt
from altair import datum
import geopandas as gpd

from datetime import datetime
from textwrap import wrap

import os
import ast

In [2]:
#save graphs
today = datetime.today().strftime('%y%m%d')
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

/Users/elizabethgallagher/Code/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/240301 directory already exists


## Load each aggregated by region dataset

In [3]:
def get_min_max_norm(df_col):
	return (df_col - df_col.min()) / (df_col.max() - df_col.min())

def get_standardised_norm(df_col):
	return (df_col-df_col.mean())/ df_col.std()

In [4]:
itl_aggregated_data = pd.DataFrame()
for agg_itl_by in ["itl_1_code", "itl_2_code", "itl_3_code"]:
    date_stamp = analysis_config['analysis_files']['agg_region_date_stamp']
    per_itl_aggregated_data = load_s3_data(
        BUCKET_NAME,
        f"outputs/data/ojo_application/extracted_green_measures/analysis/{agg_itl_by}_aggregated_data_{date_stamp}.csv"
        )

    per_itl_aggregated_data['average_perc_green_skills'] = per_itl_aggregated_data['average_prop_green_skills']*100
    per_itl_aggregated_data['average_prop_occ_green_timeshare'] = per_itl_aggregated_data['average_occ_green_timeshare']/100

    per_itl_aggregated_data.loc[:, "itl_type"] = agg_itl_by
    per_itl_aggregated_data.rename(columns = {agg_itl_by: "itl_code"}, inplace=True)
    per_itl_aggregated_data.drop([f"{agg_itl_by}.1"], axis=1, inplace=True)
    per_itl_aggregated_data.loc[:, 'average_ind_perunit_ghg_scaled'] = 1 - get_min_max_norm(per_itl_aggregated_data['average_ind_perunit_ghg'])
    per_itl_aggregated_data.loc[:, 'average_occ_green_timeshare_scaled'] = get_min_max_norm(per_itl_aggregated_data['average_occ_green_timeshare'])
    per_itl_aggregated_data.loc[:, 'average_perc_green_skills_scaled'] = get_min_max_norm(per_itl_aggregated_data['average_perc_green_skills'])
    itl_aggregated_data = pd.concat([itl_aggregated_data, per_itl_aggregated_data])

2024-03-01 12:59:50,445 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2024-03-01 12:59:50,701 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [5]:
len(itl_aggregated_data)

208

## Get additional geometry data needed for chloropleth

In [6]:
nuts1polygons_dict = get_nuts1polygons_dict()
itl1polygons_dict = {k.replace("UK","TL"):v for k, v in nuts1polygons_dict.items()}

nuts2polygons_dict = get_nuts2polygons_dict()
itl2polygons_dict = {k.replace("UK","TL"):v for k, v in nuts2polygons_dict.items()}

nuts3polygons_dict = get_nuts3polygons_dict()
itl3polygons_dict = {k.replace("UK","TL"):v for k, v in nuts3polygons_dict.items()}

allpolygons_dict = {**itl1polygons_dict, **itl2polygons_dict, **itl3polygons_dict}

In [7]:
itl_aggregated_data['geometry_name'] = itl_aggregated_data["itl_code"].map(allpolygons_dict)
itl_aggregated_data[['geometry', 'itl_name']] = itl_aggregated_data['geometry_name'].apply(lambda x: pd.Series(x))
itl_aggregated_data.drop('geometry_name', axis=1, inplace=True)

geo_df = gpd.GeoDataFrame(itl_aggregated_data)

## Get data in format to filter in viz

In [8]:
filter_by_cols = ["average_ind_perunit_ghg_scaled", "average_occ_green_timeshare_scaled", "average_perc_green_skills_scaled"]

rename_variable = {
    "average_occ_green_timeshare_scaled": "Average time spent on green tasks",
    "average_ind_perunit_ghg_scaled": "Average per unit GHG emissions",
    "average_perc_green_skills_scaled": "Average percentage of green skills"
}

static_columns = list(set(geo_df.columns).difference(set(filter_by_cols)))

merged_data = pd.DataFrame()
for col_name in filter_by_cols:
    c_df = geo_df[static_columns + [col_name]]
    c_df.rename(columns = {col_name: "value"}, inplace=True)
    c_df["value_orig"] = c_df[col_name.split("_scaled")[0]]
    c_df['variable'] = [rename_variable.get(col_name,col_name)]*len(c_df)

    merged_data = pd.concat([merged_data, c_df])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [9]:
len(merged_data)

624

## Plot

In [10]:
merged_data['itl_type'] = merged_data['itl_type'].map({'itl_1_code': 'ITL 1', 'itl_2_code': 'ITL 2', 'itl_3_code': 'ITL 3'})

In [11]:
merged_data["top_3_sics_names"] = merged_data["top_5_sics"
].apply(lambda x: ", ".join(['"' + s["sic_name"]+ '"' for s in ast.literal_eval(x)[0:3]]))
merged_data["top_3_green_skills_names"] = merged_data[
    "top_5_green_skills"
].apply(lambda x: ", ".join(['"' + s["skill_name"]+ '"' for s in ast.literal_eval(x)[0:3]]))
merged_data["top_3_socs_names"] = merged_data[
    "top_5_socs"
].apply(lambda x: ", ".join(['"' + clean_soc_name(s["soc_name"])+ '"' for s in ast.literal_eval(x)[0:3]]))

In [138]:
select_box_itl = alt.binding_select(options=list(merged_data['itl_type'].unique()), name="Select ITL granularity: ")
selection_itl = alt.selection_point(value="ITL 2", fields=['itl_type'], bind=select_box_itl)

select_box_type = alt.binding_select(options=list(merged_data['variable'].unique()), name="Select type of measure: ")
selection_type = alt.selection_point(value="Average percentage of green skills", fields=['variable'], bind=select_box_type)

regional_measures_plot = alt.Chart(merged_data).mark_geoshape().encode(
    color=alt.Color(
        'value', title="",
        scale=alt.Scale(
            # range=["saddlebrown", NESTA_COLOURS_DICT["yellow"], "limegreen", 'green'],
            scheme='goldgreen',reverse=False
            ),
        legend=None
    ), 
    tooltip=[
                alt.Tooltip("itl_name", title="Region"),
                alt.Tooltip("num_job_ads", title="Number of job adverts"),
        alt.Tooltip("average_prop_occ_green_timeshare",
                    title="Average % of time spent on green tasks for occupations in this region", format=".2%"),
        alt.Tooltip("average_prop_green_skills",title="Average % of green skills for adverts in this region", format=".2%"),
        alt.Tooltip("average_ind_perunit_ghg", title="Average GHG emissions for industries in this region", format=",.3f"),
        alt.Tooltip("top_3_green_skills_names", title="Top 3 green skills"),
        alt.Tooltip("top_3_sics_names", title="Top 3 industries"),
        alt.Tooltip("top_3_socs_names", title="Top 3 occupations"),
        
            ]
).add_selection(
    selection_itl
).transform_filter(
    selection_itl
).add_selection(
    selection_type
).transform_filter(
    selection_type
)#.resolve_scale(color='independent',size='independent')

### Have a top 3 and bottom 3 appear next to the map in a list for each measure


In [139]:
top_merged_data = merged_data.groupby(['variable', 'itl_type']).apply(
    lambda x : x.sort_values(by = 'value', ascending = False).head(3)).reset_index(drop=True)
top_merged_data["top_bottom"] = 0
top_merged_data["ngroup"] =top_merged_data.groupby(['variable', 'itl_type'])['value'].rank()
title_rows = top_merged_data[['variable', 'itl_type']].value_counts().reset_index()
title_rows["top_bottom"] = 0
title_rows["ngroup"] = 4
title_rows["itl_name"] = "Regions with most green values"
top_merged_data = pd.concat([top_merged_data,title_rows])


bottom_merged_data = merged_data.groupby(['variable', 'itl_type']).apply(
    lambda x : x.sort_values(by = 'value', ascending = False).tail(3)).reset_index(drop=True)
bottom_merged_data["top_bottom"] = 1
bottom_merged_data["ngroup"] =bottom_merged_data.groupby(['variable', 'itl_type'])['value'].rank()
title_rows = bottom_merged_data[['variable', 'itl_type']].value_counts().reset_index()
title_rows["top_bottom"] = 1
title_rows["ngroup"] = 4
title_rows["itl_name"] = "Regions with least green values"

bottom_merged_data = pd.concat([bottom_merged_data,title_rows])

top_bottom_merged_data = pd.concat([top_merged_data,bottom_merged_data]).reset_index()
len(top_bottom_merged_data)

72

In [140]:
top_bottom_merged_data['dummy_bar_length'] = 1
top_merged_data['dummy_bar_length'] = 1
bottom_merged_data['dummy_bar_length'] = 1

In [141]:


# top_regions_measures_plot = alt.Chart(
#     top_bottom_merged_data[['value', 'itl_name', 'num_job_ads',
#                             'itl_type', 'variable',
#                             'average_occ_green_timeshare','average_prop_green_skills',
#                             'average_ind_perunit_ghg', 'dummy_bar_length']
#     ].sort_values(by='value', ascending=False),
#     title=["","","Highest and lowest", "values for this measure"]
# ).mark_circle(size=100).encode(
#     color=alt.Color(
#         'value', title="",
#         scale=alt.Scale(scheme='goldgreen',reverse=False),
#         legend=None
#     ), 
#     y=alt.Y('itl_name',title="", axis=alt.Axis(title=None, labelLimit=1000, tickSize=0, domain=False),
#            sort=alt.EncodingSortField(field="value", order="descending")),
#     # x=alt.X('num_job_ads',sort=None, title="Number of job adverts", scale=alt.Scale(type='log')),
#     x=alt.X('dummy_bar_length',sort=None, title="",axis=alt.Axis(
#         labels=False, tickSize=0, domain=False, domainWidth=0.5, grid=False)).scale(zero=False, paddingOuter=0),
#     tooltip=[
#                 alt.Tooltip("itl_name", title="Region"),
#                 alt.Tooltip("num_job_ads", title="Number of job adverts"),
#         alt.Tooltip("average_occ_green_timeshare", title="Average % of time spent on green tasks for occupations in this region", format=",.2%"),
#         alt.Tooltip("average_prop_green_skills",title="Average % of green skills for adverts in this region", format=",.2%"),
#         alt.Tooltip("average_ind_perunit_ghg", title="Average per unit GHG emissions for industries in this region", format=",.4f"),
#                 # alt.Tooltip("value", title="Greenness measure", format=",.4f"),
        
#             ]
# ).add_selection(
#     selection_itl
# ).transform_filter(
#     selection_itl
# ).add_selection(
#     selection_type
# ).transform_filter(
#     selection_type
# )

In [142]:
text_chart = (
    alt.Chart(top_bottom_merged_data[['value', 'itl_name', 'num_job_ads',
                            'itl_type', 'variable', 'top_bottom', 'ngroup',
                                     'average_prop_occ_green_timeshare',
                                      'average_occ_green_timeshare',
                                      'average_perc_green_skills',
                                      'average_prop_green_skills',
                                     'average_ind_perunit_ghg']], title="")
    .mark_text(align="left", baseline="middle", dx=10, color="black", font="Century Gothic",)
    .encode(
        x=alt.X("top_bottom", axis=alt.Axis(labels=False, grid=False, domain=False, tickSize=0,domainWidth=0.5), title=""),
        y=alt.Y("ngroup", axis=alt.Axis(labels=False, grid=False, domain=False,tickSize=0,domainWidth=0.5), title=""),
        text=alt.Text("itl_name"),
        color=alt.condition(
            alt.datum.ngroup > 3,
            alt.value("black"),
            alt.value("#3b3b3b"),# almost black
        ),
        size=alt.condition(
            alt.datum.ngroup > 3,
            alt.value(14),
            alt.value(12),
        ),
        # tooltip=[alt.Tooltip("itl_name", title=""),
        #          alt.Tooltip("average_occ_green_timeshare", title="Average % of time spent on green tasks for occupations in this region", format=",.2%"),
        # alt.Tooltip("average_prop_green_skills",title="Average % of green skills for adverts in this region", format=",.2%"),
        # alt.Tooltip("average_ind_perunit_ghg", title="Average per unit GHG emissions for industries in this region", format=",.4f"),
       
        #         ],
    )
).add_selection(
    selection_itl
).transform_filter(
    selection_itl
).add_selection(
    selection_type
).transform_filter(
    selection_type
)

In [143]:
regional_variation_plot = alt.vconcat(
    regional_measures_plot.properties(width='container', height=500),
    text_chart.properties(width='container')
).resolve_scale(color='independent',size='independent')

regional_variation_plot_config = configure_plots(
    regional_variation_plot,chart_title='Greenness measures per region',
                chart_subtitle=[
                    "Select a greenness measure and a level of regional granularity, and compare which places have the",
                    "highest and lowest average values for this measure across job adverts in this region.",
                    ]
).configure(autosize="fit-x").configure_view(strokeWidth=0)
regional_variation_plot_config

In [144]:
regional_variation_plot_config.save(f'{graph_dir}/green_measures_chloropleth.html')

In [145]:
regional_variation_plot_config_no_title = configure_plots(
    regional_variation_plot,
).configure(autosize="fit-x").configure_view(strokeWidth=0)

regional_variation_plot_config_no_title.save(f'{graph_dir}/green_measures_chloropleth_no_title.html')

## Version with the bar chart not text

In [146]:
bar_top_merged_data = merged_data.groupby(['variable', 'itl_type']).apply(
    lambda x : x.sort_values(by = 'value_orig', ascending = False).head(3)).reset_index(drop=True)
bar_top_merged_data["top_bottom"] = 0
bar_top_merged_data["top_bottom_name"] ="Highest"
bar_top_merged_data["ngroup"] = bar_top_merged_data.groupby(['variable', 'itl_type'])['value_orig'].rank()

bar_bottom_merged_data = merged_data.groupby(['variable', 'itl_type']).apply(
    lambda x : x.sort_values(by = 'value_orig', ascending = False).tail(3)).reset_index(drop=True)
bar_bottom_merged_data["top_bottom"] = 1
bar_bottom_merged_data["top_bottom_name"] = "Lowest"
bar_bottom_merged_data["ngroup"] =bar_bottom_merged_data.groupby(['variable', 'itl_type'])['value_orig'].rank()

bar_top_bottom_merged_data = pd.concat([bar_top_merged_data,bar_bottom_merged_data]).reset_index()
len(bar_top_bottom_merged_data)

54

In [108]:
# col_min = bar_top_bottom_merged_data.groupby(['variable', 'itl_type'])['value_orig'].min().reset_index().rename(columns={'value_orig': 'min_value_orig'})
# bar_top_bottom_merged_data = pd.merge(bar_top_bottom_merged_data, col_min)
# col_max = bar_top_bottom_merged_data.groupby(['variable', 'itl_type'])['value_orig'].max().reset_index().rename(columns={'value_orig': 'max_value_orig'})
# bar_top_bottom_merged_data = pd.merge(bar_top_bottom_merged_data, col_max)

In [147]:
top_regions_measures_plot = alt.Chart(
    bar_top_bottom_merged_data[['value_orig', 'value', 'itl_name', 'num_job_ads', 'itl_type', 'variable', 'top_bottom', 'top_bottom_name']].sort_values(
        by='value_orig', ascending=False),
    title=["","","Highest and lowest values for this measure"]
).mark_bar(strokeWidth=5, stroke="black", strokeOpacity=0.001).encode(
    color=alt.Color(
        'value', title="",
        scale=alt.Scale(
            # range=["saddlebrown", NESTA_COLOURS_DICT["yellow"], "limegreen", 'green'],
            scheme='goldgreen',reverse=False
        ),
        legend=None
    ), 
    y=alt.Y('itl_name',title="", axis=alt.Axis(title=None, labelLimit=200, tickSize=0, domain=False),
           sort=alt.EncodingSortField(field="value_orig", order="descending")),
    x=alt.X('value_orig',sort=None, title="",axis=alt.Axis(tickSize=0, domain=False)),
    # row=alt.Row('top_bottom_name', title=""), # unfortunarly this wont work with autosizing https://github.com/vega/vega-lite/issues/5783
    # facet=alt.Facet('top_bottom_name', columns=1),
    tooltip=[
                alt.Tooltip("itl_name", title="Region"),
                alt.Tooltip("num_job_ads", title="Number of job adverts"),
                alt.Tooltip("value_orig", title="Greenness measure", format=",.4f"),
        
            ]
).resolve_scale(
    y='independent'
).add_selection(
    selection_itl
).transform_filter(
    selection_itl
).add_selection(
    selection_type
).transform_filter(
    selection_type
)

In [148]:
regional_variation_plot_bar = alt.vconcat(
    regional_measures_plot.properties(width='container', height=500),
    top_regions_measures_plot.properties(width='container'),
).resolve_scale(color='independent',size='independent')

regional_variation_plot_bar_config = configure_plots(
    regional_variation_plot_bar.configure(autosize="fit-x"),
    fontsize_normal=16,
    fontsize_title=18,
    chart_title="Average green measure values across regions"
).configure_view(strokeWidth=0)

regional_variation_plot_bar_config.save(f'{graph_dir}/green_measures_chloropleth_bar.html')

In [149]:
regional_variation_plot_bar_config