# Chloropleth plot
Average greenness by region


In [79]:
from dap_prinz_green_jobs.analysis.ojo_analysis.process_ojo_green_measures import *
from dap_prinz_green_jobs.getters.ojo_getters import (
	get_mixed_ojo_location_sample, get_mixed_ojo_salaries_sample
)
from dap_prinz_green_jobs.getters.industry_getters import load_sic
from dap_prinz_green_jobs import BUCKET_NAME, logger, PROJECT_DIR
from dap_prinz_green_jobs.getters.data_getters import load_s3_data, save_to_s3
from dap_prinz_green_jobs.utils.chloropleth_utils import get_nuts2polygons_dict, get_nuts1polygons_dict, get_nuts3polygons_dict

from dap_prinz_green_jobs.utils.plotting import configure_plots

import altair as alt
import geopandas as gpd

from datetime import datetime

import os

In [2]:
#save graphs
today = datetime.today().strftime('%y%m%d')
graph_dir = str(PROJECT_DIR / f"outputs/figures/green_jobs_explorer/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

/Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/green_jobs_explorer/231215 directory already exists


## Load the aggregated by region dataset

In [3]:
agg_itl_by = "itl_3_code"
date_stamp = "20231214"
itl_aggregated_data = load_s3_data(BUCKET_NAME, f"outputs/data/ojo_application/extracted_green_measures/analysis/{agg_itl_by}_aggregated_data_{date_stamp}.csv")

2023-12-15 13:23:25,817 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2023-12-15 13:23:26,129 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [4]:
itl_aggregated_data['num_job_ads'].sum()

963316

## Get additional geometry data needed for chloropleth

In [57]:
nuts1polygons_dict = get_nuts1polygons_dict()
itl1polygons_dict = {k.replace("UK","TL"):v for k, v in nuts1polygons_dict.items()}

nuts2polygons_dict = get_nuts2polygons_dict()
itl2polygons_dict = {k.replace("UK","TL"):v for k, v in nuts2polygons_dict.items()}

nuts3polygons_dict = get_nuts3polygons_dict()
itl3polygons_dict = {k.replace("UK","TL"):v for k, v in nuts3polygons_dict.items()}

allpolygons_dict = {**itl1polygons_dict, **itl2polygons_dict, **itl3polygons_dict}

In [72]:
if agg_itl_by == "itl_3_code":
    itl_aggregated_data['geometry_name'] = itl_aggregated_data[agg_itl_by].map(allpolygons_dict)
    itl_aggregated_data[['geometry', 'itl_name']] = itl_aggregated_data['geometry_name'].apply(lambda x: pd.Series(x))
    itl_aggregated_data.drop('geometry_name', axis=1, inplace=True)
else:
    itl_aggregated_data['geometry'] = itl_aggregated_data[agg_itl_by].apply(lambda x: itl3polygons_dict.get(x, itl1polygons_dict.get(x))[0])
    itl_aggregated_data['itl_name'] = itl_aggregated_data[agg_itl_by].apply(lambda x: itl3polygons_dict.get(x, itl1polygons_dict.get(x))[1])

geo_df = gpd.GeoDataFrame(itl_aggregated_data)

## Normalise and numerate greenness measures

In [73]:
geo_df["Relative occupation greenness"] = geo_df["occ_greenness"].map({"low":0,"mid":1,"high":2})/2
geo_df["Relative industry greenness"] = geo_df["ind_greenness"].map({"low":0,"mid":1,"high":2})/2
geo_df["Relative skills greenness"] = geo_df["skills_greenness"].map({"low":0,"mid":1,"high":2})/2
geo_df["Relative overall greenness"] = geo_df["greenness_score"].map({"low":0,"low-mid":1,"mid-high":2,"high":3})/3

## Get data in format to filter in viz

In [74]:
static_columns = ['itl_name', 'geometry', 'num_job_ads',
                  'average_occ_green_timeshare', 'average_prop_green_skills', 'average_ind_perunit_ghg',
                  'top_3_sics_names', 'top_3_green_skills_names', 'top_3_socs_names',
                 'occ_greenness', 'ind_greenness', 'skills_greenness','greenness_score']

merged_data = pd.DataFrame()
for col_name in ["Relative occupation greenness", "Relative industry greenness", "Relative skills greenness", "Relative overall greenness"]:
    c_df = geo_df[static_columns + [col_name]]
    c_df.rename(columns = {col_name: "value"}, inplace=True)
    c_df['variable'] = [col_name]*len(c_df)

    merged_data = pd.concat([merged_data, c_df])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


## Plot

In [75]:
select_box = alt.binding_select(options=list(merged_data['variable'].unique()), name="Regional comparison measure ")
selection = alt.selection_point(value="Relative overall greenness", fields=['variable'], bind=select_box)

regional_measures_plot = alt.Chart(merged_data).mark_geoshape().encode(
    color=alt.Color(
        'value', title="",
        # scale=alt.Scale(scheme='goldgreen',reverse=False),
        scale=alt.Scale(domain=[0,0.5,1],range=['#5a443aff', '#39834cff','#06e43eff']), # ['#3a5240ff', '#06e43eff']
        legend=alt.Legend(title='Relative greenness'),
        # legend=None
    ), 
    tooltip=[
                alt.Tooltip("itl_name", title="Region"),
                alt.Tooltip("num_job_ads", title="Number of job adverts"),
        alt.Tooltip("greenness_score", title="Relative overall greenness"),
        alt.Tooltip("occ_greenness", title="Relative occupational greenness"),
        alt.Tooltip("skills_greenness",title="Relative skills greenness"),
        alt.Tooltip("ind_greenness", title="Relative industry greenness"),
        alt.Tooltip("top_3_green_skills_names", title="Top 3 green skills"),
        alt.Tooltip("top_3_sics_names", title="Top 3 industries"),
        alt.Tooltip("top_3_socs_names", title="Top 3 occupations"),
        
            ]
).add_selection(
    selection
).transform_filter(
    selection
)



In [76]:
regional_measures_plot_config = configure_plots(regional_measures_plot, 
                chart_title='Average relative greenness measure per region',
                chart_subtitle='Select a greenness measure and see which regions have the highest values for it.'
                                               ).properties(width=600, height=400)

In [77]:
regional_measures_plot_config

In [78]:
regional_measures_plot_config.save(f'{graph_dir}/green_measures_{agg_itl_by}_chloropleth.html')