📓👋 Welcome to the skills evaluation notebook.

The key aim of the notebook is to evaluate the performance of the SkillMeasures overall. 

In [24]:
from dap_prinz_green_jobs import PROJECT_DIR

import altair as alt
import pandas as pd

import os
from datetime import datetime
import numpy as np

In [25]:
# global variables, settings, create directory for the whole notebook

alt.data_transformers.disable_max_rows()

today = datetime.today().strftime('%y%m%d')
graph_dir = str(PROJECT_DIR / f"outputs/figures/evaluation/skills/{today}/")

if not os.path.exists(graph_dir):
    print(f"Creating {graph_dir} directory")
    os.makedirs(graph_dir)
else:
    print(f"{graph_dir} directory already exists")

/Users/india.kerlenesta/Projects/dap_green_jobs/dap_prinz_green_jobs/outputs/figures/evaluation/skills/231016 directory already exists


### 💾 0. Load data

- load labelled skills data
- clean up data

In [35]:
def convert_labels(label: float) -> str:
    #if its a float AND and integer, convert to int and then string
    if isinstance(label, float) and label.is_integer():
        return str(int(label))
    else:
        return label

In [218]:
#Load and clean the labelled data
skill_mapper = {'0': 'skill entity too bad to score',
                '1': 'poor',
                '2': 'ok',
                '3': 'excellent'}

skills_evaluation_raw = pd.read_csv('s3://prinz-green-jobs/outputs/data/labelled_job_adverts/evaluation/skills/skill_evaluation_sample_labelled.csv')

skills_evaluation_clean = (skills_evaluation_raw
                     .rename(columns={'is_good_skill_entity (1-poor, 2-ok, 3-excellent)': 'is_good_skill_entity',
                                  'is_good_green_esco (0-skill entity too bad to score, 1-poor, 2-ok, 3-excellent) if none - then score according to whether it should have been green/blank': 'is_good_green_esco',
                                  'is_good_all_esco (0-skill entity too bad to score, 1-poor, 2-ok, 3-excellent)': 'is_good_all_esco'})
                     #clean up values
                     .assign(job_id = lambda x: x.job_id.astype(str),
                             is_good_skill_entity = lambda x: x.is_good_skill_entity.apply(convert_labels),
                             is_good_green_esco = lambda x: x.is_good_green_esco.apply(convert_labels),
                             is_good_all_esco = lambda x: x.is_good_all_esco.apply(convert_labels),
                             is_good_skill_entity_label = lambda x: x.is_good_skill_entity.map(skill_mapper),
                             is_good_green_esco_label = lambda x: x.is_good_green_esco.map(skill_mapper),
                             is_good_all_esco_label = lambda x: x.is_good_all_esco.map(skill_mapper))
                     .dropna(subset=['is_good_skill_entity'])
                     .drop(columns=['Unnamed: 13'])
                     .reset_index(drop=True))

### 🤔 1. Labelled evaluation analysis 

- print overall observations
- generate label distribution plots
- generate plots about the relationship between green skill probability and skill quality

In [219]:
print(f'skills were extracted for {skills_evaluation_clean.job_id.nunique()} job ids')
print('')
print(f"{skills_evaluation_clean.shape[0]} skills were labelled")
print('')
print(f'there are {skills_evaluation_clean.skill_label.nunique()} unique extracted skills')
print('')
print(f'there are {skills_evaluation_clean.esco_skill.nunique()} unique mapped all ESCO skills')
print('')
print(f'there are {skills_evaluation_clean.green_esco_skill.nunique()} unique green ESCO skills')
print('')
print(f"the average extracted skill entity score is: {round(skills_evaluation_clean.is_good_skill_entity.astype(int).mean(),2)}")
print('')
print('the number of skills that were labelled as 0 (skill entity too bad to score) for green esco matches:')
print(len(skills_evaluation_clean.query('is_good_green_esco == "0"')))
print('')
print('the number of skills that were labelled as 0 (skill entity too bad to score) for all esco matches:')
print(len(skills_evaluation_clean.query('is_good_all_esco == "0"')))
print('')
skills_evaluation_clean_no0 = skills_evaluation_clean.query('is_good_green_esco != "0"')
print(f"the average green escoe score is: {round(skills_evaluation_clean_no0.query('~is_good_green_esco.isna()').is_good_green_esco.astype(int).mean(),2)}")
print('')
skills_evaluation_clean_no0 = skills_evaluation_clean.query('is_good_all_esco != "0"')
print(f"the average all escoe score is: {round(skills_evaluation_clean.query('~is_good_all_esco.isna()').is_good_all_esco.astype(int).mean(),2)}")

skills were extracted for 500 job ids

514 skills were labelled

there are 494 unique extracted skills

there are 309 unique mapped all ESCO skills

there are 182 unique green ESCO skills

the average extracted skill entity score is: 2.6

the number of skills that were labelled as 0 (skill entity too bad to score) for green esco matches:
1

the number of skills that were labelled as 0 (skill entity too bad to score) for all esco matches:
3

the average green escoe score is: 2.44

the average all escoe score is: 2.26


In [220]:
#DISTRIBUTION GRAPHS

is_green_df = skills_evaluation_clean.is_green.value_counts().reset_index().rename(columns={'index': 'is_green', 'is_green': 'count'})
is_green_chart = alt.Chart(is_green_df).mark_bar().encode(
    y=alt.Y('is_green:N', title='Is the skill green?'),
    x=alt.X('count:Q', title='Count'),
    color=alt.Color('is_green:N', title='Is the skill green?', legend=None)).properties(
        title={'text':'Distribution of green skills in the labelled data'})
    
is_good_skill_entity_df = (skills_evaluation_clean
                           .is_good_skill_entity
                           .value_counts()
                           .reset_index()
                           .rename(columns={'index': 'is_good_skill_entity', 'is_good_skill_entity': 'count'})
                           .assign(is_good_skill_entity_label = lambda x: x.is_good_skill_entity.map(skill_mapper)))

good_skill_entity_percent = round(is_good_skill_entity_df.query('is_good_skill_entity != "1"')['count'].sum()/is_good_skill_entity_df['count'].sum(),2)*100
is_good_skill_entity_chart = alt.Chart(is_good_skill_entity_df).mark_bar().encode(
    x=alt.X('count:Q', title='Count'),
    y=alt.Y('is_good_skill_entity_label:N', title='Is the extracted skill good?'),
    color=alt.condition(alt.datum.is_good_skill_entity > '1', alt.value('green'), alt.value('red'))).properties(
        title={'text': ['Distribution of extracted skill quality in the labelled data'],
               'subtitle': [f'{good_skill_entity_percent}% of extracted skills are ok or good.']})
    
    
is_good_green_esco_df = (skills_evaluation_clean
                           .is_good_green_esco
                           .value_counts()
                           .reset_index()
                           .rename(columns={'index': 'is_good_green_esco', 'is_good_green_esco': 'count'})
                           .assign(is_good_green_esco_label = lambda x: x.is_good_green_esco.map(skill_mapper))
                           .query('is_good_green_esco != "0"'))

good_green_esco_percent = round(is_good_green_esco_df.query('is_good_green_esco != "1"')['count'].sum()/is_good_green_esco_df['count'].sum(),2)*100
is_good_green_esco_chart = alt.Chart(is_good_green_esco_df).mark_bar().encode(
    x=alt.X('count:Q', title='Count'),
    y=alt.Y('is_good_green_esco_label:N', title='Is mapped skill good?'),
    color=alt.condition(alt.datum.is_good_green_esco != '1', alt.value('green'), alt.value('red'))).properties(
        title={'text': ['Distribution of mapped green ESCO skill in the labelled data'],
               'subtitle': [f'{good_green_esco_percent}% of extracted skills are ok or good.']})
    
is_good_all_esco_df = (skills_evaluation_clean
                           .is_good_all_esco
                           .value_counts()
                           .reset_index()
                           .rename(columns={'index': 'is_good_all_esco', 'is_good_all_esco': 'count'})
                           .assign(is_good_all_esco_label = lambda x: x.is_good_all_esco.map(skill_mapper))
                           .query('is_good_all_esco != "0"'))

good_all_esco_percent = round(is_good_all_esco_df.query('is_good_all_esco != "1"')['count'].sum()/is_good_all_esco_df['count'].sum(),2)*100
is_good_all_esco_chart = alt.Chart(is_good_all_esco_df).mark_bar().encode(
    x=alt.X('count:Q', title='Count'),
    y=alt.Y('is_good_all_esco_label:N', title='Is mapped skill good?'),
    color=alt.condition(alt.datum.is_good_all_esco != '1', alt.value('green'), alt.value('red'))).properties(
        title={'text': ['Distribution of mapped all ESCO skill in the labelled data'],
               'subtitle': [f'{good_all_esco_percent}% of extracted skills are ok or good.']})
    
skill_dist_graphs = (is_green_chart & (is_good_skill_entity_chart | is_good_green_esco_chart | is_good_all_esco_chart)).properties(title={'text': 'Labelled Skill Distribution Graphs', 'subtitle': ["", ""]})

skill_dist_graphs.save(f'{graph_dir}/skill_dist_graphs.html')

In [222]:
#RELATIONSHIP GRAPHS

#correlation matrix
skills_evaluation_clean = skills_evaluation_clean.dropna(subset=['is_good_green_esco'])

skills_evaluation_clean['is_good_skill_entity'] = skills_evaluation_clean.is_good_skill_entity.astype(int)
skills_evaluation_clean['is_good_green_esco'] = skills_evaluation_clean.is_good_green_esco.astype(int)

corr_df = (skills_evaluation_clean
           [['green_prob', 
                         'green_esco_skill_prob', 
                         'is_good_skill_entity', 
                         'is_good_green_esco']]
           .corr()
           .stack()
           .reset_index()
           .rename(columns={'level_0': 'feature_1', 'level_1': 'feature_2', 0: 'correlation'}))
corr_df['correlation_label'] = corr_df['correlation'].map('{:.2f}'.format)  # Round to 2 decimal
base = alt.Chart(corr_df).encode(
    x=alt.X('feature_1:O', title="Feature 1"),
    y=alt.Y('feature_2:O', title="Feature 2")   
)
# Text layer with correlation labels
# Colors are for easier readability
text = base.mark_text().encode(
    text='correlation_label',
    color=alt.condition(
        alt.datum.correlation < 0.5, 
        alt.value('black'),
        alt.value('white')
    )
)

# The correlation heatmap itself
cor_plot = base.mark_rect().encode(
    color='correlation:Q'
).properties(width=400, height=400)

cor_text_plot = cor_plot + text

#Relationship between green probability, esco skill probability and match quality
esco_prob_chart = alt.Chart(skills_evaluation_clean.query('is_good_green_esco != 0')).mark_point().encode(
    x=alt.X('green_prob:Q', title='Green Skill Classifier probability'),
    y=alt.Y('esco_skill_prob:Q', title='Green ESCO semantic similarity'),
    color=alt.Color('is_good_green_esco_label:N', title='Is mapped skill good?'),
    tooltip=['skill_label', 'green_esco_skill', 'green_prob', 'esco_skill_prob', 'is_good_green_esco']).properties(
        title={'text': ['Relationship between green skill classifier probability and ESCO semantic similarity']}
    )
    
(cor_text_plot | esco_prob_chart).properties(
        title={'text': ['Relationship graphs'],
               'subtitle': ["", ""]}
    ).save(f'{graph_dir}/relationship_graphs.html')