# Analysis of duration and SOC x skill cluster crosswalk

The goal is to a) get some basic statistics on the distribution of the duration of job adverts (this creates one of the figures in the report); b) assess whether there are any differences in duration across occupations, industries, skill categories and job titles; c) show some examples of skill categories needed for selected occupations.


# Imports and definitions

In [None]:
# ------------------------ DEPENDENCIES AND FUNCTIONS ------------------------
from collections import Counter
from copy import deepcopy
import datetime
import gzip
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pickle
from pprint import PrettyPrinter
import seaborn as sns
import sys
from scipy.special import gammaln
import json

#sys.path.append('/Users/stefgarasto/Google Drive/Documents/scripts/utils_stef')
#from utils_skills_clusters import taxonomy_2_0
sys.path.append("/Users/stefgarasto/Local-Data/scripts/skill_demand_escoe/skill_demand/skill_demand")
from utils.utils_general import TaskTimer, print_elapsed, nesta_colours, sic_letter_to_text, flatten_lol, printdf, socnames_dict
from utils.textkernel_load_utils import tk_params, DATA_PATH, create_tk_import_dict, read_and_append_chunks

'''
#from textkernel_utils import *
sys.path.append("/Users/stefgarasto/Local-Data/scripts/skill_demand_escoe/skill_demand/skill_demand")

from utils.textkernel_load_utils import tk_params, data_folder, \
        create_tk_import_dict, read_and_append_chunks
from time import time as tt
#from tqdm import tqdm
#import torch
#import torch.nn as nn
from utils_skills_clusters import taxonomy_2_0
now = datetime.datetime.now()
sys.path.append('/Users/stefgarasto/Google Drive/Documents/scripts/utils_stef')
from utils.utils_general import TaskTimer, print_elapsed, nesta_colours, \
            sic_letter_to_text, flatten_lol, printdf, socnames_dict
'''
pp = PrettyPrinter(indent=4)


timer = TaskTimer()
print('Done')


In [None]:
from importlib import reload
from flow_to_stock.flow_to_stock_funcs import load_ons_vacancies
#from flow_to_stock_funcs import load_ons_vacancies


In [None]:
# SIC mapping between text and letters

# Extend the imported letter -> description lookup with the merged / custom
# groups used in this analysis.
sic_letter_to_text.update({
    'Z': 'Others',
    'L_O_S': 'Personal and public services',  # including non-profit and estate agents
    'D_E': 'Utilities (energy, water and waste)',
    'M_P': 'Educational and professional activities',
    'uncertain': 'Uncertain',
})

# Inverse lookup (description -> letter). If two letters shared the same
# description, the one iterated last would win.
sic_text_to_letter = {
    description: code for code, description in sic_letter_to_text.items()
}

print('Done')


In [None]:
#DATA_PATH = Path(data_folder).parent
#assert(DATA_PATH == Path('/Volumes/ssd_data/textkernel/data'))


In [None]:
def resample_soc_to_n_digits(soc_code,n=3):
    """Truncate a single 4-digit SOC code to its first ``n`` digits.

    Keyword arguments:
    soc_code -- numeric SOC code; NaN is passed through as NaN
    n -- number of leading digits to keep (1, 2 or 3)

    Returns the truncated code as a float. Raises KeyError for any other
    value of ``n`` (unless ``soc_code`` is NaN).
    """
    if np.isnan(soc_code):
        return np.nan
    divisor = {1: 1000, 2: 100, 3: 10}[n]
    # remove the trailing digits, then rescale
    trailing_digits = soc_code % divisor
    return (soc_code - trailing_digits) / divisor

def resample_soc_to_n_digits_df(soc_code_df,n=3):
    """Truncate a whole column of 4-digit SOC codes to their first ``n`` digits.

    Keyword arguments:
    soc_code_df -- numeric SOC codes supporting element-wise ``%``, ``-`` and
                   ``/`` (e.g. a pandas Series); missing values stay missing
    n -- number of leading digits to keep (1, 2 or 3)

    Returns the truncated codes. Raises KeyError for any other value of ``n``.
    """
    divisor = {1: 1000, 2: 100, 3: 10}[n]
    without_trailing_digits = soc_code_df - soc_code_df % divisor
    return without_trailing_digits / divisor
    
    

In [None]:
#%% -------------------------------------------------------------------------
#              Main functions to convert from flow to stock
#% --------------------------------------------------------------------------

def set_month_at_beginning(x):
    """Roll a datetime back to the first day of its month.

    A datetime that already falls on the first day of a month is returned
    unchanged.
    """
    month_begin = pd.offsets.MonthBegin(0)
    return month_begin.rollback(x)

def set_month_at_end(x):
    """Roll a datetime forward to the last day of its month.

    A datetime that already falls on the last day of a month is returned
    unchanged.
    """
    month_end = pd.offsets.MonthEnd(0)
    return month_end.rollforward(x)


def get_daily_stock_breakdown(data, agg_func = 'sum', agg_col = 'vacancy_weight', 
                              breakdown_col = 'organization_industry_value', BOUNDARY = None):
    """Compute the daily and monthly stock of vacancies via cumulative sum of net flow.

    Keyword arguments:
    data -- dataframe with online job vacancies. Needs to have "date",
            "end_date", agg_col and breakdown_col columns
    agg_func -- 'sum' to sum agg_col per day; any other value counts the
            non-null entries of agg_col instead
    agg_col -- reference column to aggregate (usually column with per-vacancy weights)
    breakdown_col -- column name, or list of column names, by which the stock
            is broken down (they become the columns of the outputs)
    BOUNDARY -- what to do wrt boundary conditions: 'valid' drops the months
            before FIRST_VALID_MONTH from the monthly stock; any other value
            keeps every month

    Returns a tuple (monthly_stock, daily_stock, vacancy_flow_per_day,
    vacancy_remove_per_day). The last three are indexed by every day between
    the first and last posting date; monthly_stock is indexed by the first
    day of each month.
    """

    if not isinstance(breakdown_col,list):
        breakdown_col = [breakdown_col]

    # every calendar day between the first and the last posting date
    all_days = pd.date_range(start=data.date.min(), end=data.date.max(), freq='D')

    print(agg_func)
    how = 'sum' if agg_func == 'sum' else 'count'
    vacancy_flow_per_day = data.groupby(['date'] + breakdown_col)[agg_col].agg(how)
    vacancy_remove_per_day = data.groupby(['end_date'] + breakdown_col)[agg_col].agg(how)

    # move every breakdown level from the index to the columns so that the
    # index only contains the day
    for _ in breakdown_col:
        vacancy_flow_per_day = vacancy_flow_per_day.unstack()
        vacancy_remove_per_day = vacancy_remove_per_day.unstack()

    # shift vacancy_remove_per_day by one day since vacancies disappear the day
    # after their expiration date. The shift is applied to the index
    # (freq='D'): a positional shift(1) would move values to the next *row*,
    # which is only the next day when no end date is missing from the index.
    vacancy_remove_per_day = vacancy_remove_per_day.shift(1, freq='D')

    # adjust so that they start and end on the same dates
    vacancy_flow_per_day = vacancy_flow_per_day.reindex(all_days, fill_value=0)
    vacancy_remove_per_day = vacancy_remove_per_day.reindex(all_days, fill_value=0)

    # compute the net Flow
    net_flow = vacancy_flow_per_day.fillna(0) - vacancy_remove_per_day.fillna(0)

    # Get the daily stock
    daily_stock = net_flow.cumsum()

    # Resample to monthly stock. An offset object is used instead of the 'M'
    # string alias, which newer pandas versions deprecate in favour of 'ME'.
    # NOTE(review): the division by 2 is kept from the original code; its
    # rationale is not visible here - confirm it is intended.
    monthly_stock = net_flow.resample(pd.offsets.MonthEnd()).sum().cumsum()/2
    monthly_stock.index = monthly_stock.index.map(set_month_at_beginning)

    # enforce boundary conditions
    if BOUNDARY == 'valid':
        # NOTE(review): FIRST_VALID_MONTH is read from the module namespace and
        # is not defined in the visible part of this file - it must exist
        # before calling with BOUNDARY='valid'.
        first_valid = set_month_at_beginning(pd.to_datetime(FIRST_VALID_MONTH))
        monthly_stock = monthly_stock[monthly_stock.index >= first_valid]
    return monthly_stock, daily_stock, vacancy_flow_per_day, vacancy_remove_per_day


In [None]:
def gini(array):
    """Calculate the Gini coefficient of a numpy array.

    Keyword arguments:
    array -- array-like of numbers of any shape (it is flattened); integer
             input is accepted and the caller's data is never modified

    Returns the Gini coefficient as a float. If any value is negative, all
    values are first shifted so that the minimum is 0. Raises ValueError
    (from numpy) on empty input.
    """
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, so work on a flattened copy. The copy is
    # made in float: the in-place operations below would raise a casting
    # error on integer arrays.
    values = np.asarray(array, dtype=float).flatten()
    lowest = np.amin(values)
    if lowest < 0:
        # Values cannot be negative:
        values -= lowest
    # Values cannot be 0:
    values += 0.0000001
    # Values must be sorted:
    values = np.sort(values)
    # Number of array elements:
    n = values.shape[0]
    # Index per array element:
    index = np.arange(1, n + 1)
    # Gini coefficient:
    return np.sum((2 * index - n - 1) * values) / (n * np.sum(values))

# Load data

## Load dataframes

In [None]:
# Show the loading parameters for reference
print(tk_params)

# Names of the files to load: all file indices in random order, truncated to
# N_to_load (currently every file, so only the order is random)
N_to_load = tk_params.N_files
shuffled_indices = np.random.permutation(tk_params.N_files)
indices_to_load = shuffled_indices[:N_to_load]
raw_data_dir = f"{DATA_PATH}data"
dfilenames = []
for file_index in indices_to_load:
    file_name = tk_params.file_name_template.format(file_index)
    dfilenames.append(os.path.join(raw_data_dir, file_name))

# Import dictionary (passed as `dtype=` when reading the csv files further
# down) and the columns to parse as dates
import_dict, dates_to_parse = create_tk_import_dict()


In [None]:
import_dict['job_title'], dates_to_parse


In [None]:
# Add the dtypes of the derived columns stored in the interim files
import_dict.update({
    'final_sic_letter': 'category',
    'clean_organization_name': 'string',
    'duration_to_use': 'float64',
    'vacancy_weight': 'float64',
    'active_months': 'string',
    'best_month': 'string',
    'best_month_duration': 'float64',
    'ttwa11cd': 'category',
    'top_cluster': 'float64',
    'top_cluster_weighted': 'float64',
})



### Load data on vacancy lifecycle, per vacancy weights, sector and profession

job_id, posting_id, date, duration, profession, tk_industry, organization name (original and clean), sic, duration used, vacancy weight, active month, best month, best month duration


In [None]:
timer.start_task('Loading SIC + vacancy weights')
# NOTE: 'vacancy_weight' and 'best_month_duration' used to be part of this
# list but are currently not loaded.
columns_to_load = ['posting_id','date','duration_to_use','duration',
                   'clean_organization_name','profession_soc_code_value',
                   'final_sic_letter']
data_df = pd.read_csv(
    f"{DATA_PATH}data/interim/interim_job_id_and_vacancy_weights.gz",
    compression='gzip',
    encoding='utf-8',
    usecols=columns_to_load,
    dtype=import_dict,
    parse_dates=['date'],
)
timer.end_task()


In [None]:
data_df.dtypes


In [None]:
original_length_df = len(data_df)
print(original_length_df)
data_df.head(5)


In [None]:
# Last day on which each advert is active: the posting date counts as day 1,
# hence the "- 1" on the duration
days_after_posting = pd.to_timedelta(data_df.duration_to_use - 1, unit='D')
data_df['end_date'] = data_df.date + days_after_posting


### Load and join location data

job id, posting id, ttwa and region


In [None]:

timer.start_task('Load location data')
# TTWA and region per posting, read from the "first batch" file
# ("interim_location_data.gz" was the alternative, currently unused)
location_columns = ['posting_id','region_label','ttwa11cd']
data_to_add = pd.read_csv(
    f"{DATA_PATH}data/interim/job_id_and_ttwa11cd_first_batch.gz",
    compression='gzip',
    encoding='utf-8',
    dtype=import_dict,
    usecols=location_columns,
)
timer.end_task()


In [None]:
data_to_add.dtypes


In [None]:
len(data_to_add)


In [None]:
timer.start_task('Joining with location dataframe')
# left join: every advert is kept, adverts without location get missing values
data_df = pd.merge(data_df, data_to_add, how='left', on='posting_id')
timer.end_task()


In [None]:
# The left join must not have changed the number of adverts
n_rows_after_join = len(data_df)
assert n_rows_after_join == original_length_df
# adverts with a TTWA vs. rows in the location file
n_with_ttwa = data_df.ttwa11cd.notna().sum()
print(n_with_ttwa, len(data_to_add))


#### Load helpers for geospatial mapping

1. TTWA dictionary to go from code to name
2. TTWA statistics
3. TTWA shapefiles
4. Regions shapefiles
5. Country shapefiles


In [None]:
# Location of the ONS Travel To Work Area (TTWA) reference spreadsheets
ttwa_folder = ('/Users/stefgarasto/Local-Data/scripts/skill_demand_escoe/skill_demand/'
               'data/aux/ONS/Travel_to_Work_Areas_2011_guidance_and_information_V4')
ttwa_file_1 = 'TTWA-info-2016.xls'
ttwa_file_2 = 'TTWA-summary-statistics-2011.xls'

# Combine the two spreadsheets on TTWA code and name
ttwa_stats = pd.read_excel(f"{ttwa_folder}/{ttwa_file_1}")
tmp = pd.read_excel(f"{ttwa_folder}/{ttwa_file_2}")
ttwa_stats = pd.merge(ttwa_stats, tmp, how='left', on=['ttwa11cd','ttwa11nm'])

# Lookups between TTWA codes and names (both directions)
ttwa_code2name = {
    code: name for code, name in zip(ttwa_stats.ttwa11cd, ttwa_stats.ttwa11nm)
}
ttwa_name2code = {
    name: code for name, code in zip(ttwa_stats.ttwa11nm, ttwa_stats.ttwa11cd)
}


In [None]:
ttwa_name2code['London']


In [None]:
ttwa_stats.head()


In [None]:
london_code = 'E30000234'


In [None]:
# TTWAs whose ' Population' value (column name has a leading space) exceeds 60000
is_big_ttwa = ttwa_stats[' Population'] > 60000
biggest_ttwas = ttwa_stats.loc[is_big_ttwa, 'ttwa11cd'].to_list()
# check that London is among them
london_code in biggest_ttwas


### Load information on top cluster for each job advert

In [None]:
# Load all pieces with posting id, soc codes and top clusters
LOAD_TOP_CLUSTERS= True
if LOAD_TOP_CLUSTERS:
    # folder holding one gzipped csv per chunk of adverts
    top_clusters_dir = f"{DATA_PATH}data/interim/soc_by_clusters_no_soft_skills"
    all_cluster_files = sorted(
        t for t in os.listdir(top_clusters_dir) if 'best_clusters_with_soc' in t)
    print(len(all_cluster_files), all_cluster_files[:4])
    if not all_cluster_files:
        # fail with a clear message rather than with pandas'
        # "No objects to concatenate"
        raise FileNotFoundError(
            f"No 'best_clusters_with_soc' files found in {top_clusters_dir}")

    timer.start_task('Loading top clusters')
    # read every chunk and stack them into a single dataframe
    data_to_add = pd.concat([
        pd.read_csv(f"{top_clusters_dir}/{cluster_file}",
                    compression = 'gzip', encoding = 'utf-8',
                    usecols = ['posting_id','top_cluster','top_cluster_weighted'],
                    dtype= import_dict)
        for cluster_file in all_cluster_files
    ])
    timer.end_task()

    print(data_to_add.dtypes)


In [None]:
if LOAD_TOP_CLUSTERS:
    print(len(data_to_add))


In [None]:
if LOAD_TOP_CLUSTERS:
    # merge with main dataframe (left join: adverts without a top cluster
    # get missing values). The timer label used to say "location dataframe",
    # a leftover from the location join.
    timer.start_task('Joining top clusters with main dataframe')
    data_df = data_df.merge(data_to_add, 
                                on ='posting_id', how='left')
    timer.end_task()


### Load and join clean job titles

In [None]:
LOAD_JOB_TITLES= True
# ADD cols to load
if LOAD_JOB_TITLES:
    timer.start_task('Load cleaned job titles')
    # only the advert identifier and the processed title are needed
    title_columns = ['posting_id','job_title_processed']
    data_to_add = pd.read_csv(
        f"{DATA_PATH}data/interim/interim_job_id_and_cleaned_titles.gz",
        compression='gzip',
        encoding='utf-8',
        dtype=import_dict,
        usecols=title_columns,
    )
    timer.end_task()


In [None]:
if LOAD_JOB_TITLES:
    # the cleaned-titles file must have as many rows as the main dataframe
    n_title_rows = len(data_to_add)
    print(n_title_rows, original_length_df)
    assert n_title_rows == original_length_df


In [None]:
if LOAD_JOB_TITLES:
    timer.start_task('Joining cleaned job titles with main dataframe')
    # left join on the advert identifier
    data_df = pd.merge(data_df, data_to_add, how='left', on='posting_id')
    timer.end_task()


In [None]:
if LOAD_JOB_TITLES:
    # the join must not have changed the number of adverts
    assert len(data_df) == original_length_df
    # non-missing titles after the join vs. in the titles file
    titles_in_main = data_df.job_title_processed.notna().sum()
    titles_in_file = data_to_add.job_title_processed.notna().sum()
    print(titles_in_main, titles_in_file)


In [None]:
# Few more bits
# release memory: rebinding the name to a small placeholder drops this
# notebook's reference to the last auxiliary dataframe, so it can be
# garbage collected
data_to_add = 1


In [None]:
# Change dtype to make some computations faster
for column_name, target_dtype in [('profession_soc_code_value', 'float'),
                                  ('final_sic_letter', 'string'),
                                  ('ttwa11cd', 'string')]:
    data_df[column_name] = data_df[column_name].astype(target_dtype)


## Load SOC to skill cluster crosswalks


In [None]:
DATE_ID = 'July2020'
# NOTE: pickle.load can execute arbitrary code - only load crosswalk files
# that were produced by this project.
# NOTE(review): this path puts a "/" between DATA_PATH and "data", unlike the
# other paths in this notebook - confirm which form matches DATA_PATH.
with gzip.GzipFile(f"{DATA_PATH}/data/aux/final_all_crosswalks_soc_to_clusters_top_avg_{DATE_ID}.gz",'rb') as f:
    soc_by_clusters = pickle.load(f)

# make sure everything is a float and not an int
for crosswalks in soc_by_clusters.values():
    for k in crosswalks:
        crosswalks[k] = crosswalks[k] + 0.0

# normalise all crosswalk so that it sums up to 1 for each occupation
# (presumably each crosswalk is a dataframe with one column per occupation,
# as suggested by how they are indexed when printed later on - to confirm).
# The division creates a new object, so no explicit copy is needed and
# soc_by_clusters keeps the un-normalised values.
timer.start_task('normalising crosswalks')
soc_by_clusters_norm = {
    year: {k: crosswalk / crosswalk.sum() for k, crosswalk in crosswalks.items()}
    for year, crosswalks in soc_by_clusters.items()
}
timer.end_task()



### Load auxiliary datasets

In [None]:
# reload the esco list, including skills that have not been clustered
res_folder_local = '/Users/stefgarasto/Local-Data/textkernel/results/skills_matches'

# Load full clusters
esco_clusters_dir = f'{DATA_PATH}data/aux'
esco_clusters_file = os.path.join(
    esco_clusters_dir, 'ESCO_Essential_clusters_May2020_coreness.csv')
esco_clusters = pd.read_csv(esco_clusters_file)

# alternative labels are stored as one newline-separated string per skill:
# turn them into lists (empty list when the value is not a string)
esco_clusters['alt_labels'] = esco_clusters['alt_labels'].map(
    lambda raw: raw.split('\n') if isinstance(raw, str) else [])

# adjustments to the labels: (level column, cluster id, label column, new label)
label_adjustments = [
    ('level_2', 20.0, 'label_level_2', 'land transport (rail)'),
    ('level_3', 191.0, 'label_level_3', 'leather production (manufacturing)'),
    ('level_3', 190.0, 'label_level_3', 'footwear design'),
    ('level_3', 57.0, 'label_level_3', 'marketing (branding)'),
]
for level_column, cluster_id, label_column, new_label in label_adjustments:
    is_target_cluster = esco_clusters[level_column] == cluster_id
    esco_clusters.loc[is_target_cluster, label_column] = new_label

print('All ESCO skills: ', len(esco_clusters))
# skills with level_1 < 15 are the ones counted as valid here
n_valid_skills = int((esco_clusters.level_1 < 15).sum())
print('Valid ESCO skills: ', n_valid_skills)

print(esco_clusters.columns)



In [None]:
# load cluster labels (levels 2 and 3 are indexed by their cluster id)
cluster_labels_1 = pd.read_csv(f"{esco_clusters_dir}/ESCO_Essential_clusters_Level_1.csv")
cluster_labels_2 = pd.read_csv(
    f"{esco_clusters_dir}/ESCO_Essential_clusters_Level_2.csv").set_index('level_2')
cluster_labels_3 = pd.read_csv(
    f"{esco_clusters_dir}/ESCO_Essential_clusters_Level_3.csv").set_index('level_3')

# correct for same labels!!
for level_3_id, new_label in [(191, 'leather production (manufacturing)'),
                              (190, 'footwear design'),
                              (57, 'marketing (branding)')]:
    cluster_labels_3.loc[level_3_id, 'label'] = new_label
cluster_labels_2.loc[20, 'label'] = 'land transport (rail)'

cluster_labels_3.tail(12)



In [None]:
# build the crosswalks between levels
def _build_esco_crosswalks(clusters, first, second, third):
    """Build the lookups between three nested cluster columns.

    Keyword arguments:
    clusters -- dataframe with one row per skill
    first, second, third -- names of the columns holding the level 1, 2 and 3
                            identifiers (or labels) of each skill

    Returns a tuple of dictionaries (first_to_second, second_to_first,
    second_to_third, third_to_second, third_to_first, first_to_third).
    "x_to_y" maps towards finer levels give sorted lists of children; maps
    towards coarser levels give the single parent.
    """
    first_to_second = {}
    second_to_first = {}
    for parent, children in clusters.groupby(first)[second]:
        # sorted distinct non-missing children of this parent
        child_ids = sorted(children.value_counts().index.to_list())
        first_to_second[parent] = child_ids
        for child_id in child_ids:
            second_to_first[child_id] = parent

    second_to_third = {}
    third_to_second = {}
    third_to_first = {}
    for parent, children in clusters.groupby(second)[third]:
        child_ids = sorted(children.value_counts().index.to_list())
        second_to_third[parent] = child_ids
        for child_id in child_ids:
            third_to_second[child_id] = parent
            third_to_first[child_id] = second_to_first[parent]

    # chain the two finer-level maps to go from the first to the third level
    first_to_third = {
        parent: sorted(flatten_lol([second_to_third[child_id] for child_id in child_ids]))
        for parent, child_ids in first_to_second.items()
    }
    return (first_to_second, second_to_first, second_to_third,
            third_to_second, third_to_first, first_to_third)


# crosswalks between the numeric cluster ids
(esco_first_to_second, esco_second_to_first, esco_second_to_third,
 esco_third_to_second, esco_third_to_first, esco_first_to_third
 ) = _build_esco_crosswalks(esco_clusters, 'level_1', 'level_2', 'level_3')

print('done')

# same but with the labels
(esco_first_to_second_label, esco_second_to_first_label, esco_second_to_third_label,
 esco_third_to_second_label, esco_third_to_first_label, esco_first_to_third_label
 ) = _build_esco_crosswalks(esco_clusters, 'label_level_1', 'label_level_2',
                            'label_level_3')

print('done')


In [None]:
# Add the SOC code truncated to 1, 2 and 3 digits as separate columns
for soc_column, n_digits in [('profession_soc_code_1', 1),
                             ('profession_soc_code_2', 2),
                             ('profession_soc_code_3', 3)]:
    data_df[soc_column] = resample_soc_to_n_digits_df(
        data_df.profession_soc_code_value, n=n_digits)


# Analysis of SOC x skill category crosswalk

In [None]:
# print out the skill categories needed for selected occupations
level = 3
labels_to_use = {2: cluster_labels_2, 3: cluster_labels_3}[level]
# (SOC code, crosswalk to use): two 4-digit occupations and one 3-digit group
selected_occupations = [
    (2136.0, f'4d_by_level_{level}'),
    (2231.0, f'4d_by_level_{level}'),
    (231.0, f'3d_by_level_{level}'),
]
for soccode, k in selected_occupations:
    print(f"Breakdown at level {level} for occupation '{socnames_dict[soccode]}'")
    # replace the cluster ids in the index with their labels
    labelled_crosswalk = soc_by_clusters_norm['2019'][k].rename(
        lambda x: labels_to_use.loc[x].label)
    # ten largest shares for this occupation, as percentages
    top_shares = labelled_crosswalk[soccode].sort_values(ascending=False).head(10)
    print(top_shares*100)


# Analysis of duration

## Plot distribution of duration

In [None]:
SAVEFIG = False

In [None]:
# The median (and the +-7 days band around it) is computed on ALL durations...
all_durations = deepcopy(data_df.duration)
median_duration = all_durations.median()
duration_minus, duration_plus = median_duration - 7, median_duration + 7
# ...while only durations less than 55 days are kept for the histogram
duration_only = all_durations[all_durations < 55]


In [None]:
# Plot distribution of duration, with median and +-7 days around the median
with sns.plotting_context('talk'):
    sns.set_style('white')
    plt.figure(figsize = (9,5))
    plt.hist(duration_only, bins = 55, color = 'k')
    plt.xlim([0,55])
    plt.xlabel('Job advert duration (days)')
    plt.ylabel('Number of job adverts')
    # vertical markers: solid line at the median, dashed lines at +-7 days
    marker_height = [0,1450000]
    marker_style = dict(color= nesta_colours[3], linewidth = 3)
    plt.plot([median_duration,median_duration], marker_height, **marker_style)
    for band_edge in (duration_minus, duration_plus):
        plt.plot([band_edge,band_edge], marker_height, '--', **marker_style)
    plt.tight_layout()
    if SAVEFIG:
        # NOTE(review): output_folder is not defined in the visible part of
        # this notebook - it must be set before enabling SAVEFIG
        plt.savefig(f"{output_folder}/duration_histogram_with_median.png")
        plt.savefig(f"{output_folder}/duration_histogram_with_median.svg")
    

In [None]:
# how many job adverts within 7 days from the median?
m = data_df['duration'].median()
sd = 7  # fixed half-width of the band (the standard deviation is not used)

# share of adverts whose duration lies in [m - sd, m + sd], bounds included
A = data_df['duration'].between(m - sd, m + sd).mean()
print(f"Percentage of jobs with durations within 7 days from the median: {A*100:.2f}%")


## Analysis of the duration field against other variables

In [None]:
# subset of the dataset with durations less than 100, restricted to the
# columns used in the comparisons that follow
columns_for_comparison = [
    'date', 'duration','profession_soc_code_value',
    'final_sic_letter', 'region_label', 'ttwa11cd', 'top_cluster', 'job_title_processed',
    'profession_soc_code_1', 'profession_soc_code_2','profession_soc_code_3',
]
good_durations = data_df.loc[data_df.duration < 100, columns_for_comparison]



In [None]:
# duration distribution by 1-digit SOC code
sns.boxplot(x = 'profession_soc_code_1', y = 'duration', data= good_durations)


In [None]:
# duration distribution by region: one box per region, drawn on a wide figure
# with the seaborn "talk" context
f = plt.figure(figsize = (14,7))
with sns.plotting_context('talk'):
    sns.boxplot(x = 'region_label', y = 'duration',
            data= good_durations,
                ax = f.gca(),
               palette = sns.color_palette())
# NOTE(review): no `hue` is used in this plot, so there may be nothing to show
# in this legend - confirm whether the call is needed here
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)


In [None]:
# duration distribution by region and 1-digit SOC code:
# one group of boxes per SOC major group, one colour per region
f = plt.figure(figsize = (10,5))
with sns.plotting_context('talk'):
    sns.boxplot(
        data=good_durations,
        x='profession_soc_code_1',
        y='duration',
        hue='region_label',
        ax=f.gca(),
        palette=sns.color_palette(),
    )
# place the region legend outside the axes, to the right
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)


In [None]:
# duration distribution by sector (one box per SIC letter)
f = plt.figure(figsize = (15,7))
sector_axes = f.gca()
sns.boxplot(data=good_durations, x='final_sic_letter', y='duration', ax=sector_axes)


In [None]:
# Duration distribution by skill category
# The statistics keep their default column names ('mean', 'median', 'count'),
# which the code further down relies on. An earlier un-assigned
# `.rename(columns=...)` call had no effect and has been removed.
duration_by_cluster = good_durations.groupby('top_cluster').duration.agg(['mean','median','count'])

# only keep skill categories that are the top cluster of more than 5000 adverts
duration_by_cluster = duration_by_cluster[duration_by_cluster['count']>5000]

# show the 15 categories with the shortest durations (by mean, then by median),
# followed by the 15 with the longest durations
for ascending in (True, False):
    if not ascending:
        print('----------------------------------------------------')
    for statistic in ('mean', 'median'):
        ranked = duration_by_cluster.sort_values(statistic, ascending=ascending).head(15)
        # replace the cluster ids in the index with their labels
        printdf(ranked.rename(lambda x: cluster_labels_3.loc[x].label))

duration_by_cluster = duration_by_cluster.sort_values('median')


In [None]:
# show distribution of duration by skill category, only for the categories
# whose median duration is below 29 or above 33 days
cluster_medians = duration_by_cluster['median']
has_short_median = cluster_medians < 29
has_long_median = cluster_medians > 33
clusters_to_show = duration_by_cluster[has_short_median | has_long_median]
with sns.plotting_context('talk'):
    f = plt.figure(figsize = (8,7))
    sns.boxplot(data=good_durations, x='top_cluster', y='duration', ax=f.gca(),
                order=clusters_to_show.index.tolist())

    # replace the numeric cluster ids on the x axis with the cluster labels
    tmp = f.gca().get_xticklabels()
    readable_labels = []
    for tick in tmp:
        cluster_id = float(tick.get_text())
        readable_labels.append(cluster_labels_3.loc[cluster_id].label)
    f.gca().set_xticklabels(readable_labels)
    plt.xticks(rotation=90)
plt.tight_layout()



In [None]:
# get the median duration for each occupation (one row per SOC code)
median_by_soc = data_df.groupby('profession_soc_code_value')['duration'].median()
duration_by_soc = median_by_soc.to_frame(name='median_duration')

# group by first digit and show distribution
duration_by_soc['soc_1'] = [
    resample_soc_to_n_digits(soc_code, n=1) for soc_code in duration_by_soc.index
]
sns.boxplot(data=duration_by_soc, x='soc_1', y='median_duration')

# show the bottom and top occupations, with SOC codes replaced by their names
for heading, ascending in [('Shortest durations', True), ('Longest durations', False)]:
    print(heading)
    extremes = duration_by_soc.sort_values(by='median_duration', ascending=ascending).head(5)
    printdf(extremes.rename(socnames_dict))



In [None]:
# Durations by job title (most common ones)
top_titles= data_df.job_title_processed.value_counts().iloc[:10]


In [None]:
# For each of the most common job titles: overall median duration, plus
# median duration, advert count and share of adverts per sector
duration_by_title_sector = {}
duration_by_title = {}
timer.start_task()
for position, title in enumerate(top_titles.index, start=1):
    title_rows = data_df.loc[data_df.job_title_processed == title,
                             ['duration','final_sic_letter']]
    by_sector = title_rows.groupby('final_sic_letter')['duration'].agg(
        ["median","count"]).reset_index()
    by_sector['proportion'] = by_sector['count'] / by_sector['count'].sum()
    duration_by_title[title] = title_rows['duration'].median()
    duration_by_title_sector[title] = by_sector
    # the timer is only stopped after every tenth title
    if position % 10 == 0:
        timer.end_task()
    

In [None]:
# Median duration per job title as a one-column table
median_duration_by_title = pd.DataFrame(
    duration_by_title.values(), index= duration_by_title.keys(), columns = ['median_duration'])
# titles with the longest median duration first
printdf(median_duration_by_title.sort_values('median_duration', ascending=False).head(12))
# titles with the shortest median duration (displayed as the cell output)
median_duration_by_title.sort_values('median_duration').head()
