In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


import os
import itertools
import json
import numpy as np
import pandas as pd
import pickle
import requests
import seaborn as sns
import collections
from collections import Counter
import scipy
import time
import copy
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.decomposition.pca import PCA
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

import gensim
import re
from fuzzywuzzy import process

from gensim.scripts.glove2word2vec import glove2word2vec



In [None]:
from utils_bg import * #nesta_colours, nesta_colours_combos


In [None]:
# set up plot style
# Print the style sheets known to this matplotlib install, then stack three of them
# (later entries in the list override earlier ones where they overlap).
# NOTE(review): the 'seaborn-*' style names depend on the matplotlib version installed — confirm
print(plt.style.available)
plt.style.use(['seaborn-darkgrid','seaborn-poster','ggplot'])


## Overview

This notebook extracts the top keywords for each NOS. It will need to be merged with another notebook.


In [None]:
# flatten lists of lists
def flatten_lol(t):
    """Flatten one level of nesting: an iterable of iterables becomes a single list."""
    return [item for sublist in t for item in sublist]
flatten_lol([[1,2],[3],[4,5,6]])


In [None]:


def convert_to_undersc(skill):
    '''
    Join the words of a skill phrase with hyphens so the phrase can be looked
    up as a single token in the trained w2v model.

    Despite the function name, the separator written here is '-' (not '_').
    Strings without spaces are returned unchanged.
    '''
    return skill.replace(' ', '-')

def convert_from_undersc(skill):
    '''
    Inverse of convert_to_undersc: turn the hyphens between the terms of a
    skill phrase back into spaces.

    The previous implementation tested for '-' but split on '_', so a phrase
    produced by convert_to_undersc (which joins with '-') came back unchanged.
    Strings without hyphens are returned as they are.
    '''
    return skill.replace('-', ' ')

def get_mean_vec(skill_list, model):
    """
    Average the model vectors of the terms in skill_list.

    Each term is first passed through convert_to_undersc; terms that are not
    in the model are ignored. If no term is found the mean is taken over an
    empty array, so the result is a 0-d NaN rather than a vector.
    """
    vectors = []
    for raw_term in skill_list:
        term = convert_to_undersc(raw_term)
        if term in model:
            vectors.append(model[term])
    return np.mean(np.asarray(vectors), axis=0)

def get_average_skill_category(list_of_skills, reference_dict):
    """
    Returns the (up to) 10 highest-scoring categories of the averaged array.

    Skills missing from reference_dict are ignored. The arrays looked up in
    reference_dict are averaged element-wise and indexed as [0, i], i.e. they
    are treated as single-row 2-D arrays.
    Returns a list of (category_index, score) tuples in ascending score order,
    or an empty list when none of the skills is in reference_dict.
    """
    known_skills = [skill for skill in list_of_skills if skill in reference_dict]
    if not known_skills:
        return []
    stacked = np.asarray([reference_dict[skill] for skill in known_skills])
    mean_scores = np.mean(stacked, axis=0)
    # argsort is ascending, so the last ten positions hold the largest scores
    top_idx = np.argsort(mean_scores)[0, -10:]
    return [(idx, mean_scores[0, idx]) for idx in top_idx]


def get_best_skill_category(list_of_skills, reference_dict, transversal):
    """
    Pick the best-scoring non-transversal category for a list of skills.

    Takes the top categories from get_average_skill_category, discards those
    whose index is in `transversal`, and returns the (index, score) pair with
    the highest score.
    Returns the sentinel (999, 0.0) when no skill is known, and now also when
    every top category is transversal (previously max() raised ValueError on
    the empty list).
    """
    top10 = get_average_skill_category(list_of_skills, reference_dict)
    dom_specific = [elem for elem in top10 if elem[0] not in transversal]
    return max(dom_specific, key=lambda x: x[1], default=(999, 0.0))
    
# Placeholder directories; both names are reassigned to real paths in later cells.
lookup_dir = ''
output_dir = ''




In [None]:
def prep_for_gensim(list_of_terms, some_model):
    """
    Convert each term with convert_to_undersc and keep only the converted
    terms that are present in some_model (tested with `in`).
    Returns the surviving terms as a list, in input order.
    """
    return [term
            for term in (convert_to_undersc(raw) for raw in list_of_terms)
            if term in some_model]

In [None]:
#A few functions for tyding up text
def tag_for_lemmatise(s):
    """
    Map a single token to a WordNet part-of-speech letter.

    The token is tagged on its own with nltk.pos_tag and the tag is translated
    to 'a' (adjective), 'r' (adverb), 'n' (noun) or 'v' (verb).
    Falls back to 'n' for any tag outside the table or if tagging fails.
    """
    pos_to_wornet_dict = {
        'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
        'RB': 'r', 'RBR': 'r', 'RBS': 'r',
        'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'NNPS': 'n',
        'VB': 'v', 'VBG': 'v', 'VBD': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
    }
    try:
        return pos_to_wornet_dict[nltk.pos_tag([s])[0][1]]
    except Exception:
        # Best-effort default. `Exception` (rather than a bare except) keeps the
        # fallback but no longer swallows KeyboardInterrupt / SystemExit.
        return 'n'
    
def lemmatise(title_terms):
    """
    Takes a list of terms as input and returns a list of the same length.
    Each term is passed through the NLTK WordNet lemmatiser with its default
    part of speech; the main purpose is to convert plural forms into singular.
    Terms listed in keep_asis (a manually curated list of plural nouns) are
    returned untouched.
    >>> lemmatise(['teachers'])
    ['teacher']
    >>> lemmatise(['analytics'])
    ['analytics']
    """
    keep_asis = ['sales', 'years', 'goods', 'operations', 'systems',
                 'communications', 'events', 'loans', 'grounds',
                 'lettings', 'claims', 'accounts', 'relations',
                 'complaints', 'services']
    wnl = nltk.WordNetLemmatizer()
    processed_terms = []
    for term in title_terms:
        if term in keep_asis:
            processed_terms.append(term)
        else:
            processed_terms.append(wnl.lemmatize(term))
    return processed_terms

def lemmatise_with_pos(title_terms):
    """
    Lemmatise a list of (token, POS tag) pairs.

    Takes a list of 2-tuples (token, Penn-Treebank-style tag) and returns a
    list of lemmas of the same length. The tag is translated to a WordNet part
    of speech ('a', 'r', 'n' or 'v') and passed to the NLTK WordNet lemmatiser.
    Tags that are not in the table fall back to 'n' (previously they raised
    KeyError). Tokens listed in keep_asis (a manually curated list of plural
    nouns) are returned untouched.
    >>> lemmatise_with_pos([('teachers', 'NNS')])
    ['teacher']
    """
    pos_to_wornet_dict = {
        'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
        'RB': 'r', 'RBR': 'r', 'RBS': 'r',
        'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'NNPS': 'n',
        'VB': 'v', 'VBG': 'v', 'VBD': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
    }
    keep_asis = ['sales', 'years', 'goods', 'operations', 'systems',
                 'communications', 'events', 'loans', 'grounds',
                 'lettings', 'claims', 'accounts', 'relations',
                 'complaints', 'services']
    wnl = nltk.WordNetLemmatizer()
    processed_terms = [
        token if token in keep_asis
        else wnl.lemmatize(token, pos_to_wornet_dict.get(tag, 'n'))
        for token, tag in title_terms
    ]
    return processed_terms

def stem_features(s, ps):
    """Return the stem of `s` as computed by `ps`, any object exposing a `.stem(word)` method."""
    stemmed = ps.stem(s)
    return stemmed
    
def remove_digits(s):
    """
    Takes a string as input.
    Drops every character for which str.isdigit() is true.
    Returns a string (surrounding whitespace is left as it was).
    >>> remove_digits('2 recruitment consultants')
    ' recruitment consultants'
    """
    kept_chars = []
    for ch in s:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

def remove_list_enumeration(s):
    '''
    NOS-specific clean-up: list items are enumerated with markers such as
    K+number or P+number. Once the text has been lower-cased and the digits
    removed, what is left of those markers is a stand-alone "k" or "p", so
    every run of " k " (and then " p ") is collapsed to a single space.
    It might not be necessary if 'k' and 'p' are added to the stopwords.
    '''
    result = s
    for pattern in ('( k )+', '( p )+'):
        result = re.sub(pattern, ' ', result)
    return result

# Punctuation that gets blanked out; the apostrophe is deliberately left out.
# The backslash is written as '\\' (the former '\(' was an invalid escape
# sequence that only worked by accident) - the resulting set is unchanged.
select_punct = set('!"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~') #only removed "'"
# Extra symbols that get blanked out in the same way
extra_chars = set('–-•’”“µ¾âãéˆﬁ[€™¢±ï…˜')
all_select_chars = select_punct.union(extra_chars)
# Translation table mapping every selected character to a single space
_PUNCT_TO_SPACE = str.maketrans({ch: ' ' for ch in all_select_chars})

def replace_punctuation(s):
    """
    Takes string as input.
    Replaces every character found in all_select_chars with a single space
    (one space per character, so the length of the string is preserved).
    Returns a string.
    >>> replace_punctuation('sales executives/ - london')
    'sales executives    london'
    """
    return s.translate(_PUNCT_TO_SPACE)

def tidy_desc(desc):
    """
    Clean a free-text description and return it as one space-separated string.

    Steps: drop '\\r\\n' and non-breaking spaces, lower-case, remove digits,
    replace punctuation with spaces, split on whitespace and lemmatise.
    NOTE(review): line breaks and non-breaking spaces are removed without
    inserting a space, so the words on either side get glued together -
    confirm this is intended.
    """
    text = desc.replace('\r\n', '').replace('\xa0', '').lower()
    text = replace_punctuation(remove_digits(text))
    #text = remove_list_enumeration(text)
    return ' '.join(lemmatise(text.split()))

def tokenize(text):
    """
    Takes string as input.
    Returns the list of tokens produced by nltk.word_tokenize. Written to be
    passed as the tokenizer argument of TfidfVectorizer.
    >>> tokenize('some job title')
    ['some', 'job', 'title']
    """
    return nltk.word_tokenize(text)

def tokenize_asis(some_list): #, stopwords):
    """
    Takes a list of already-tokenised terms as input.
    Returns a new list with every element converted to lower case; multi-word
    elements are kept whole. The function is used as the tokenizer argument of
    TfidfVectorizer.

    >>> tokenize_asis(['Accounting', 'Microsoft Excel'])
    ['accounting', 'microsoft excel']
    """
    tokens = []
    for elem in some_list:
        tokens.append(elem.lower())
    return tokens


In [None]:
#This set of functions is useful for identifying terms with highest tf-idf weights 
#in a single document or set of documents

def top_tfidf_feats(row, features, top_n=25):
    ''' Return the top_n largest values of `row` together with the matching
        entries of `features`, as a DataFrame with columns 'feature' and
        'tfidf' sorted from largest to smallest value.'''
    ranked_ids = np.argsort(row)[::-1][:top_n]
    pairs = []
    for idx in ranked_ids:
        pairs.append((features[idx], row[idx]))
    table = pd.DataFrame(pairs)
    table.columns = ['feature', 'tfidf']
    return table

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25, sparse_output = False):
    ''' Return the top_n features that on average are most important amongst
        the documents (rows of Xtr) selected by grp_ids; a falsy grp_ids
        selects every row. Weights below min_tfidf are zeroed before averaging.
        The result is the DataFrame built by top_tfidf_feats.
        NOTE(review): with sparse_output=True that DataFrame (which includes
        the feature-name column) is handed to scipy.sparse.csr_matrix - confirm
        this path actually works. '''
    selected = Xtr[grp_ids] if grp_ids else Xtr
    dense = selected.toarray()
    dense[dense < min_tfidf] = 0
    mean_weights = np.mean(dense, axis=0)
    ranked = top_tfidf_feats(mean_weights, features, top_n)
    if sparse_output:
        return scipy.sparse.csr_matrix(ranked)
    return ranked

def all_mean_feats(Xtr, grp_ids=None, min_tfidf=0.1):
    ''' Return the column-wise mean of the rows of Xtr selected by grp_ids
        (a falsy grp_ids selects every row). Xtr must support .toarray();
        weights below min_tfidf are set to zero before averaging. '''
    selected = Xtr[grp_ids] if grp_ids else Xtr
    dense = selected.toarray()
    dense[dense < min_tfidf] = 0
    return np.mean(dense, axis=0)

def get_top_words_weights(desc, feature_names, vect, n = 25):
    """
    Transform `desc` with the fitted vectoriser `vect` and return the output of
    top_mean_feats over all resulting rows, limited to the top `n` features.
    """
    tfidf_rows = vect.transform(desc)
    return top_mean_feats(tfidf_rows, feature_names, grp_ids = None, top_n = n)

def get_mean_tfidf(desc, vect):
    """
    Transform `desc` with the fitted vectoriser `vect` and return the mean
    weight of every feature across the resulting rows (see all_mean_feats).
    """
    tfidf_rows = vect.transform(desc)
    return all_mean_feats(tfidf_rows, grp_ids = None)

def get_top_words(desc, feature_names, vect, n = 25):
    """
    Same computation as get_top_words_weights, but only the values of the
    'feature' column (the feature names, without their weights) are returned.
    """
    tfidf_rows = vect.transform(desc)
    ranked = top_mean_feats(tfidf_rows, feature_names, grp_ids = None, top_n = n)
    return ranked['feature'].values


In [None]:
def print_elapsed(t0_local, task = 'current task'):
    """
    Print how many seconds have passed since `t0_local` (a time.time() stamp).

    The elapsed time is printed with 4 decimal places; the former '{:4f}' spec
    set a minimum width of 4 instead of the precision.
    """
    elapsed = time.time() - t0_local
    print('Done with {}. Elapsed time: {:.4f}'.format(task, elapsed))
    

### Functions for BG data

In [None]:
def bgsoc_is_newsoc(data):
    """
    Compare data['SOC'] with its last digit dropped (floor of SOC / 10) against
    data['new_soc'] and return the element-wise result of that comparison.
    """
    soc_truncated = np.floor(data['SOC'] / 10)
    return soc_truncated == data['new_soc']



### More settings

In [None]:
# Labels interpolated into the input/output file names used in later cells.
qualifier = 'postjoining_final_no_dropped'
qualifier0 = 'postjoining_final_no_dropped'
# Parts of speech to keep when lemmatising: 'nv' = nouns and verbs, 'n' = nouns only
pofs = 'nv'


In [None]:
# Directory that results are written to.
# NOTE(review): absolute, machine-specific path - has to be edited to run elsewhere
output_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/nlp_analysis/'


In [None]:
# Directory the pickled NOS inputs and stopwords are read from.
# NOTE(review): absolute, machine-specific path - has to be edited to run elsewhere
lookup_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/extracted/'


In [None]:
# Scratch cell: the experiment below is disabled by wrapping it in a string literal,
# so the only statement that runs is the final print.
'''
ps = PorterStemmer()
stemmer = SnowballStemmer("english")
tags = [(t,p) for t,p in df_nos['tagged_tokens'].iloc[0] if p[:1] in ['V','N']]
#A = [stem_features(t,ps) for t in df_nos['pruned'].iloc[0]]
A2 = lemmatise_with_pos(tags) #[stem_features(t,stemmer) for t in df_nos['pruned'].iloc[0]]
#A = list(set(A))
#A2 = list(set(A2))
#B = [t in model for t in A]
B2 = [t in model for t in A2]
#C = [t for t in A if t in model]
C2 = [t for t in A2 if t in model]
print(len(df_nos['pruned'].iloc[0]),len(A2))
print(A2)
#print(sum(B),sorted(A))
#print(sum(B2),sorted(A2))
#plt.scatter(model['valu'],model['value'])
'''
print('playground')


In [None]:
#Loading a pre-trained glove model into gensim
# Name of the GloVe variant; it is interpolated into the file name 'word2vec.<name>.txt'
WHICH_GLOVE = 'glove.6B.100d' #'glove.6B.100d' #'glove.840B.300d', 
#glove.twitter.27B.100d

# NOTE(review): absolute, machine-specific path - has to be edited to run elsewhere
glove_dir = '/Users/stefgarasto/Local-Data/wordvecs/'

# NOTE(review): when LOADGLOVE is False, `model` is not assigned in this cell, so the
# lines after the if-block rely on a `model` left over from an earlier run
LOADGLOVE = True
if LOADGLOVE:
    print('Loading glove model')
    t0 = time.time()
    # load the glove model (a text file in word2vec format) and print the loading time
    model = gensim.models.KeyedVectors.load_word2vec_format\
    (os.path.join(glove_dir, 'word2vec.{}.txt'.format(WHICH_GLOVE)))
    #model = api.load("glove-wiki-gigaword-100")  # load pre-trained word-vectors
    # from gensim-data
    #model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
    #word_vectors = model.wv
    print(time.time() - t0)

# Raw vector matrix and vocabulary of the loaded model
# NOTE(review): the `index2word` attribute name depends on the gensim version - confirm
vector_matrix = model.vectors
list_of_terms = model.index2word

# Vocabulary passed through convert_from_undersc
lookup_terms = [convert_from_undersc(elem) for elem in list_of_terms]




In [None]:
# Load the NOS dataframe from a pickle in lookup_dir (file name built from qualifier0)
df_nos = pd.read_pickle(lookup_dir + 'all_nos_input_for_nlp_{}.zip'.format(qualifier0))

# load the cleaned and tokenised dataset and join its columns onto df_nos by index
df_nos = df_nos.join(pd.read_pickle(lookup_dir + 'all_nos_input_for_nlp_{}_pruned_{}.zip'.format(qualifier,pofs)))
print('Done')


In [None]:
# manually remove "k"s and "p"s from the pruned columns
def remove_pk(x):
    """Return the tokens of `x` as a list, without any token equal to 'k' or 'p'."""
    enumeration_markers = ['k', 'p']
    kept = []
    for token in x:
        if token in enumeration_markers:
            continue
        kept.append(token)
    return kept
# Overwrite the 'pruned' column with its filtered version (re-running gives the same result)
df_nos['pruned'] = df_nos['pruned'].map(remove_pk)


In [None]:
# Display three randomly chosen rows (no seed is set, so the rows differ between runs)
df_nos.sample(n=3)


In [None]:
# Load stopwords: the pickle holds three collections, unpacked in this order
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source
with open(lookup_dir + 'stopwords_for_nos_{}_{}.pickle'.format(qualifier,pofs),'rb') as f:
    stopwords0, no_idea_why_here_stopwords, more_stopwords = pickle.load(f)
# `stopwords` = first two collections plus a few stray symbols
# (the `+= tuple(...)` below presumably means the collections are tuples - verify)
stopwords = stopwords0 + no_idea_why_here_stopwords 
stopwords += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„'])
# `stopwords0` is extended separately: symbols, contraction fragments, every distinct value
# of the 'Developed By' column, and the token 'cosvr'
stopwords0 += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„',"'m", "'re", '£'])
stopwords0 += tuple(set(list(df_nos['Developed By'])))
stopwords0 += tuple(['cosvr'])


In [None]:
# create another column where the texts are lemmatised properly
def lemmatise_pruned(x, pofs):
    """
    Filter (token, POS tag) pairs by part of speech and lemmatise what is left.

    pofs == 'nv' keeps tags starting with 'V' or 'N'; pofs == 'n' keeps tags
    starting with 'N'. Any other value raises ValueError.
    Returns the output of lemmatise_with_pos on the retained pairs.
    """
    if pofs == 'nv':
        wanted_initials = ['V', 'N']
    elif pofs == 'n':
        wanted_initials = ['N']
    else:
        raise ValueError
    tags = [(token, tag) for token, tag in x if tag[:1] in wanted_initials]
    return lemmatise_with_pos(tags)

# Build the 'pruned_lemmas' column from 'tagged_tokens' and print how long it took (seconds)
t0 = time.time()
df_nos['pruned_lemmas'] = df_nos['tagged_tokens'].map(lambda x: lemmatise_pruned(x,pofs))
print(time.time()-t0)


### Only keep NOS from a super-suite

In [None]:
# Workbook with one sheet per super-suite; each sheet lists the NOS suites in it.
# NOTE(review): absolute, machine-specific path - has to be edited to run elsewhere
super_suites_files=  '/Users/stefgarasto/Google Drive/Documents/data/NOS_meta_data/NOS_Suite_Priority.xlsx'
super_suites_names = ['Engineering','Management','FinancialServices','Construction']
all_super_suites = {}
for which_super_suite in super_suites_names:
    sheet = pd.read_excel(super_suites_files, sheet_name = which_super_suite)
    # Normalise the suite names: drop both parentheses, spell out '&', trim, lower-case.
    # (The second replace used to target '(' again, so ')' was never removed.)
    sheet['NOS Suite name'] = sheet['NOS Suite name'].map(
        lambda x: x.replace('(','').replace(')','').replace('&','and').strip().lower())
    all_super_suites[which_super_suite] = sheet


In [None]:
# Match every suite name listed in the super-suite sheets to one of the suite labels
# found in df_nos['One_suite'], using fuzzy string matching.
# all_matches[super_suite]     -> positions in standard_labels
# all_match_names[super_suite] -> the corresponding labels
standard_labels = list(df_nos.groupby('One_suite').groups.keys())
all_matches = {}
all_match_names = {}
#match_name = []
for which_super_suite in super_suites_names:
    all_matches[which_super_suite] = []
    for suite in all_super_suites[which_super_suite]['NOS Suite name'].values:
        # do manually some selected suites
        if 'insurance claims' in suite:
            tmp = standard_labels.index('general insurance')
            all_matches[which_super_suite].append(tmp)
            continue
        # for the "management and leadership marketing 2013" both marketing and marketing 2013 would fit,
        # but I'm only taking the latter
        # find the three closest labels for this suite name
        out = process.extract(suite, standard_labels, limit=3)
        # accept the best candidate only if its score is above 89
        if len(out) and out[0][1]>89:
            # note: most of them are above 96% similarity (only one is 90%)
            tmp = standard_labels.index(out[0][0])
            #print(suite, out[0])
            if tmp not in all_matches[which_super_suite]:
                all_matches[which_super_suite].append(tmp)
            else:
                # the best candidate has already been used for another suite of this super-suite
                if suite == 'installing domestic fascia, soffit, and bargeboards':
                    # this suite is kind of a duplicate - I aggregated it in my suites list
                    continue
                # NOTE(review): falls back to the THIRD candidate (out[2]), skipping out[1], and
                # assumes three candidates were returned - confirm this is intended
                tmp = standard_labels.index(out[2][0])
                all_matches[which_super_suite].append(tmp)
                print(out[0][0],',',out[1][0],',',out[2][0],',',suite)
        else:
            print(suite, ' not found')
            print(out)
            print('\n')
    # sanity check: number of matched labels vs number of rows in the sheet
    print(len(all_matches[which_super_suite]),len(all_super_suites[which_super_suite]))
    all_match_names[which_super_suite] = [standard_labels[t] for t in all_matches[which_super_suite]]

In [None]:
def assign_supersuite(x):
    """
    Return the lower-cased name of the first super-suite in the module-level
    all_match_names whose list of suite labels contains `x`, or 'other' when no
    super-suite lists it.
    """
    for supersuite, suite_labels in all_match_names.items():
        if x in suite_labels:
            return supersuite.lower()
    # if no match has been found
    return 'other'

def adjustsoccode(x):
    """
    Return the first run of digits found in str(x), as a string, or NaN when
    str(x) contains no digit.

    The pattern used to be [\\d']+, which also captured quote characters: for
    a list value such as ['2111'] the result was "'2111'" (quotes included).
    Only digits are captured now.
    """
    match = re.search(r"\d+", str(x))
    if match:
        return match.group(0)
    return np.nan

def _leading_chars_as_float(x, n_chars):
    """Return float(x[:n_chars]) for a string x; NaN for non-strings or unparsable prefixes."""
    if not isinstance(x, str):
        return np.nan
    try:
        return float(x[:n_chars])
    except ValueError:
        # the prefix is not a number (float() of a str only fails with ValueError)
        return np.nan

def extract2digits(x):
    """First two characters of the string `x` as a float, or NaN (see _leading_chars_as_float)."""
    return _leading_chars_as_float(x, 2)

def extract3digits(x):
    """First three characters of the string `x` as a float, or NaN (see _leading_chars_as_float)."""
    return _leading_chars_as_float(x, 3)

def extract1digits(x):
    """First character of the string `x` as a float, or NaN (see _leading_chars_as_float)."""
    return _leading_chars_as_float(x, 1)

# Label every NOS with its super-suite ('other' when its suite is in none of them)
df_nos['supersuite'] = df_nos['One_suite'].apply(assign_supersuite)
# 'SOC4' holds the cleaned SOC code string; SOC1/SOC2/SOC3 hold its first 1/2/3 characters as floats
df_nos['SOC4'] = df_nos['Clean SOC Code'].map(adjustsoccode)
df_nos['SOC1'] = df_nos['SOC4'].map(extract1digits)
df_nos['SOC2'] = df_nos['SOC4'].map(extract2digits)
df_nos['SOC3'] = df_nos['SOC4'].map(extract3digits)
# number of NOS per super-suite
print(df_nos['supersuite'].value_counts())


In [None]:
#print(max(df_nos['Clean SOC Code'].map(lambda x: len(x) if isinstance(x,list) else 0).values))

In [None]:
# select NOS in super-suites of interest, i.e. drop every row labelled 'other'
df_nos_select = df_nos[~(df_nos['supersuite']=='other')]
# number of NOS retained
print(len(df_nos_select))

## Get raw data and tokenize

## Choosing parameters for features extraction

ngrams : uni/bi/tri

tfidf thresholds: min and max percentage

which parts of speech were selected before

whether we are working at the level of suites or of individual NOS, and how we aggregate NOS to form the suite level


In [None]:
# First, create your TFidfVectorizer model. This doesn't depend on whether it's used on suites or NOS. However,
# it does require that the docs collection is already given as a collection of tokens (tokenizer=tokenize_asis)

#Since we now have not just long strings in our documents, but lists of terms, we will use a different tokenizer
def define_tfidf(params, stopwords):
    """
    Build an (unfitted) TfidfVectorizer for documents that are already lists of tokens.

    params['ngrams'] selects the n-gram range: 'bi' -> (1, 2), 'tri' -> (1, 3),
    anything else -> unigrams. params['tfidf_max'] and params['tfidf_min'] are
    passed on as max_df and min_df. Tokenisation is delegated to tokenize_asis
    and the vectoriser's own lower-casing is switched off.
    """
    ngram_choice = params['ngrams']
    if ngram_choice == 'bi':
        ngram_range = (1, 2)
    elif ngram_choice == 'tri':
        ngram_range = (1, 3)
    else:
        # unigrams is the default
        ngram_range = (1, 1)
    return TfidfVectorizer(tokenizer=tokenize_asis,
                           lowercase = False,
                           stop_words=stopwords,
                           ngram_range=ngram_range,
                           max_df = params['tfidf_max'],
                           min_df = params['tfidf_min'])


In [None]:
# now, collect the text to transform
def combine_nos_text(df_nos_select, col = 'pruned'):
    """
    Concatenate the token lists of all NOS belonging to the same suite.

    Rows are grouped by 'One_suite' and the lists stored in column `col` are
    joined in row order. Returns a DataFrame indexed by suite name with a
    single column 'tokens'.
    """
    suite_names = []
    suite_tokens = []
    for suite, members in df_nos_select.groupby('One_suite'):
        token_lists = members[col]
        merged = []
        for doc_id in token_lists.index:
            merged += token_lists.loc[doc_id]
        suite_names.append(suite)
        suite_tokens.append(merged)
    # return a dataframe
    return pd.DataFrame({'tokens': suite_tokens}, index = suite_names)

def get_tfidf_matrix(params, df_nos_select, tfidf, col = 'pruned'):
    """
    Fit `tfidf` on the tokenised text and return the resulting feature matrix.

    params['bywhich'] / params['mode'] select what a row of the matrix is:
      - 'docs' (any mode): one row per NOS.
      - 'suites' + 'meantfidf': the transform is fitted on individual NOS, then
        the rows of each suite are averaged (one row per suite, in the sorted
        order produced by groupby('One_suite')).
      - 'suites' + 'combinedtfidf': the tokens of each suite are concatenated
        first, so min_df / max_df are computed over suites rather than NOS.
    Any other combination raises ValueError.

    Returns (tfidfm, feature_names, tfidf, textfortoken).

    Fixes over the previous version: in the 'meantfidf' branch `feature_names`
    was read before being assigned and the row index `igroup` was never
    defined, so the branch could not fill the matrix correctly.
    """
    # Note: this can simply be used to get the tfidf transform, by setting bywhich=docs and any mode
    t0 = time.time()
    average_by_suite = False
    # first, get the collection of token lists
    if params['bywhich'] == 'docs':
        textfortoken = df_nos_select[col]
    elif params['bywhich'] == 'suites' and params['mode'] == 'meantfidf':
        textfortoken = df_nos_select[col]
        average_by_suite = True
    elif params['bywhich'] == 'suites' and params['mode'] == 'combinedtfidf':
        # TODO: allow for the alternative case, where the transform is computed on individual NOS
        # and then applied to the joint tokens
        textfortoken = combine_nos_text(df_nos_select, col = col)['tokens']
    else:
        raise ValueError('Unsupported combination of bywhich={!r} and mode={!r}'.format(
            params['bywhich'], params.get('mode')))

    # apply tfidf transform to the tokenised text
    tfidfm = tfidf.fit_transform(textfortoken)

    # the feature names are needed before averaging to size the output matrix
    if hasattr(tfidf, 'get_feature_names'):
        feature_names = tfidf.get_feature_names()
    else:
        # fallback for vectorisers that only expose the newer accessor
        feature_names = list(tfidf.get_feature_names_out())

    # if the average is needed, compute it and overwrite the matrix. The fit above is still
    # needed to initialise the tfidf transform with the proper features and stopwords
    if average_by_suite:
        grouped = df_nos_select.groupby('One_suite')
        tfidfm = scipy.sparse.lil_matrix((grouped.ngroups, len(feature_names)), dtype = np.float32)
        for igroup, (name, group) in enumerate(grouped):
            tfidfm[igroup] = get_mean_tfidf(group[col], tfidf)

    print_elapsed(t0, 'computing the feature vector')
    return tfidfm, feature_names, tfidf, textfortoken


In [None]:
def _count_keywords(entries, stopwords):
    """Count keyword tokens over `entries`, keyed in order of first appearance.

    Each entry may be a list of strings or a single string; anything else is
    skipped. Hyphens are removed before splitting into word tokens; purely
    numeric tokens and stopwords are not counted.
    """
    counts = {}
    for entry in entries:
        if isinstance(entry, list):
            texts = entry
        elif isinstance(entry, str):
            texts = [entry]
        else:
            continue
        for text in texts:
            for token in re.findall(r"[\w']+", text.replace('-','')):
                if token.isdigit() or token in stopwords:
                    continue
                counts[token] = counts.get(token, 0) + 1
    return counts


def _summarise_keyword_counts(count_keywords, top_n):
    """Turn a keyword->count dict into (top_keywords, n_keywords, n_repeated)."""
    counts = list(count_keywords.values())
    n_repeated = np.sum(np.array(counts) > 1)
    n_keywords = len(count_keywords)
    # positions of the top_n largest counts; the selected keywords are then
    # listed in order of first appearance, not in order of count
    top_positions = set(np.argsort(counts)[::-1][:top_n].tolist())
    top_keywords = [kw for pos, kw in enumerate(count_keywords) if pos in top_positions]
    # pad with '-' so that the list always has top_n entries
    top_keywords.extend(['-'] * (top_n - len(top_keywords)))
    return top_keywords, n_keywords, n_repeated


def get_top_keywords(df, name, stopwords, top_n = 20):
    """
    Most frequent keywords across all the entries of `df`.

    `df` is indexed with .loc over its own index; each entry may be a list of
    strings or a string (other values are ignored). `name` is accepted for
    backward compatibility and is not used.
    Returns (top_keywords, n_keywords, n_repeated): the top_n keywords by count
    (listed in order of first appearance and padded with '-'), the number of
    distinct keywords, and how many of them occur more than once.
    """
    entries = (df.loc[ix] for ix in df.index)
    return _summarise_keyword_counts(_count_keywords(entries, stopwords), top_n)


def get_top_keywords_nos(nos,stopwords, top_n = 20):
    """
    Same as get_top_keywords, for the keywords of a single NOS.

    `nos` may be a list of strings or a string; any other value yields no
    keywords. Returns (top_keywords, n_keywords, n_repeated).
    """
    return _summarise_keyword_counts(_count_keywords([nos], stopwords), top_n)


In [None]:
# Feature-extraction settings read by define_tfidf and get_tfidf_matrix
params = {}
params['ngrams'] = 'uni'      # 'bi' / 'tri' widen the n-gram range; anything else means unigrams
params['pofs'] = 'nv'
params['tfidf_min'] = 3       # passed to TfidfVectorizer as min_df
params['tfidf_max'] = 0.5     # passed to TfidfVectorizer as max_df

params['bywhich'] = 'docs' #'docs' #'suites'
params['mode'] = 'tfidf' #'tfidf' #'meantfidf' #'combinedtfidf' #'meantfidf'


In [None]:
# define the transform: this one can easily be the same for both keywords and the clustering
# (stopwords0 is the extended stopword collection built when the stopwords were loaded)
tfidf = define_tfidf(params, stopwords0)


### Check keywords at the NOS level


We can take a look at some of the terms with highest tf-idf score in each NOS

In [None]:
# Flag meant to control whether the top terms are written to csv
# NOTE(review): the saving cell tests `SAVEKW or True`, so this flag currently has no effect
SAVEKW= False


In [None]:
# get the features: fit the transform on the lemmatised tokens of the selected NOS
tfidfm, feature_names, tfidf, textfortokens = get_tfidf_matrix(params, df_nos_select, tfidf, col = 'pruned_lemmas')


In [None]:
# Report the vocabulary size and print a window of 100 feature names starting at position N
print('Number of features: {}'.format(len(feature_names)))
N = 2000
print('Some features:')
print(feature_names[N:N+100])

In [None]:
#import timeit
#timeit.timeit(lambda: tfidfm.todense(), number = 100)

In [None]:
# For every selected NOS, collect up to 20 features with the highest non-zero tf-idf weight.
# top_terms_dict maps the NOS index label to its list of features, best first.
top_terms_dict = {}
top_keywords_dict = {}
#for name, group in ifa_df.groupby('Route'):
igroup = 0
n_keywords =[]
n_repeated = []
#top_terms = {}
t0 = time.time()
# NOTE(review): this materialises the whole tf-idf matrix as a dense matrix - can be memory hungry
tfidfm_dense = tfidfm.todense()
for ix,name in enumerate(df_nos_select.index):
    #top_terms = get_top_words(df_nos_select.loc[name]['pruned'], feature_names, tfidf, n = 20)
    # argsort is ascending: take the last 20 column positions and reverse them
    top_ngrams = np.argsort(tfidfm_dense[ix,:])
    top_ngrams = top_ngrams.tolist()[0][-20:]
    top_ngrams = top_ngrams[::-1]
    # only retain the ones with non zero features
    top_ngrams = [elem for elem in top_ngrams if tfidfm_dense[ix,elem]>0]    
    top_features = [feature_names[elem] for elem in top_ngrams]
    top_terms_dict[name] = {}
    top_terms_dict[name] = top_features
    # show the first four NOS as a sanity check
    if ix<4:
        print(name, top_features) #, top_keywords)
        print('**************************************')
    
    #top_keywords, n1, n2  = get_top_keywords_nos(df_nos_select.loc[name]['Keywords'], stopwords0, top_n = 20)
    #top_keywords = [t for t in top_keywords if t != '-']
    #n_keywords.append(n1)
    #n_repeated.append(n2)
    #top_keywords_dict[name] = {}
    #top_keywords_dict[name] = top_keywords
    # progress report every 1000 NOS
    if ix % 1000 == 999:
        print('Got to NOS nb {}. Total time elapsed: {:.4f} s'.format(ix,time.time()-t0))
# save them all as csv
# NOTE(review): `or True` makes this branch run regardless of SAVEKW - confirm this is intended
if SAVEKW or True:
    pd.DataFrame.from_dict(top_terms_dict, orient = 'index').to_csv(output_dir +
                                                '/NOS_from_supersuites_top_terms_{}_{}.csv'.format(qualifier,pofs))
# drop the reference to the dense matrix so its memory can be reclaimed
tfidfm_dense = None


In [None]:
# just to check results (the check is disabled by wrapping it in a string literal)
'''
print(list(top_terms_dict.keys())[885:887])
top_terms_weights = get_top_words_weights([df_nos_select.iloc[0]['pruned_lemmas']], feature_names, tfidf, n = 20)
print(top_terms_weights.sort_values(by = 'tfidf', ascending = False).head(n=20))
'''
# note that the get_top_words_weights function is probably wrong - but it doesn't matter now
print('not now')

In [None]:
# remove top terms that are not in the chosen gensim model
# new_top_terms_dict maps each NOS to the subset of its top terms found in `model`
new_top_terms_dict = {}
for k,v in top_terms_dict.items():
    # check if the top terms for each document are in the gensim model
    new_top_terms = prep_for_gensim(v, model)
    # only retains the ones in the model
    new_top_terms_dict[k] = new_top_terms
    # print a rare random sample of the results (a standard normal draw above 3);
    # no seed is set, so which NOS get printed changes between runs
    if np.random.randn(1)>3:
        print(k, new_top_terms, len(new_top_terms), len(v))

### Find average vector per skill cluster


In [None]:
#Load the file with the skills taxonomy lower layer
# (used below as a mapping from skill to cluster)
# NOTE(review): absolute, machine-specific path; pickle.load can execute arbitrary code,
# so only load files from a trusted source
with open('/Users/stefgarasto/Google Drive/Documents/scripts/NOS/bottom_cluster_membership.pkl', 'rb') as infile:
    bottom_layer = pickle.load(infile)
    

In [None]:
# Collect skills in clusters: invert the skill -> cluster mapping into cluster -> [skills].
# A single pass over the mapping replaces the former rescan of the whole dict for every
# cluster; the order of the clusters and of the skills within each cluster is unchanged.
skill_cluster_membership = {}
for skill, clus in bottom_layer.items():
    skill_cluster_membership.setdefault(clus, []).append(skill)

# show every tenth cluster name
print(list(skill_cluster_membership.keys())[::10])
    

In [None]:
# Generate lookup vecs using pre-trained GloVe model:
# one mean vector per skills cluster, stored in skill_cluster_vecs[cluster]
skill_cluster_vecs = {}
for clus in skill_cluster_membership:
    cluster_skills = skill_cluster_membership[clus]
    # whole phrases, joined into single tokens
    new_skills = [convert_to_undersc(elem) for elem in cluster_skills]
    # individual words of the skills
    # NOTE(review): len(elem)>1 tests the number of characters, not the number of words - confirm
    other_skills = [elem.split() for elem in cluster_skills if len(elem)>1]
    flat_other_skills = [item for sublist in other_skills for item in sublist]
    all_skills = new_skills + list(set(flat_other_skills))
    # keep only what the model knows
    skills_in = [elem for elem in all_skills if elem in model]
    print(clus, len(cluster_skills), len(skills_in))
    skill_cluster_vecs[clus] = get_mean_vec(skills_in, 
                            model)

# check all skill clusters have a vector: check has to be empty
# (a cluster with no term in the model gets a 0-d NaN instead of a vector)
check = [k for k,v in skill_cluster_vecs.items() if len(v.shape) == 0]
print('This should be empty.',check)


In [None]:
#with open(os.path.join(output_dir, 'skill_cluster_vecs_pretrained.pkl'), 'wb') as f:
#    pickle.dump(skill_cluster_vecs, f)
 
# show words in the model that are closest to average vector for each skill cluster
# (only the first ten clusters are printed)
for clus in list(skill_cluster_vecs.keys())[:10]:
    print(clus)
    print(model.similar_by_vector(skill_cluster_vecs[clus]))
    print('***********')



In [None]:
# arrange all mean skill vectors in a matrix: row i of comparison_vecs belongs to clus_names[i]
# (both are built from the same dict, so they share its iteration order)
comparison_vecs = np.vstack(list(skill_cluster_vecs.values()))
clus_names = list(skill_cluster_vecs.keys())

print(clus_names[:10], comparison_vecs.shape)


### Assign each NOS to a skill cluster

In [None]:
def high_similarity(test_skills, comparison_vecs, clus_names,
                    n_top=5, excluded_cluster='dental assistance'):
    '''
    Rank skill clusters by cosine similarity to one embedding vector.

    test_skills: 1-D numpy array (it is reshaped to a single row).
    comparison_vecs: 2-D array with one row per cluster.
    clus_names: cluster names, in the same order as the rows of comparison_vecs.
    n_top: positive number of best-matching clusters to keep.
    excluded_cluster: cluster name that is dropped when it comes out as the
        best match (the manual adjustment).

    Returns a list of (cluster name, similarity) pairs, most similar first.
    '''
    sims = cosine_similarity(test_skills.reshape(1,-1), comparison_vecs)

    # np.argsort is ascending: take the last n_top indices and flip them so that
    # position 0 is the most similar cluster
    top_sims = np.argsort(sims)[0, -n_top:][::-1].tolist()
    top_sims_res = [(clus_names[elem], sims[0, elem]) for elem in top_sims]
    # manual adjustment: the check must look at the best match (position 0 after
    # the flip), not at the weakest of the selected clusters
    if top_sims_res and top_sims_res[0][0] == excluded_cluster:
        top_sims_res = top_sims_res[1:]
    return top_sims_res

def highest_similarity(test_skills, comparison_vecs, clus_names,
                       excluded_cluster='dental assistance'):
    '''
    Name of the skill cluster whose vector is most similar to an embedding.

    test_skills: 1-D numpy array (it is reshaped to a single row).
    comparison_vecs: 2-D array with one row per cluster.
    clus_names: cluster names, in the same order as the rows of comparison_vecs.
    excluded_cluster: cluster name that is never returned as the best match
        while a runner-up exists (the manual adjustment).

    Returns a single cluster name.
    '''
    sims = cosine_similarity(test_skills.reshape(1,-1), comparison_vecs)

    # np.argsort is ascending: the last two indices are (runner-up, best), so
    # flip them to get the best match at position 0
    top_sims = np.argsort(sims)[0, -2:][::-1].tolist()
    top_sim_clus = [clus_names[elem] for elem in top_sims]
    # manual adjustment: fall back to the runner-up only when the best match is
    # the excluded cluster
    if top_sim_clus[0] == excluded_cluster and len(top_sim_clus) > 1:
        return top_sim_clus[1]
    return top_sim_clus[0]

In [None]:
# Assign every NOS to its closest skill cluster: average the embeddings of the
# entries in new_top_terms_dict (presumably the top keywords per NOS — confirm
# where it is built) and look up the most similar cluster vector.
# Use high_similarity instead of highest_similarity to keep a ranked short-list
# per NOS rather than a single cluster name.
st_v_clus = {}
for k, top_terms in new_top_terms_dict.items():
    test_skills = get_mean_vec(top_terms, model)
    st_v_clus[k] = highest_similarity(test_skills, comparison_vecs, clus_names)
    

In [None]:
# add the best clusters to the nos dataframe
tmp = pd.DataFrame.from_dict(st_v_clus, orient = 'index')
tmp = tmp.rename(columns = {0: 'best_cluster_nos'})
df_nos['best_cluster_nos'] = tmp['best_cluster_nos']

### Assign each job advert to a skill cluster

In [None]:
def sentence_to_vectors_nofile(x,model):
    '''
    Compute a word embedding for a phrase.

    The phrase is first looked up as one joined token; if the model does not
    contain it, the embedding is the average over its individual words, using
    oov_to_vectors as a fallback for words missing from the vocabulary.

    x: the phrase (string).
    model: word-embedding model exposing `vocab` and item lookup.

    Returns (vector, counter), where counter is the number of embeddings that
    went into the vector; a counter of 0 means nothing was found and the vector
    is all zeros.
    '''
    # first check whether the whole phrase is in the model
    y = convert_to_undersc(x)
    if y in model.vocab:
        return model[y], 1
    # if not, reconvert to discrete words and take the average of them
    y = convert_from_undersc(y)
    # split into words, remove extra spaces and genitives "'s"
    y = [t.strip().lower().replace('\'s','') for t in y.split()]

    # size the accumulator from the model; 100 is the previous hard-coded value,
    # kept as fallback for models without a `vector_size` attribute
    we = np.zeros(getattr(model, 'vector_size', 100), dtype = np.float32)
    we_counter = 0
    for t in y:
        if t in model.vocab:
            we += model[t]
            we_counter += 1
        else:
            # the third return value (named `recovered_words` originally) is
            # not needed here
            we_tmp, flag_oov, _ = oov_to_vectors(t, model)
            if flag_oov:
                we += we_tmp
                we_counter += 1
    # normalise by the number of embeddings
    if we_counter>0:
        we = we/we_counter
    return we, we_counter

#%%
def jt_to_vectors_nofile(x, model):
    '''
    Transform a job title into a float32 word embedding.

    Returns the embedding from sentence_to_vectors_nofile, or an all-zero
    vector of the same shape when no word of the title could be embedded.
    '''
    we, we_counter = sentence_to_vectors_nofile(x,model)
    if we_counter>0:
        return np.float32(we)
    # nothing was turned into a word embedding: return zeros, sized like the
    # returned vector instead of a hard-coded 100 dimensions
    return np.zeros(np.shape(we), dtype = np.float32)
    
#%%
def skills_to_vectors_nofile(x, model):
    '''
    Compute a word embedding for a list of skills as an average of averages.

    x: the skill list written as a Python literal in a string, or an object
        whose `.values[0]` is such a string.
    model: word-embedding model passed on to sentence_to_vectors_nofile.

    Returns a float32 vector: the mean of the per-skill embeddings, or all
    zeros if no skill could be embedded.
    '''
    # NOTE(review): eval() executes whatever the string contains. The input
    # comes from a CSV column, so consider ast.literal_eval if that file is not
    # fully trusted.
    if isinstance(x, str):
        skills = eval(x)
    else:
        skills = eval(x.values[0])
    # size the accumulator from the model; 100 is the previous hard-coded value,
    # kept as fallback for models without a `vector_size` attribute
    we = np.zeros(getattr(model, 'vector_size', 100), dtype = np.float32)
    we_counter = 0
    for z in skills:
        we_skill, skill_counter = sentence_to_vectors_nofile(z, model)
        # note that we_skill is already normalised by skill_counter
        if skill_counter>0:
            # if at least one word making up this skill was in the vocabulary,
            # add it to the overall word embedding:
            we += we_skill
            we_counter += 1
    if we_counter>0:
        we = we/we_counter
    return np.float32(we)

In [None]:
# load the whole dataset (one CSV per year), or only 2012 when FULLDS is False
# NOTE(review): hard-coded absolute local path — consider a configurable data
# directory defined once near the top of the notebook.
filename= os.path.join('/Users/stefgarasto/Local-Data/Burning-glass/',
     'Job_ads_2012_2018/{}_reliable_soc_ads.csv')

all_years = ['2012','2013','2014','2015','2016','2017','2018']

print('Loading the dataset')
t0 = time.time()

FULLDS = False
if FULLDS:
    # read every year first and concatenate once: growing the frame with
    # pd.concat inside the loop re-copies all rows loaded so far at each step
    yearly_frames = []
    n_rows = 0
    for year in all_years:
        yearly_frames.append(pd.read_csv(filename.format(year)))
        n_rows += len(yearly_frames[-1])
        # running total of rows loaded so far
        print(n_rows)
    bgdata = pd.concat(yearly_frames)
else:
    bgdata = pd.read_csv(filename.format('2012'))
    
print('Time in minutes: {:.4f}'.format((time.time()- t0)/60))



In [None]:
# extract good portions of data
# flag adverts whose `clusters` entry has length 2 (presumably the text '[]',
# i.e. an empty list stored as a string — TODO confirm) and print their share
bgdata['empty_clusters'] = bgdata['clusters'].map(lambda x: len(x)==2)
print(bgdata['empty_clusters'].sum()/len(bgdata))

soc_flag = bgsoc_is_newsoc(bgdata)
# share of adverts with an education requirement, soc_flag set and non-empty clusters
print(((~bgdata['MinEdu'].isnull()) & (soc_flag) & (~bgdata['empty_clusters'])).sum()/len(bgdata))

# keep adverts that state an education OR an experience requirement
edu_or_exp = (~bgdata['MinEdu'].isnull()) | (~bgdata['MinExp'].isnull())

# .copy(): new columns are added to this selection below and in later cells;
# without it they would be written to a slice of bgdata (SettingWithCopyWarning)
bgdata_select = bgdata[edu_or_exp & (soc_flag) & (~bgdata['empty_clusters'])].copy()

bgdata_select['Eduv2'] = bgdata_select['MinEdu'].map(group_eduv2)
bgdata_select['Exp3'] = bgdata_select['MinExp'].map(group_exp3)

In [None]:
# Embed every advert: turn its `converted_skills` entry into one average
# word-embedding vector.
t0 = time.time()
print('Computing the word embedding for the skills')

def _embed_skills(skills):
    return skills_to_vectors_nofile(skills, model)

bgdata_select['skills_embedding'] = bgdata_select['converted_skills'].map(_embed_skills)
print_elapsed(t0, 'above')

In [None]:
## compute average word embedding for the skills
#t0 = time.time()
#print('Computing the word embedding for the skills')
#bgdata_select['skills_embedding'] = bgdata_exp['converted_skills'].map(
#        lambda x: skills_to_vectors_nofile(x, model))
#print_elapsed(t0, 'above')

In [None]:
# Match each advert's average skills embedding to the closest skill cluster.
t0 = time.time()

def _closest_cluster(embedding):
    return highest_similarity(embedding, comparison_vecs, clus_names)

bgdata_select['best_cluster'] = bgdata_select['skills_embedding'].map(_closest_cluster)

print_elapsed(t0,'')


In [None]:
#t0 = time.time()
##ja_exp_v_clus = {}
##for ix,k in enumerate(bgdata_exp.index):
##    test_skills = bgdata_exp['skills_embedding'].loc[k]
##
##    ja_exp_v_clus[k] = high_similarity(test_skills, comparison_vecs, clus_names)
#bgdata_exp['best_cluster'] = bgdata_exp['skills_embedding'].map(lambda x: highest_similarity(x, 
#                                                                                comparison_vecs, clus_names))
#print_elapsed(t0,'')

In [None]:
print('Computing the average salary')
# midpoint of the advertised salary range; it stays NaN when either bound is
# missing (the variant that filled gaps with the overall mean is disabled)
min_salary = bgdata_select['MinSalary']
max_salary = bgdata_select['MaxSalary']
bgdata_select['MeanSalary'] = (min_salary + max_salary)/2.0

In [None]:
# one group of job adverts per assigned skill cluster
bg_groups = bgdata_select.groupby('best_cluster')

In [None]:
#### Quick way: just pick the cluster to which skills are most associated
# Disabled: the code below sits inside a bare string literal, so it never runs
# and the cell only prints 'not now'. It refers to `bgdata_edu`, which is not
# assigned anywhere in the visible part of this notebook.
'''
cluster_counters = []
t0 = time.time()
for ix, k in enumerate(bgdata_edu.index):
    cluster_counters.append(collections.Counter(eval(bgdata_edu['clusters'].loc[k])))
    if ix%200000 == 199999:
        print(ix)
print((time.time()- t0)//60)
'''
print('not now')

In [None]:
# Disabled: the helper and its timing code are wrapped in a bare string literal,
# so nothing in it runs and the cell only prints 'not now'. The text builds on
# `cluster_counters` from the previous (also disabled) cell.
'''def check_dominant_cluster(x):
    tmp = sorted(list(x.values()))[::-1]
    if len(tmp)>1:
        return tmp[0] - tmp[1]
    else:
        return tmp[0]

t0 =time.time()
check = [check_dominant_cluster(elem) for elem in cluster_counters]
print_elapsed(t0)'''
print('not now')

In [None]:
# Disabled: the histogram code is wrapped in a bare string literal, so nothing
# in it runs and the cell only prints 'not now'.
'''plt.hist(check)
print((sum([elem>0 for elem in check])))
print((sum([elem>0 for elem in check]))/len(check))'''
print('not now')

### NOS are now mapped to job adverts via the skill cluster membership
#### extract the exp/edu/salary requirements

That is: 
1. for each skill cluster, find the distribution of experience, education, SOC (first digit) and salary from the job advert dataset
2. For each distribution, find the peak (most frequent) education/experience/SOC1 value and the mean salary
3. Assign the correct distribution and peak requirements to NOS according to skill cluster membership


In [None]:
# For every skill cluster, summarise the requirements of the job adverts
# assigned to it: the value counts of each column plus a '<col>-peak' entry
# (most frequent value; for MeanSalary the mean, ignoring NaN).
ix = 0  # number of clusters summarised so far

cols_v_clus = {}

for name,group in bg_groups:
    cols_v_clus[name] = {}
    if not len(group):
        print(name)
        continue
    for col in ['Exp3','Eduv2','MeanSalary','SOC']:
        counts = group[col].value_counts()
        cols_v_clus[name][col] = counts
        if col == 'MeanSalary':
            cols_v_clus[name][col + '-peak'] = np.nanmean(group[col])
        elif counts.empty:
            # value_counts drops missing values, so a column with nothing else
            # has no most-frequent value. The fallback is stored under the same
            # '-peak' key that is read later (it used to go to a misspelt
            # '-peark' key, which left '-peak' missing).
            cols_v_clus[name][col + '-peak'] = 'unknown'
        else:
            cols_v_clus[name][col + '-peak'] = counts.idxmax()
    ix+=1

In [None]:
# display the columns now available on the selected job adverts
bgdata_select.columns

In [None]:
# select NOS from engineering
engineering_nos = df_nos[df_nos['supersuite']== 'engineering']
print(len(engineering_nos))

In [None]:
def map_nos_to_req(x,col,cols_v_clus):
    '''
    Look up the peak requirement of a skill cluster.

    x: cluster name (may be NaN for a NOS without an assigned cluster).
    col: requirement column, e.g. 'Exp3'; the value is read from col + '-peak'.
    cols_v_clus: dict mapping cluster name -> dict of per-column summaries.

    Returns the stored peak value, or NaN when the cluster is unknown or has no
    peak entry for this column (e.g. a cluster that received no job adverts),
    instead of raising KeyError.
    '''
    return cols_v_clus.get(x, {}).get(col + '-peak', np.nan)

# give every engineering NOS the peak requirements of its best skill cluster
for col in ['Exp3','Eduv2','MeanSalary','SOC']:
    peak_col = col + '-peak'
    best_clusters = engineering_nos['best_cluster_nos']
    engineering_nos[peak_col] = best_clusters.map(
        lambda x: map_nos_to_req(x, col, cols_v_clus))
    

In [None]:
# display the columns of the engineering NOS table, including the new '-peak' ones
engineering_nos.columns

In [None]:
# group the engineering NOS by suite and list how many NOS each suite holds
eng_groups = engineering_nos.groupby('One_suite')
for name, group in eng_groups:
    n_nos = len(group)
    print(name, n_nos)

In [None]:
# Progression pathway of one suite: salary against experience level, coloured
# by qualification requirement.
group = eng_groups.get_group('aeronautical engineering suite 3').rename(
    columns = {'Eduv2-peak': 'Qualification requirements'})
ax = sns.swarmplot(data = group,
                   x = 'Exp3-peak', y = 'MeanSalary-peak',
                   hue = 'Qualification requirements',
                   order = ['Entry-level', 'Mid-level','Senior-level'])
ax.set_ylabel('Average salary', fontsize = 18)
ax.set_xlabel('Experience requirements', fontsize = 18)
# NOTE(review): 'auronautical' in the file name looks like a typo for
# 'aeronautical'; left unchanged so existing references to the file keep working.
plt.savefig(output_dir + '/NOS_progression_pathway_auronautical_engineering_suite_3_v1.png')

In [None]:
# count how many `converted_skills` entries are strings (True) and how many are not (False)
bgdata['converted_skills'].map(lambda x: isinstance(x, str)).value_counts()

In [None]:
# count adverts whose `clusters` entry is longer than 2 characters (True) vs. not (False)
bgdata['clusters'].map(lambda x: len(x)>2).value_counts()

### Collect some examples

In [None]:
# display the available columns before picking the ones to show in the examples
engineering_nos.columns


In [None]:
# example: for one suite, show each NOS next to its assigned cluster and the
# peak requirements mapped from that cluster
eng_groups.get_group('aeronautical engineering suite 3')[
    ['Title','best_cluster_nos','Exp3-peak','Eduv2-peak','MeanSalary-peak','Clean SOC Code','SOC-peak']]

In [None]:
# Print the normalised experience and education distributions of two example
# clusters, separated by a line of asterisks.
for position, cluster in enumerate(['welding and machining', 'electrical engineering']):
    if position:
        print('*'*90)
    for key in ['Exp3','Eduv2']:
        distribution = cols_v_clus[cluster][key]
        print(distribution/distribution.sum())
        print('-'*30)
