In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
import itertools
import json
import numpy as np
import pandas as pd
import pickle
import requests
import seaborn as sns
import collections
from collections import Counter
import scipy
import time
import copy
from collections import OrderedDict

#import matplotlib as mpl
#import matplotlib.gridspec as gridspec
#from matplotlib.patches import Rectangle
#import matplotlib.patches as mpatches

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.pca import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import clone

import nltk
import gensim
import re
from fuzzywuzzy import process
import networkx as nx
import xlrd

from scipy.spatial import distance
import hdbscan

In [None]:
# Bring in the colour definitions from the project-local `utils_nos` module
# and print them as a quick check that the import worked.
from utils_nos import nesta_colours, nesta_colours_combos
print(nesta_colours, nesta_colours_combos)

In [None]:
# flatten lists of lists
def flatten_lol(t):
    """Concatenate the items of every inner iterable of ``t`` into one flat list.

    Only one level of nesting is removed.
    """
    flat = []
    for inner in t:
        flat.extend(inner)
    return flat
# quick demonstration of the helper
flatten_lol([[1,2],[3],[4,5,6]])

def print_elapsed(t0_local, task = 'current task'):
    """Print how many seconds have passed since ``t0_local`` for the named task.

    ``t0_local`` is a timestamp previously obtained from ``time.time()``.
    Nothing is returned.
    """
    elapsed = time.time() - t0_local
    message = 'Done with {}. Elapsed time: {:4f}'.format(task, elapsed)
    print(message)

In [None]:
#These two functions are useful for analysing bi and tri-grams with w2v models in gensim

def convert_to_undersc(skill):
    '''
    Convert the spaces in a skill phrase into underscores, so that the phrase
    can be looked up in a trained w2v model.

    A single-word skill is returned unchanged. This is the inverse of
    convert_from_undersc.
    '''
    # The separator must be '_' (it used to be '-'), as stated above and as
    # expected by convert_from_undersc.
    return '_'.join(skill.split(' '))

def convert_from_undersc(skill):
    '''
    Turn the underscores that separate the terms of a skill phrase back
    into spaces. A phrase without underscores is returned unchanged.
    '''
    return skill.replace('_', ' ')

# Map each part-of-speech tag used in this notebook to the one-letter code
# passed to the WordNet lemmatiser: 'a', 'r', 'n' or 'v'.
pos_to_wornet_dict = {
    tag: wordnet_code
    for wordnet_code, tags in (
        ('a', ('JJ', 'JJR', 'JJS')),
        ('r', ('RB', 'RBR', 'RBS')),
        ('n', ('NN', 'NNP', 'NNS', 'NNPS')),
        ('v', ('VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ')),
    )
    for tag in tags
}

# A few functions for tidying up text
def tag_for_lemmatise(s):
    """Return the WordNet part-of-speech code for a single token.

    The token is tagged on its own with ``nltk.pos_tag`` and the resulting tag
    is translated through ``pos_to_wornet_dict``. If tagging fails or the tag
    has no entry in the dictionary, the noun code ``'n'`` is returned.
    """
    try:
        return pos_to_wornet_dict[nltk.pos_tag([s])[0][1]]
    except Exception:
        # Deliberate best effort: any tagging or lookup failure falls back to
        # 'noun'. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return 'n'
    
def lemmatise(title_terms):
    """
    Lemmatise every term in a list with the nltk WordNet lemmatiser.

    Takes a list of terms and returns a list of the same length.
    The lemmatiser is called without a part-of-speech argument; the purpose
    is to convert plural forms into singular ones. Terms listed in the
    manually curated ``keep_asis`` collection are left untouched.
    >>> lemmatise(['teachers'])
    ['teacher']
    >>> lemmatise(['analytics'])
    ['analytics']
    """
    keep_asis = ['sales', 'years', 'goods', 'operations', 'systems',
                    'communications', 'events', 'loans', 'grounds',
                    'lettings', 'claims', 'accounts', 'relations',
                    'complaints', 'services']
    wnl = nltk.WordNetLemmatizer()
    processed_terms = []
    for term in title_terms:
        if term in keep_asis:
            # curated plural: keep exactly as given
            processed_terms.append(term)
        else:
            processed_terms.append(wnl.lemmatize(term))
    return processed_terms

def lemmatise_with_pos(title_terms):
    """
    Lemmatise a list of (token, pos_tag) pairs with the nltk WordNet lemmatiser.

    Takes a list of 2-tuples ``(token, tag)``, where ``tag`` must be a key of
    ``pos_to_wornet_dict`` (a KeyError is raised otherwise, unless the token
    is in ``keep_asis``). The tag is translated to a WordNet code and passed
    to the lemmatiser together with the token. Tokens listed in the manually
    curated ``keep_asis`` collection are returned untouched.
    Returns a list of strings, one per input pair.
    """
    keep_asis = ['sales', 'years', 'goods', 'operations', 'systems',
                    'communications', 'events', 'loans', 'grounds',
                    'lettings', 'claims', 'accounts', 'relations',
                    'complaints', 'services']
    wnl = nltk.WordNetLemmatizer()
    processed_terms = []
    for token, tag in title_terms:
        if token in keep_asis:
            # curated plural: keep exactly as given (the tag is not looked up)
            processed_terms.append(token)
        else:
            processed_terms.append(wnl.lemmatize(token, pos_to_wornet_dict[tag]))
    return processed_terms

def lemmatise_pruned(x, pofs = 'nv'):
    """Keep only nouns (and optionally verbs) from tagged tokens, then lemmatise.

    ``x`` is an iterable of (token, pos_tag) pairs. With ``pofs='nv'`` pairs
    whose tag starts with 'V' or 'N' are kept; with ``pofs='n'`` only those
    starting with 'N'. Any other value raises ValueError. The kept pairs are
    passed to ``lemmatise_with_pos`` and its result is returned.
    """
    if pofs == 'nv':
        wanted_initials = ['V','N']
    elif pofs == 'n':
        wanted_initials = ['N']
    else:
        raise ValueError
    tags = []
    for token, tag in x:
        if tag[:1] in wanted_initials:
            tags.append((token, tag))
    return lemmatise_with_pos(tags)

def remove_digits(s):
    """
    Strip every digit character from a string.

    Takes a string and returns a string; all other characters (including
    the whitespace around removed digits) are preserved.
    >>> remove_digits('2 recruitment consultants')
    ' recruitment consultants'
    """
    kept_chars = [ch for ch in s if not ch.isdigit()]
    return ''.join(kept_chars)

def remove_list_enumeration(s):
    '''
    Remove the stand-alone "k" and "p" tokens left over from NOS list
    enumerations.

    NOS lists are enumerated by strings like K+number or P+number. After
    lower-casing and digit removal these become " k " and " p ", which are
    replaced here by a single space. Returns the cleaned string.
    '''
    result = s
    # the "k" markers are handled first, then the "p" markers
    for marker_pattern in ('( k )+', '( p )+'):
        result = re.sub(marker_pattern, ' ', result)
    # it might not be necessary if 'k' and 'p' are added to the stopwords
    return result

# ASCII punctuation to blank out: everything in string.punctuation except the
# apostrophe "'". The backslash is written once, as the valid escape '\\'
# (the previous literal also contained the invalid escape '\(').
select_punct = set('!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~') #only removed "'"
# Additional non-ASCII symbols that are also replaced by a space.
extra_chars = set('–-•’”“µ¾âãéˆﬁ[€™¢±ï…˜')
# Every character that replace_punctuation turns into a space.
all_select_chars = select_punct.union(extra_chars)

def replace_punctuation(s):
    """
    Replace every character of ``s`` that belongs to ``all_select_chars``
    with a single space.

    Takes a string and returns a string of the same length.
    >>> replace_punctuation('sales executives/ - london')
    'sales executives   london'
    """
    return ''.join(' ' if ch in all_select_chars else ch for ch in s)

def tidy_desc(desc):
    """Clean a free-text description and return it as a space-joined string.

    Steps, in order: drop '\\r\\n' and non-breaking spaces, lower-case, remove
    digits, replace punctuation with spaces, split on whitespace and
    lemmatise each token with ``lemmatise``.
    """
    text = desc.replace('\r\n', '').replace('\xa0', '').lower()
    text = replace_punctuation(remove_digits(text))
    #text = remove_list_enumeration(text)
    return ' '.join(lemmatise(text.split()))

def tidy_desc_with_pos(desc):
    """Clean a description, keeping only tokens whose POS tag is known.

    Same cleaning as ``tidy_desc`` (drop '\\r\\n' and non-breaking spaces,
    lower-case, remove digits, blank punctuation). Each token is then tagged
    on its own with ``nltk.pos_tag``; tokens whose tag is not a key of
    ``pos_to_wornet_dict`` are discarded and the rest are lemmatised with
    their tag. Returns the lemmas joined by single spaces.
    """
    cleaned = desc.replace('\r\n', '').replace('\xa0', '')
    tokens = replace_punctuation(remove_digits(cleaned.lower())).split()
    tagged = []
    for token in tokens:
        # each token is tagged in isolation (no sentence context)
        tag = nltk.pos_tag([token])[0][1]
        if tag in pos_to_wornet_dict.keys():
            tagged.append((token, tag))
    return ' '.join(lemmatise_with_pos(tagged))

def tokenize(text):
    """
    Split a string into tokens with ``nltk.word_tokenize``.

    Takes a string and returns the list of tokens. Intended to be passed as
    the ``tokenizer`` argument of TfidfVectorizer.
    >>> tokenize('some job title')
    ['some', 'job', 'title']
    """
    return nltk.word_tokenize(text)

def tokenize_asis(some_list):
    """
    Lower-case the elements of an already tokenised document.

    Takes a list of strings and returns a new list with every element
    converted to lower case; multi-word elements are kept as single tokens.
    Intended to be passed as the ``tokenizer`` argument of TfidfVectorizer.

    >>> tokenize_asis(['Accounting', 'Microsoft Excel'])
    ['accounting', 'microsoft excel']
    """
    tokens = []
    for elem in some_list:
        tokens.append(elem.lower())
    return tokens


#This set of functions is useful for identifying terms with highest tf-idf weights 
#in a single document or set of documents

def top_tfidf_feats(row, features, top_n=25):
    ''' Return the ``top_n`` largest values of ``row`` with their feature names.

        ``row`` is indexed by the positions returned by ``np.argsort`` and
        ``features`` is indexed with the same positions. The result is a
        DataFrame with columns 'feature' and 'tfidf', sorted by decreasing
        value.'''
    ranked_ids = np.argsort(row)[::-1][:top_n]
    pairs = []
    for idx in ranked_ids:
        pairs.append((features[idx], row[idx]))
    table = pd.DataFrame(pairs)
    table.columns = ['feature', 'tfidf']
    return table

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25, sparse_output = False):
    ''' Return the top n features that on average are most important
        amongst the documents (rows of ``Xtr``) identified by ``grp_ids``.

        Xtr:       matrix exposing ``.toarray()`` (e.g. a scipy sparse matrix).
        features:  feature names, indexable by column position.
        grp_ids:   row selector (list, tuple, NumPy array, ...). ``None`` or an
                   empty selector means "use all rows".
        min_tfidf: values below this threshold are set to 0 before averaging.
        top_n:     number of features to return.

        Returns the DataFrame built by ``top_tfidf_feats``, or that frame
        wrapped in ``scipy.sparse.csr_matrix`` when ``sparse_output`` is True.
    '''
    # `if grp_ids:` raised "truth value is ambiguous" for NumPy index arrays,
    # so emptiness is tested explicitly for anything with a length.
    if grp_ids is None:
        select_rows = False
    elif np.ndim(grp_ids) > 0:
        select_rows = len(grp_ids) > 0
    else:
        select_rows = bool(grp_ids)

    if select_rows:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    top_feats = top_tfidf_feats(tfidf_means, features, top_n)
    if sparse_output:
        # NOTE(review): the frame has a string 'feature' column, so this
        # conversion looks unlikely to succeed - confirm whether the sparse
        # path is ever used.
        return scipy.sparse.csr_matrix(top_feats)
    return top_feats

def all_mean_feats(Xtr, grp_ids=None, min_tfidf=0.1):
    ''' Return the column-wise average amongst the documents (rows of ``Xtr``)
        identified by ``grp_ids``.

        Xtr:       matrix exposing ``.toarray()`` (e.g. a scipy sparse matrix).
        grp_ids:   row selector (list, tuple, NumPy array, ...). ``None`` or an
                   empty selector means "use all rows".
        min_tfidf: values below this threshold are set to 0 before averaging.

        Returns a 1-D array with one mean per column.
    '''
    # `if grp_ids:` raised "truth value is ambiguous" for NumPy index arrays,
    # so emptiness is tested explicitly for anything with a length.
    if grp_ids is None:
        select_rows = False
    elif np.ndim(grp_ids) > 0:
        select_rows = len(grp_ids) > 0
    else:
        select_rows = bool(grp_ids)

    if select_rows:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    return np.mean(D, axis=0)

def get_top_words_weights(desc, feature_names, vect, n = 25):
    """Transform ``desc`` with the fitted vectoriser ``vect`` and return the
    ``n`` features with the highest mean weight, as produced by
    ``top_mean_feats`` over all rows."""
    tfidf_rows = vect.transform(desc)
    return top_mean_feats(tfidf_rows, feature_names, grp_ids = None, top_n = n)

def get_mean_tfidf(desc, vect):
    """Transform ``desc`` with the fitted vectoriser ``vect`` and return the
    per-feature mean over all rows, as computed by ``all_mean_feats``."""
    tfidf_rows = vect.transform(desc)
    return all_mean_feats(tfidf_rows, grp_ids = None)

def get_top_words(desc, feature_names, vect, n = 25):
    """Transform ``desc`` with the fitted vectoriser ``vect`` and return only
    the names (an array) of the ``n`` features with the highest mean weight."""
    top_table = top_mean_feats(vect.transform(desc), feature_names, grp_ids = None, top_n = n)
    return top_table['feature'].values


# manually remove "k"s and "p"s from the pruned columns
def remove_pk(x):
    """Return the tokens of ``x`` as a list, without any 'k' or 'p' token."""
    enumeration_markers = ['k','p']
    kept = []
    for token in x:
        if token not in enumeration_markers:
            kept.append(token)
    return kept
#df_nos['pruned'] = df_nos['pruned'].map(remove_pk)


In [None]:
# First, create your TFidfVectorizer model. This doesn't depend on whether it's used on suites or NOS. However,
# it does require that the docs collection is already given as a collection of tokens (tokenizer=tokenize_asis)

#Since we now have not just long strings in our documents, but lists of terms, we will use a different tokenizer
def define_tfidf(params, stopwords):
    """Build an (unfitted) TfidfVectorizer for documents given as token lists.

    params:    dict with keys 'ngrams' ('bi' -> uni+bigrams, 'tri' -> up to
               trigrams, anything else -> unigrams), 'tfidf_max' (max_df)
               and 'tfidf_min' (min_df).
    stopwords: passed through as ``stop_words``.

    The vectoriser uses ``tokenize_asis`` as tokenizer and does not lower-case
    the input itself.
    """
    if params['ngrams'] == 'bi':
        ngram_range = (1,2)
    elif params['ngrams'] == 'tri':
        ngram_range = (1,3)
    else:
        # unigrams is the default, i.e. scikit-learn's own default range
        ngram_range = (1,1)
    return TfidfVectorizer(tokenizer=tokenize_asis,
                           lowercase = False,
                           stop_words=stopwords,
                           ngram_range=ngram_range,
                           max_df = params['tfidf_max'],
                           min_df = params['tfidf_min'])


# now, collect the text to transform
def combine_nos_text(df_nos, col = 'pruned'):
    """Concatenate the token lists of all documents belonging to each suite.

    Rows of ``df_nos`` are grouped by the 'One_suite' column; within each
    group the lists stored in column ``col`` are joined in row order.
    Returns a DataFrame indexed by suite name with a single 'tokens' column.
    """
    suite_names = []
    suite_tokens = []
    for suite, members in df_nos.groupby('One_suite'):
        column = members[col]
        merged = []
        for label in column.index:
            merged.extend(column.loc[label])
        suite_names.append(suite)
        suite_tokens.append(merged)
    return pd.DataFrame({'tokens': suite_tokens}, index = suite_names)

def get_tfidf_matrix(params, df_nos, tfidf, col = 'pruned'):
    """Fit ``tfidf`` on the tokenised NOS and return the tf-idf matrix.

    params: dict with key 'bywhich' ('docs' or 'suites') and, when
            'bywhich' is 'suites', key 'mode' ('meantfidf' or 'combinedtfidf').
              - 'docs': one row per NOS.
              - 'suites' + 'meantfidf': the transform is fitted on individual
                NOS, then each suite row is the mean of its NOS rows.
              - 'suites' + 'combinedtfidf': the tokens of each suite are
                concatenated first, so min/max document frequencies are
                computed with the number of suites as collection size.
    df_nos: DataFrame with column ``col`` (token lists) and 'One_suite'.
    tfidf:  unfitted vectoriser; it is fitted in place.

    Returns ``(tfidfm, feature_names, tfidf, textfortoken)``. For the two
    suite modes the rows follow the (sorted) order of
    ``df_nos.groupby('One_suite')``.

    Raises ValueError for an unknown 'bywhich'/'mode' combination.

    Note: this can simply be used to get the tfidf transform, by setting
    bywhich=docs and any mode.
    """
    t0 = time.time()
    bywhich = params['bywhich']
    # 'mode' is only required when working by suites
    mode = params['mode'] if bywhich == 'suites' else None
    use_suite_means = (bywhich == 'suites') and (mode == 'meantfidf')

    # first, get the collection of token lists
    if bywhich == 'docs' or use_suite_means:
        textfortoken = df_nos[col]
    elif bywhich == 'suites' and mode == 'combinedtfidf':
        # TODO: allow for the alternative case, where the transform is computed
        # on individual NOS and then applied to the joint tokens
        textfortoken = combine_nos_text(df_nos, col)['tokens']
    else:
        raise ValueError('Unsupported combination bywhich={!r}, mode={!r}'.format(
            bywhich, params.get('mode')))

    # apply tfidf transform to the tokenised text
    tfidfm = tfidf.fit_transform(textfortoken)

    # `get_feature_names` no longer exists in recent scikit-learn releases
    if hasattr(tfidf, 'get_feature_names'):
        feature_names = tfidf.get_feature_names()
    else:
        feature_names = list(tfidf.get_feature_names_out())

    # if the average is needed, compute it and overwrite the matrix. Note that
    # the step above is still needed to initialise the tfidf transform with
    # the proper features and stopwords
    if use_suite_means:
        suite_groups = df_nos.groupby('One_suite')
        tfidfm = scipy.sparse.lil_matrix((suite_groups.ngroups, len(feature_names)),
                                         dtype = np.float32)
        # the row index comes from enumerate (it was previously an undefined name)
        for igroup, (name, group) in enumerate(suite_groups):
            tfidfm[igroup] = get_mean_tfidf(group[col], tfidf)

    print_elapsed(t0, 'computing the tfidf matrix')
    return tfidfm, feature_names, tfidf, textfortoken


In [None]:
def assign_supersuite(x):
    """Return the lower-cased name of the first super-suite in the global
    ``all_match_names`` whose list of suite names contains ``x``, or
    'other' when none does."""
    for supersuite, suite_names in all_match_names.items():
        if x in suite_names:
            return supersuite.lower()
    # if no match has been found
    return 'other'

def adjustsoccode(x):
    """Extract a SOC code string from ``x``.

    ``x`` is converted to a string and the first run of digits/apostrophes is
    located; its first and last characters are dropped before returning it
    (so a quoted code such as "'1234'" yields '1234'). Returns ``np.nan``
    when no such run exists.
    """
    matches = re.findall(r"[\d']+", str(x))
    if not matches:
        return np.nan
    return matches[0][1:-1]

def extract2digits(x):
    """Return the first two characters of the string ``x`` as a float.

    Returns ``np.nan`` when ``x`` is not a string or when the prefix cannot
    be parsed as a number.
    """
    if not isinstance(x,str):
        return np.nan
    try:
        return float(x[:2])
    except ValueError:
        # only the parsing failure is swallowed (this was a bare except)
        return np.nan
    
def extract3digits(x):
    """Return the first three characters of the string ``x`` as a float.

    Returns ``np.nan`` when ``x`` is not a string or when the prefix cannot
    be parsed as a number.
    """
    if not isinstance(x,str):
        return np.nan
    try:
        return float(x[:3])
    except ValueError:
        # only the parsing failure is swallowed (this was a bare except)
        return np.nan
    
def extract1digits(x):
    """Return the first character of the string ``x`` as a float.

    Returns ``np.nan`` when ``x`` is not a string or when that character
    cannot be parsed as a number.
    """
    if not isinstance(x,str):
        return np.nan
    try:
        return float(x[:1])
    except ValueError:
        # only the parsing failure is swallowed (this was a bare except)
        return np.nan

def extract4digits(x):
    """Return the whole string ``x`` converted to a float.

    Unlike the 1/2/3-digit helpers no prefix is taken: the complete string is
    parsed. Returns ``np.nan`` when ``x`` is not a string or cannot be parsed
    as a number.
    """
    if not isinstance(x,str):
        return np.nan
    try:
        return float(x)
    except ValueError:
        # only the parsing failure is swallowed (this was a bare except)
        return np.nan
    

In [None]:
def prep_for_gensim(list_of_terms, some_model, weights = None):
    """Keep only the terms (and their weights) that exist in a word-vector model.

    list_of_terms: phrases; spaces are converted with ``convert_to_undersc``
                   before the lookup.
    some_model:    any object supporting ``term in some_model``.
    weights:       optional sequence aligned with ``list_of_terms``.

    Returns ``(is_in, weights_in)``: the converted terms found in the model
    and the matching weights. ``weights_in`` is ``None`` when no weights were
    given (previously this case raised UnboundLocalError).
    """
    # replace space with underscore
    new_terms = [convert_to_undersc(elem) for elem in list_of_terms]
    # check once whether each element in the list is in the model
    found = [elem in some_model for elem in new_terms]
    is_in = [elem for elem, keep in zip(new_terms, found) if keep]
    # also filter the weights; the explicit None/len test also works for
    # NumPy arrays, whose truth value is ambiguous
    if weights is not None and len(weights) > 0:
        weights_in = [weights[ix] for ix, keep in enumerate(found) if keep]
    else:
        weights_in = None
    # only return the elements in the model
    return is_in, weights_in

def get_mean_vec(skill_list, model, weights= None):
    """Average the (optionally weighted) word vectors of the skills found in ``model``.

    skill_list: phrases; spaces are converted with ``convert_to_undersc``
                before the lookup.
    model:      object supporting ``term in model`` and ``model[term]``.
    weights:    optional sequence aligned with ``skill_list``; defaults to
                all ones when missing or empty.

    Returns ``(avg_vec, vec_array)``: the mean over the found skills of
    ``vector * weight`` (a plain mean of the scaled vectors - it is not
    divided by the sum of the weights) and the array of unweighted vectors.
    """
    # explicit test: `not weights` raised for NumPy arrays (ambiguous truth value)
    if weights is None or len(weights) == 0:
        weights = np.ones(len(skill_list))
    skill_list_conv = [convert_to_undersc(elem) for elem in skill_list]
    vector_list = []
    wvector_list = []
    for ix, elem in enumerate(skill_list_conv):
        if elem in model:
            vector = model[elem]
            vector_list.append(vector)
            wvector_list.append(vector*weights[ix])
    vec_array = np.asarray(vector_list)
    wvec_array = np.asarray(wvector_list)
    avg_vec = np.mean(wvec_array, axis=0)
    return avg_vec, vec_array

def extract_top_words(tfidfv, feature_names, N=20):
    """Return the (at most) ``N`` highest-weighted, non-zero features of row 0.

    tfidfv:        2-D matrix-like whose ``tfidfv[0,:]`` argsorts to a nested
                   (1 x n) result, as the ``.tolist()[0]`` step requires
                   (presumably an ``np.matrix`` - confirm against callers).
    feature_names: names indexable by column position.

    Returns ``(top_ngrams, top_weights, top_features)``: column indices,
    their weights and their names, by decreasing weight.
    """
    ranked_columns = np.argsort(tfidfv[0,:]).tolist()[0]
    candidates = ranked_columns[-N:][::-1]
    top_ngrams = []
    top_weights = []
    top_features = []
    for column in candidates:
        # only retain the ones with non zero features
        if tfidfv[0,column]>0:
            top_ngrams.append(column)
            top_weights.append(tfidfv[0,column])
            top_features.append(feature_names[column])
    return top_ngrams, top_weights, top_features



In [None]:
def select_subdf(SELECT_MODE, clusters2use, suites_clusters, df_nos_select, dependent_lists = None):
    """Select the NOS belonging to a super-suite or to a cluster of suites.

    SELECT_MODE:     str  -> name of a super-suite ('engineering',
                             'management', 'financialservices',
                             'construction'), matched against the
                             'supersuite' column;
                     int  -> index into ``clusters2use``, whose entries are
                             ``(hierarchical cluster ids, cluster name)``.
    clusters2use:    only used when SELECT_MODE is an int.
    suites_clusters: DataFrame with columns 'hierarchical' and 'Suite_names';
                     only used when SELECT_MODE is an int.
    df_nos_select:   DataFrame with columns 'supersuite', 'One_suite', 'SOC4'.
    dependent_lists: optional list of objects with one entry per row of
                     ``df_nos_select`` that accept a boolean NumPy mask
                     (e.g. arrays); they are filtered like the DataFrame.

    NOS without a SOC code are dropped, unless 90% or more of the selected NOS
    lack one, in which case all of them are kept.

    Returns ``(final_nos, final_groups, larger_suites, cluster_name,
    cluster_name_save, cluster_name_figs, dependent_list)`` where
    ``final_groups`` is ``final_nos`` grouped by 'One_suite' and
    ``dependent_list`` is the filtered ``dependent_lists`` (or None).
    For an int SELECT_MODE ``larger_suites`` contains every suite; for a str
    it contains the suites strictly larger than the 16th largest one (all
    suites when there are fewer than 16).

    Raises TypeError when SELECT_MODE is neither a str nor an int.
    """
    if isinstance(SELECT_MODE, str):
        tmp_dict = {'engineering': 'Engineering', 'management': 'Management',
                    'financialservices': 'Financial services',
                    'construction': 'Construction'}
        # select NOS from super suite
        cluster_name = SELECT_MODE
        cluster_name_save = cluster_name
        cluster_name_figs = tmp_dict[SELECT_MODE]
        indices = df_nos_select['supersuite']== SELECT_MODE
    elif isinstance(SELECT_MODE, int):
        cluster_name = clusters2use[SELECT_MODE][1]
        cluster_name_save = cluster_name.replace(' ','_')
        cluster_name_figs = cluster_name.capitalize()
        suites2use = list(suites_clusters[suites_clusters['hierarchical'].map(
                lambda x: x in clusters2use[SELECT_MODE][0])]['Suite_names'].values)
        indices = df_nos_select['One_suite'].map(
                lambda x: x in suites2use)
    else:
        raise TypeError('SELECT_MODE must be a str or an int, got {}'.format(
            type(SELECT_MODE).__name__))
    subset_nos = df_nos_select[indices]
    print('Number of NOS selected: ', len(subset_nos))

    # apply the same row selection to each dependent item (the body used to
    # read an undefined name and to index the list instead of the item)
    if dependent_lists:
        mask = np.asarray(indices)
        dependent_list2 = [item[mask] for item in dependent_lists]
    else:
        dependent_list2 = None

    # only keep the NOS with SOC codes, unless almost none of them has one
    nosoc = subset_nos['SOC4'].isnull()
    nosoc_fraction = nosoc.sum()/len(nosoc)
    print('percentage of nos without SOC codes: ', nosoc_fraction)
    if nosoc_fraction<0.9:
        final_nos = subset_nos[~nosoc]
        mask = np.asarray(~nosoc)
    else:
        final_nos = subset_nos
        # everything is kept, so the dependent items must be kept in full too
        mask = np.ones(len(subset_nos), dtype = bool)

    # apply the second selection to each dependent item
    if dependent_list2:
        dependent_list = [item[mask] for item in dependent_list2]
    else:
        dependent_list = None

    final_groups = final_nos.groupby(by = 'One_suite')
    # suite sizes, largest first
    all_lengths = np.sort(final_groups.size().values)[::-1]
    print('Number of NOS in suites belonging to this cluster: ',all_lengths)
    if isinstance(SELECT_MODE, int):
        larger_suites = [name for name, _ in final_groups]
    else:
        # size of the 16th largest suite; with fewer suites nothing is excluded
        threshold = all_lengths[15] if len(all_lengths) > 15 else -1
        larger_suites = [name for name, group in final_groups if len(group) > threshold]

    return final_nos, final_groups, larger_suites, cluster_name,  \
                    cluster_name_save, cluster_name_figs, dependent_list


## Set up some parameters and directories

In [None]:
# Folder scanned with os.listdir for the SVQ framework workbooks (.xlsx).
# NOTE(review): this and the other paths below are absolute, user-specific
# locations; the notebook will not run elsewhere without editing them.
svq_directory = '/Users/stefgarasto/Google Drive/Documents/data/NOS_meta_data/svq_qualifications/'

#%%
'''  Start of the main script   '''

#%% set up main parameters
#from set_params_thematic_groups import qualifier, qualifier0, pofs, WHICH_GLOVE, 
#from set_params_thematic_groups import glove_dir, paramsn
# Tags inserted into the file names of the pickled inputs loaded further down
# (NOS dataset, pruned tokens, stopwords).
qualifier = 'postjoining_final_no_dropped'
qualifier0 = 'postjoining_final_no_dropped'


# Name of the GloVe embedding set; not used in the part of the notebook shown here.
WHICH_GLOVE = 'glove.6B.100d' #'glove.6B.100d' #'glove.840B.300d', 
#glove.twitter.27B.100d

glove_dir = '/Users/stefgarasto/Local-Data/wordvecs/'


# In[3]:

# set up plot style
print(plt.style.available)
# NOTE(review): the 'seaborn-*' style names were renamed in newer matplotlib
# releases - confirm they exist in the installed version.
plt.style.use(['seaborn-darkgrid','seaborn-poster','ggplot'])
# parts of speech kept by lemmatise_pruned: 'n' = nouns only
pofs = 'n'
# results folder, suffixed with the part-of-speech setting and the qualifier
output_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/nlp_analysis/'
output_dir += 'svq_qualifications_{}_{}'.format(pofs, qualifier)
print(output_dir)
# folder holding the pickled NOS dataset and stopwords
lookup_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/extracted/'


## Load the svq data and do some processing

In [None]:
def lower_and_strip(x):
    """Normalise a spreadsheet cell: lower-case, trim and fix known typos.

    Strings are lower-cased and stripped, non-breaking spaces and newlines
    become plain spaces, and a fixed list of misspellings and spelling
    variants is corrected. Any non-string value is returned unchanged.
    """
    if not isinstance(x,str):
        return x
    x = x.lower().strip()
    # applied in this exact order
    substitutions = [
        ('\xa0', ' '),
        ('fualt', 'fault'),
        ('\n', ' '),
        ('sub-assemblies', 'sub assemblies'),
        ('commision', 'commission'),
        ('assesmbly', 'assembly'),
        ('sheetmetal', 'sheet metal'),
        ('compoents', 'components'),
        ('prodution', 'production'),
        #('– ', ''),
    ]
    for found, replacement in substitutions:
        x = x.replace(found, replacement)
    return x
    
# Parse every SVQ framework workbook found in `svq_directory` and collect:
#   svq_dict       : per file -> header info ('title', 'scqf overall level',
#                    'svq level') and a 'units' dict {unit title: level}
#   unit_dict      : per unit title -> {framework title: level}
#   unit_dict_mand : per unit title -> {framework title: value of the 'mandatory' column}
#   unit_codes     : per unit title -> list of raw values from the 'unit code' column
all_svq_files = os.listdir(svq_directory)
print('Number of files in directory: ',len(all_svq_files))
#print(all_svq_files)

svq_dict= {}
unit_dict = {}
unit_dict_mand = {}
unit_codes = {}
for svq_file in all_svq_files:
    # don't try to open non excel files
    if not svq_file.endswith('xlsx'):
        continue
    # don't try to open file beginning with ~
    if svq_file[0] == '~':
        continue
    try:
        xls = xlrd.open_workbook(os.path.join(svq_directory,svq_file), on_demand=True)
    except:
        print(svq_file)
        print('Can not read the file')
        continue
    # number of top rows inspected to find the header / separator
    Nrows = 14
    try:
        # skip a leading 'Drop Downs' sheet, if present
        if xls.sheet_names()[0] == 'Drop Downs': #len(xls.sheet_names)
            sheet_name = 1
        else:
            sheet_name = 0
        df_svq = pd.read_excel(os.path.join(svq_directory,svq_file), sheet_name = sheet_name, 
                               header = None, nrows= Nrows)
        # make everything lower case and remove extra white spaces
        df_svq = df_svq.applymap(lower_and_strip)
    except:
        # NOTE(review): there is no `continue` here, so after a failed read the
        # code below runs on `df_svq` from the previous file (or raises a
        # NameError if this is the first file) - confirm this is intended.
        print('Failed with ', svq_file)
    if len(df_svq)<Nrows:
        print(svq_file)
        continue
    svq_dict[svq_file] = {}
    # collect the entries in the first column if they are a string - together with the good indices
    L = [df_svq.iloc[t][0] for t in range(Nrows) if isinstance(df_svq.iloc[t][0],str)]
    tL = [t for t in range(Nrows) if isinstance(df_svq.iloc[t][0],str)]
    # find the row where "structure information" is
    try:
        separator_row = tL[['structure information' in t.lower() for t in L].index(True)]
        #if not any(['structure information' in t.lower() for t in L]):
        #    print(svq_file)
    except:
        print('*** Separator string not found in file \'{}\''.format(svq_file))
        separator_row = False
    # NOTE(review): a separator found in row 0 is also falsy and takes the
    # `else` branch below.
    if separator_row:
        # rows up to and including the separator form the header; the table
        # of units is re-read using the row after the separator as column names
        df_svq_header = df_svq.iloc[:separator_row+1]
        df_svq = pd.read_excel(os.path.join(svq_directory,svq_file), sheet_name= sheet_name, header= separator_row+1)
        # make everything lower case and remove extra white spaces
        df_svq = df_svq.applymap(lower_and_strip)
    else:
        # NOTE(review): processing continues after this message, so
        # `df_svq_header` still refers to the previous file (or is undefined
        # for the first file) - confirm this is intended.
        print('*** Something went wrong with file \'{}\''.format(svq_file))
    # extract info
    # 1. look for the title and the scqf in the header
    for search_item in ['title','scqf overall level']:
        flag = df_svq_header[0].map(lambda x: search_item in x.lower() if isinstance(x,str) else False)
        row = df_svq_header[flag]
        if len(row):
            # take the non null columns (of the first matching row)
            tmp = row.columns[(~row.isnull()).values[0]]
            # the first one is column 0 which is the description, the second non-null column is the value
            svq_dict[svq_file][search_item] = row[tmp[1]].values[0]
    # 2. look for the svq level
    flag = df_svq_header.applymap(lambda x: 'svq level' in x.lower() if isinstance(x,str) else False)
    if flag.sum().sum()>0:
        # locate the cell containing 'svq level' (column and row with exactly one hit)
        icol = flag.columns[flag.sum(axis = 0)==1].values[0]
        irow = flag.index[flag.sum(axis = 1)==1].values[0]
        # the value is the first non-NaN cell to the right of the label
        for col in range(icol+1, len(df_svq_header.columns)):
            tmp = df_svq_header.loc[irow][col]
            # NOTE(review): np.isnan is expected to raise a TypeError if the
            # cell holds a string - verify that these cells are always numeric.
            if not np.isnan(tmp):#.isnull():
                svq_dict[svq_file]['svq level'] = tmp
                break
    else:
        svq_dict[svq_file]['svq level'] = np.nan
    # 3. Get the Unit titles and their scqf levels
    #try:
    # columns are located by substring; the first match is used and an
    # IndexError is raised when a 'title', 'level' or 'mandatory' column is missing
    all_cols = [t for t in df_svq.columns if isinstance(t,str)]
    col_unit = [t for t in all_cols if 'title' in t.lower()][0]
    series_unit = df_svq[col_unit]
    col_scqf = [t for t in all_cols if 'level' in t.lower().replace('\n','')][0]
    series_scqf = df_svq[col_scqf]
    svq_dict[svq_file]['units'] = {}
    col_dev = [t for t in all_cols if 'unit code' in t.lower().replace('\n','')]
    if len(col_dev):
        col_dev = col_dev[0]
        has_a_code = True
    else:
        has_a_code = False
        #print(svq_file)
    col_mand = [t for t in all_cols if 'mandatory' in t.lower().replace('\n','')][0]
    series_mand = df_svq[col_mand]
    for ix,unit in enumerate(series_unit):
        if isinstance(unit,str):
            # check if I've found the unit already (exact or near identical match)
            is_a_match = unit in unit_dict
            # two families of unit titles are excluded from the fuzzy matching
            if (not is_a_match)&('heating and ventilating systems' not in unit)&('engineering drawings' not in unit):
                # check if it's a near match (fuzzywuzzy score of at least 98)
                all_matches = process.extract(unit, list(unit_dict.keys()))
                good_matches = [out for out in all_matches if out[1]>=98]
                if len(good_matches)>0:
                    print(unit, good_matches)
                    # reuse the title already stored for this unit
                    unit = good_matches[0][0]
                    is_a_match = True
            if not is_a_match:
                # first time this unit is seen: create its entries in both dicts
                unit_dict[unit] = {}
            #if unit not in unit_dict_mand:
                unit_dict_mand[unit] = {}
            # NOTE: raises KeyError if no 'title' was found in this file's header
            unit_dict[unit][svq_dict[svq_file]['title']] = series_scqf.iloc[ix]
            unit_dict_mand[unit][svq_dict[svq_file]['title']] = series_mand.iloc[ix]
            # save it in a dict structure by framework
            svq_dict[svq_file]['units'][unit] =  series_scqf.iloc[ix]
            #svq_dict[svq_file]['units'].append({'title': unit, 'scqf level': series_scqf.iloc[ix]}) 
            # if the framework also gives me the code that's good
            if has_a_code:
                if unit not in unit_codes:
                    unit_codes[unit] = []
                unit_codes[unit].append(df_svq[col_dev].iloc[ix])
print('Done')


In [None]:
# One row per SVQ file; the columns are the keys collected in svq_dict
# ('title', 'scqf overall level', 'svq level', 'units', when present).
svq_data = pd.DataFrame.from_dict(svq_dict, orient = 'index')


In [None]:
# print units that appear with different levels in different frameworks
# One row per unit, one column per framework title.
unit_data = pd.DataFrame.from_dict(unit_dict, orient = 'index')
unit_data_mand = pd.DataFrame.from_dict(unit_dict_mand, orient = 'index')
# Units whose level is not the same in every framework that includes them.
units_with_mixed_levels = unit_data.index[unit_data.std(axis = 1)>0]
print(units_with_mixed_levels)
for unit in units_with_mixed_levels:
    print(unit_dict[unit])
    print()
#print((~unit_data.isnull()).sum())

### Process the codes associated with some of the units 
Cleaning the codes means that I'll be able to match them to URNs

In [None]:

# Start from an independent copy of the raw codes, so that unit_codes itself
# is left untouched when the cleaned codes are stored.
new_unit_codes = copy.deepcopy(unit_codes)

def transform_codes(code):
    """Normalise a raw unit-code string and split it into individual codes.

    The string is lower-cased; version markers ('v2', 'v 2') and unit numbers
    ('unit12', 'unit 12') are removed, as are empty parentheses and hyphens.
    A code starting with 'vr' gets the prefix 'cos'. Two known typos are
    corrected. Returns the list of whitespace-separated codes.
    """
    code = code.lower()
    # eliminate occurrences of version and unit number
    # (raw strings: '\d' and '\s' are invalid escapes in a plain literal)
    code = re.sub(r'v\d+|v\s+\d+', '', code)
    code = re.sub(r'unit\d+|unit\s+\d+', '', code)
    code = code.replace('()','')
    # if it starts with vr, it probably was missing cos
    if 'vr' == code[:2]:
        code = 'cos' + code
    # eliminate extra spaces and punctuation
    code = code.strip().replace('-','').replace('()','')
    # special cases
    if code == 'semman12302':
        code = 'semman2302'
    if code == 'semman12301()':
        code = 'semman12301'
    return code.split()

# Replace the raw codes of every unit by a Counter of its cleaned codes.
for unit, raw_codes in unit_codes.items():
    new_codes = flatten_lol(list(map(transform_codes, raw_codes)))
    new_unit_codes[unit] = Counter(new_codes)
    #if len(set(new_codes))>1:
    #    print(unit)
    #    print(Counter(new_codes).most_common())
#unit_codes

In [None]:
# Disabled exploratory snippet, kept as a bare string literal so it is never
# executed; only the placeholder print below runs.
'''# what units appear more often?
# note: this is not necessarily very informative because some frameworks are split into multiple paths 
# and some are not
((unit_data_mand == 'mandatory') | (unit_data_mand =='optional')).sum(axis = 1).sort_values(ascending= False)[:20]'''
print('-')

In [None]:
# Mean level of each unit across frameworks (NaNs ignored), rounded.
mean_levels = unit_data.apply(np.nanmean, axis = 1)
unit_levels = pd.DataFrame(mean_levels.map(np.round),
                           columns = ['scqf_levels'])
# Group the units by their rounded level and list the levels found.
group_unit_levels = unit_levels.groupby(by='scqf_levels')
for name, _ in group_unit_levels:
    print(name)

In [None]:
def tidy_desc_with_pos_pof(desc, pof='v'):
    """Clean a description and keep only the lemmas of one part of speech.

    desc: raw text.
    pof:  'v' (default, the previously hard-coded value) keeps only tokens
          whose tag starts with 'V'; any other value keeps every token whose
          tag is known to ``pos_to_wornet_dict``.

    The text is cleaned as in ``tidy_desc_with_pos`` (drop '\\r\\n' and
    non-breaking spaces, lower-case, remove digits, blank punctuation), each
    token is tagged on its own and the retained tokens are lemmatised with
    their tag. Returns the lemmas joined by single spaces.

    The one-argument call used by CountVectorizer's ``preprocessor`` keeps
    working thanks to the default.
    """
    clean_data = desc.replace('\r\n', '').replace('\xa0', '')
    nodigits = remove_digits(clean_data.lower())
    nopunct = replace_punctuation(nodigits)
    # add part of speech tagging
    nopunct = [(t,nltk.pos_tag([t])[0][1]) for t in nopunct.split()]
    nopunct = [t for t in nopunct if t[1] in pos_to_wornet_dict.keys()]
    if pof == 'v':
        nopunct = [t for t in nopunct if t[1][0] == 'V']
    lemm = lemmatise_with_pos(nopunct)
    return ' '.join(lemm)
    
# check if there's any easy to spot difference between levels
for name, group in group_unit_levels:
    # count single terms (ngram_range is (1,1)) in the unit titles of this
    # level, after the preprocessing done by tidy_desc_with_pos_pof
    count_vec = CountVectorizer(preprocessor = tidy_desc_with_pos_pof,
                                stop_words = [],
                                ngram_range= (1,1))
    raw_docs= group.index.values
    count_vec = count_vec.fit(raw_docs)
    level_counts = count_vec.transform(raw_docs).toarray()
    print('Level ',name)
    counts_by_term = level_counts.sum(axis=0)
    #print(len(counts_by_term),len(count_vec.get_feature_names()))
    #print(count_vec.vocabulary_)
    if name < 7.0:
        # show the lower-level unit titles that contain 'bus'
        for doc in (d for d in raw_docs if 'bus' in d):
            print(name,doc)
    # total count of each vocabulary term, most frequent first
    terms_counter = Counter({term: counts_by_term[column]
                             for term, column in count_vec.vocabulary_.items()}).most_common()
    print(terms_counter[:10])

## Load NOS data

In [None]:
#Get the NOS data 
# NOTE(review): pd.read_pickle / pickle.load execute arbitrary code from the
# file being read; only use them on files produced by this project.
df_nos = pd.read_pickle(lookup_dir + 'all_nos_input_for_nlp_{}.zip'.format(qualifier0))

# load the cleaned and tokenised dataset and join it (on the index) with the rest
df_nos = df_nos.join(pd.read_pickle(lookup_dir + 
                    'all_nos_input_for_nlp_{}_pruned_{}.zip'.format(qualifier,pofs)))

# remove the stand-alone 'p' and 'k' tokens
df_nos['pruned'] = df_nos['pruned'].map(remove_pk)
print('Done with loading the dataset')


# Load stopwords: the pickle holds three collections
with open(lookup_dir + 'stopwords_for_nos_{}_{}.pickle'.format(qualifier,pofs),'rb') as f:
    stopwords0, no_idea_why_here_stopwords, more_stopwords = pickle.load(f)
# `stopwords` = first two collections plus a few stray symbols
# (presumably the collections are tuples, given the `+= tuple(...)` below - verify)
stopwords = stopwords0 + no_idea_why_here_stopwords 
stopwords += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„'])
# `stopwords0` is extended separately: stray symbols, contractions, ...
stopwords0 += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„',
                     "'m", "'re", '£','—','‚°','●'])
# ... every distinct value of the 'Developed By' column ...
stopwords0 += tuple(set(list(df_nos['Developed By'])))
# ... and a manually chosen list of terms
stopwords0 += tuple(['cosvr','unit','standard','sfl','paramount','tp','il','al','ad','hoc',
                    'lanleo','ireland','something'])


# In[20]:


# create another column where the texts are lemmatised properly
# Lemmatise the POS-tagged tokens of every NOS, keeping only the parts of
# speech selected by `pofs`, and print the elapsed time in seconds.
t0 = time.time()
df_nos['pruned_lemmas'] = df_nos['tagged_tokens'].map(lambda x: lemmatise_pruned(x,pofs))
print(time.time()-t0)


# ### Only keep NOS from a super-suite

# In[21]:


# Workbook with one sheet per super-suite, each listing its NOS suite names.
# NOTE(review): absolute, user-specific path.
super_suites_files=  ''.join(['/Users/stefgarasto/Google Drive/Documents/data/',
                              'NOS_meta_data/NOS_Suite_Priority.xlsx'])
super_suites_names = ['Engineering','Management','FinancialServices','Construction']
all_super_suites = {}
for which_super_suite in super_suites_names:
    all_super_suites[which_super_suite] = pd.read_excel(super_suites_files, 
                    sheet_name = which_super_suite)
    # Normalise the suite names: drop both parentheses, spell out '&', trim
    # and lower-case. The closing parenthesis is now removed too; '(' used to
    # be replaced twice and ')' never.
    # NOTE(review): the match scores quoted in the fuzzy-matching code that
    # follows were observed with the old normalisation - re-check them.
    all_super_suites[which_super_suite]['NOS Suite name'] = all_super_suites[
        which_super_suite]['NOS Suite name'].map(
        lambda x: x.replace('(','').replace(')','').replace('&','and').strip().lower())


# Match given suites names in super-suites with the names we have in the data
# Suite names as they appear in the data (keys of the groupby on 'One_suite').
standard_labels = list(df_nos.groupby('One_suite').groups.keys())
# all_matches     : per super-suite -> positions in standard_labels of the matched suites
# all_match_names : per super-suite -> the matched suite names themselves
all_matches = {}
all_match_names = {}
#match_name = []
for which_super_suite in super_suites_names:
    all_matches[which_super_suite] = []
    for suite in all_super_suites[which_super_suite]['NOS Suite name'].values:
        # do manually some selected suites
        if 'insurance claims' in suite:
            # raises ValueError if 'general insurance' is not among the labels
            tmp = standard_labels.index('general insurance')
            all_matches[which_super_suite].append(tmp)
            continue
        # for the "management and leadership marketing 2013" both marketing 
        # and marketing 2013 would fit,
        # but I'm only taking the latter
        # find a fuzzy match between the given suite name and the labels in the data
        out = process.extract(suite, standard_labels, limit=3)
        # accept the best candidate only when its score is above 89
        if len(out) and out[0][1]>89:
            # note: most of them are above 96% similarity (only one is 90%)
            tmp = standard_labels.index(out[0][0])
            #print(suite, out[0])
            if tmp not in all_matches[which_super_suite]:
                all_matches[which_super_suite].append(tmp)
            else:
                # the best candidate was already assigned to another suite of
                # this super-suite
                if suite == 'installing domestic fascia, soffit, and bargeboards':
                    # this suite is kind of a duplicate - I aggregated it in my suites list
                    continue
                # NOTE(review): the third-best candidate (out[2]) is used, not
                # the second-best, and an IndexError is raised if fewer than
                # three candidates are returned - confirm this is intended.
                tmp = standard_labels.index(out[2][0])
                all_matches[which_super_suite].append(tmp)
                print(out[0][0],',',out[1][0],',',out[2][0],',',suite)
        else:
            print(suite, ' not found')
            print(out)
            print('\n')
    # number of matched suites vs number of suites listed in the sheet
    print(len(all_matches[which_super_suite]),len(all_super_suites[which_super_suite]))
    all_match_names[which_super_suite] = [standard_labels[t] 
                    for t in all_matches[which_super_suite]]


# In[32]:
# assign supersuite and SOC codes
df_nos['supersuite'] = df_nos['One_suite'].apply(assign_supersuite)
# 'SOC4str' is the cleaned SOC code passed through adjustsoccode (defined elsewhere);
# the 1- to 4-digit SOC columns are then derived from it, one extractor per column
df_nos['SOC4str'] = df_nos['Clean SOC Code'].map(adjustsoccode)
for soc_col, soc_extractor in (('SOC1', extract1digits),
                               ('SOC2', extract2digits),
                               ('SOC3', extract3digits),
                               ('SOC4', extract4digits)):
    df_nos[soc_col] = df_nos['SOC4str'].map(soc_extractor)
# how many NOS fall in each super-suite
print(df_nos['supersuite'].value_counts())
print('All done')


## Join NOS with svq units

In [None]:
# for those units that have a code, get the NOS with the same code in its file name
# (with col_to_match == 'URN' the code is looked up in the 'URN' column instead).
# A code match is only accepted if the unit title and the NOS title are also similar
# (fuzzy score >= 90).

col_to_match= 'URN'
# NOS index label -> list of unit titles matched to that NOS
nos_matched = {}
for counter,unit in enumerate(unit_data_mand.index):
    if unit in new_unit_codes:
        all_codes = list(new_unit_codes[unit].keys())
        
        for code in all_codes:
            # candidate NOS: those whose (normalised) file name / URN contains the unit code
            if col_to_match == 'NOS Title':
                matches = df_nos[df_nos.index.map(lambda x: code in x.replace('_','&').replace('-',''))]
            else:
                matches = df_nos[df_nos['URN'].map(lambda x: code in x.replace('-','').replace('_','&'))]

            # unique match
            if len(matches) == 1:
                # check that also the titles are similar
                flag = process.extract(unit, matches['NOS Title'])[0][1]
                match = matches.index[0]
                if flag>=90:
                    # what's missing is that I need to add all the information related to this unit
                    nos_matched.setdefault(match, []).append(unit)
                else:
                    print('Similar code, but different title: ', code, flag)
                    print(unit) 
                    print(matches['NOS Title'])
                    print()
            # more than one match
            elif len(matches)>1:
                # process.extract on a Series returns (title, score, index label) triples ordered
                # by score, i.e. NOT in the row order of `matches`
                flag = process.extract(unit, matches['NOS Title'])
                # pick the best match, if there is a best match
                values = [t[1] for t in flag]
                if len(set(values))>1:
                    idx = np.argmax(values)
                else: 
                    # all matches are the same. Does it change if I remove "legacy" from the title?
                    flag = process.extract(unit, matches['NOS Title'].map(lambda x: x.replace('legacy','')))
                    # pick the best match, if there is a best match
                    values = [t[1] for t in flag]
                    if len(set(values))>1:
                        idx = np.argmax(values)
                    else:
                        # pick the first one that's not legacy: legacy files end in 'l' right
                        # before the (optional) '.pdf' extension
                        not_legacy = [i for i,t in enumerate(flag)
                                      if not t[2].endswith(('l.pdf', 'l'))]
                        # if every candidate is legacy, fall back to the first one
                        idx = not_legacy[0] if not_legacy else 0
                # use the index label carried by the chosen candidate: idx is a position in the
                # score-sorted result, so it must not be used to index `matches` directly
                match = flag[idx][2]
                flag = values[idx]
                if flag>=90:
                    # what's missing is that I need to add all the information related to this unit
                    nos_matched.setdefault(match, []).append(unit)
                else:
                    print('multiple matches, none very good', code)
                    print(unit)
                    print()
            else:
                # no match
                print('No match: ', code, unit)
                print()
print('Done')

In [None]:
#pd.DataFrame.from_dict(nos_matched, orient = 'index')

In [None]:
# Hand-curated matches: unit title -> NOS file name ('none' = no NOS corresponds to this unit).
# Several unit titles may point to the same NOS file.
manual_matching_list = {
    'deal with hazards in process engineering maintenance': 'cogpem3.pdf',
    'perform electrical work on mechanical building services systems': 'sumbse04.pdf',
    'prepare loads for moving during process engineering maintenance': 'cogpem06.pdf',
    'working efficiently and effectively in engineering': 'sempeo202.pdf',
    'working efficiently and effectively in the power sector': 'sempeo202.pdf',
    'work efficiently and effectively in engineering food and drink operations': 'sempeo202.pdf',
    'working safely in the power sector': 'sempeo201.pdf',
    'install underground cables': 'eusepus042l.pdf',
    'carry out excavation activities on underground cables': 'eusepus043l.pdf',
    'location and identification of underground utility services': 'eusepus044l.pdf',
    ('identify and suggest improvements to working practices and procedures whilst maintaining '
     'electrical plant and equipment'): 'none',
    'diagnose mechanical/electrical faults in ancillary systems and compoents in buses/coaches':
        'pplbacem32.pdf',
    'recondition electrical components in buses/coaches': 'none',
    'operate powered units, tools or pedestrian plant, machinery or equipment': 'cosvr400.pdf',
    'operate powered units, tools or pedestrian plant, machinery or equipment (generators)':
        'cosvr400.pdf',
    'operate powered units, tools or pedestrian plant, machinery or equipment (pumps)':
        'cosvr400.pdf',
    'operate powered units, tools or pedestrian plant, machinery or equipment (mixers)':
        'cosvr400.pdf',
    'operate powered units, tools or pedestrian plant, machinery or equipment (compressors)':
        'cosvr400.pdf',
    'operate powered units, tools or pedestrian plant, machinery or equipment (pedestrian operated plant or machines)':
        'cosvr400.pdf',
    'operate powered units, tools or pedestrian plant, machinery or equipment (self-powered tools)':
        'cosvr400.pdf',
    'operate plant or machinery to receive and transport loads (forward tipping dumper wheeled)':
        'cosvr391.pdf',
    'operate plant or machinery to receive and transport loads (forward tipping dumper tracked)':
        'cosvr391.pdf',
    'customer relations for working in the power sector': 'semets358.pdf',
    'maintain compressed air systems': 'impem0151s.pdf',
    'rectify body damage to bus/coach body components': 'pplbacem09.pdf',
    'resolving engineering or manufacturing support problems': 'semem403.pdf',
    'inspecting fabricated components and structures': 'semts212.pdf',
}
# Reverse lookup: NOS file name -> unit title. When several units share a file,
# the unit listed last above is the one that is kept.
manual_matching_reverse = {
    nos_file: unit_title for unit_title, nos_file in manual_matching_list.items()
}

# Start from the code-based matches and add the units that carry no code, matching on title.
# Priority: manual list -> exact (lower-cased) title -> fuzzy title with score >= 95.
nos_matched2 = copy.deepcopy(nos_matched)
t0 = time.time()
# units for which no NOS was found: unit title -> best fuzzy candidate returned
unit_missed = {}
for counter,unit in enumerate(unit_data_mand.index):
    if counter%200 == 199:
        print('got to unit nb {}'.format(counter))
    if unit in new_unit_codes:
        # units with a code are handled by the code-based matching
        continue

    if unit in manual_matching_list:
        # hand-curated match (the target may be the placeholder 'none')
        nos_matched2.setdefault(manual_matching_list[unit], []).append(unit)
        continue

    ix = df_nos[df_nos['NOS Title'] == unit.lower()].index
    if len(ix):
        # exact title match: take the first NOS with that title
        nos_matched2.setdefault(ix[0], []).append(unit)
        continue

    # fall back to the single best fuzzy match over all NOS titles
    out = process.extract(unit, df_nos['NOS Title'].values, limit = 1)
    if out[0][1]<95:
        unit_missed[unit] = out
        continue
    ix = df_nos[df_nos['NOS Title'] == out[0][0]].index
    nos_matched2.setdefault(ix[0], []).append(unit)
print_elapsed(t0, 'matching units')

In [None]:
# number of unmatched units
len(unit_missed)

In [None]:
'''# check that everything is consistent (including the order)
# the content of the dictionary should be the same as the list
print(list(unit_indices_dict.values()) == unit_indices)
# the content of the new column should also be the same as the list (not anymore because I skip units with a code)
print(list(unit_data_mand['index'].values) == unit_indices)
# the keys of the dictionary should be in the same order as the indices in the dataframe
# (not anymore because I skip units with a code)
print(list(unit_indices_dict.keys()) == list(unit_data_mand.index))'''
# unit title -> title of the NOS it was matched to (the 'none' placeholder is ignored)
units_to_nos_last = {}
for nos, matched_units in nos_matched2.items():
    if nos != 'none':
        for i in matched_units:
            units_to_nos_last[i] = df_nos.loc[nos]['NOS Title']
print('-')

In [None]:
# now crosswalk to pathways via the units
# NOS index label -> column-wise NaN-ignoring mean of the unit_data rows of its matched units,
# so that a pathway is kept as soon as it appears for at least one of the units
nos_matched_to_svq = {}
for match, matched_units in nos_matched2.items():
    nos_matched_to_svq[match] = unit_data.loc[matched_units].agg(np.nanmean)
    

In [None]:
'''# look for units that are matched to the same NOS
for ix in unit_indices:
    #a = [t for t in unit_indices if t ==ix]
    b = [i for i,t in enumerate(unit_indices) if t ==ix]
    if (len(b)>1) and (ix != 'none'):
        print(ix, list(unit_data_mand.index[b]))
        print()'''
print('-')

In [None]:
# join the units in the SVQ frameworks with the NOS database
# svq_cols: the pathway titles, used below as the names of the per-pathway columns
svq_cols = svq_data['title'].values 
# use this for the order of the columns since it's the same as the count_matrix that'll be built later
'''
# this one was when I went by units
unit_tmp = copy.deepcopy(unit_data)
unit_tmp['index'] = unit_indices
unit_tmp = unit_tmp.set_index('index')
unit_tmp = unit_tmp[~(unit_tmp.index == 'none')]
df_nos2 = df_nos.join(unit_tmp, how = 'left')
# extract the engineering ones
df_nos_eng2 = df_nos2[df_nos2['supersuite'] == 'engineering']
'''
# this is for when I went by NOS
# one row per matched NOS (keys of nos_matched_to_svq) is left-joined onto df_nos, so NOS
# without a match get NaN in the joined columns and keys absent from df_nos (e.g. 'none') drop out
df_nos2 = df_nos.join(pd.DataFrame.from_dict(nos_matched_to_svq, orient = 'index'), how = 'left')
# keep only the NOS of the engineering super-suite
df_nos_eng2 = df_nos2[df_nos2['supersuite'] == 'engineering']
#df_nos_eng2

In [None]:
# Inspect the engineering NOS that are linked to at least one SVQ pathway and whose
# title mentions 'coaches'.
# Both masks are built on df_nos_eng2 and combined before indexing; chaining the second mask
# on the already filtered frame forces pandas to reindex it (with a warning).
has_svq_match = df_nos_eng2[svq_cols].mean(axis =1).notna()
mentions_coaches = df_nos_eng2['NOS Title'].map(lambda x: 'coaches' in x)
A = df_nos_eng2[has_svq_match & mentions_coaches]['NOS Title']
for a in A:
    print(a)
print()
# for each of these NOS, print the mean of all the values stored in unit_dict for its matched units
for nos_label in A.index:
    tt= []
    for t in nos_matched2[nos_label]:
        tt += list(unit_dict[t].values())
    print(np.mean(tt))

## compute similarity between pathways using word counts

In [None]:

# Bag-of-words representation of each pathway: one document per row of svq_data, made of
# the titles of its units (the keys of the 'units' dictionary) joined by spaces.
count_vec = CountVectorizer(preprocessor = tidy_desc_with_pos,
                            ngram_range= (1,2),
                            stop_words = 'english')
unit_corpus = [' '.join(list(svq_data['units'].loc[row].keys())) for row in svq_data.index]
# fit the vocabulary (uni- and bi-grams) and count the terms in one go
count_matrix = count_vec.fit_transform(unit_corpus)
print('Some vocabulary terms')
count_features = count_vec.get_feature_names()
print(count_features[1000:1050])
print('nb of documents x nb of features')
print(count_matrix.shape)

In [None]:
# Spot check: print the top terms of a random subset of pathways.
# NOTE(review): the subset is drawn without a seed, so it changes at every run.
for ix,row in enumerate(svq_data.index):
    if np.random.randn()>1.3:
        print(row)
        # densify only the row of interest instead of the whole sparse matrix at every iteration
        top_ngrams, top_weights, top_features = extract_top_words(count_matrix[ix].todense(), count_features)
        print(top_features)
        print()

In [None]:
# compute distance matrix
# pairwise cosine distances between the rows of count_matrix (one row per pathway),
# expanded from the condensed form returned by pdist to a square matrix
count_dist = distance.squareform(distance.pdist(count_matrix.toarray(), metric = 'cosine'))

In [None]:
# plot distance matrix (as similarity = 1 - cosine distance)
# one tick per pathway, centred on its heatmap cell; derived from the matrix size rather than
# hard-coded, so that the plot stays correct if the number of pathways changes
tick_positions = np.arange(count_dist.shape[0]) + .5
# tick labels: the svq_data index entries without their first 6 and last 5 characters
tick_labels = svq_data.index.map(lambda x: x[6:-5])
plt.figure(figsize = (8,8))
sns.heatmap(1 - count_dist)
plt.title('Similarity')
plt.gca().set_yticks(tick_positions)
plt.gca().set_yticklabels(tick_labels, {'fontsize': 7, 'rotation' : 'horizontal'})
plt.gca().set_xticks(tick_positions)
tmp = plt.gca().set_xticklabels(tick_labels, {'fontsize': 7, 'rotation' : 'vertical'})
#plt.tight_layout()
#plt.tight_layout()

In [None]:
# for each engineering NOS assign the average count_matrix of all the pathways the NOS is associated with
# rows of NOS that are not associated to any pathway stay filled with np.nan
nos_eng_counts = np.full((len(df_nos_eng2), count_matrix.shape[1]), np.nan)
svq_membership = df_nos_eng2[svq_cols]
for ix,nos in enumerate(df_nos_eng2.index):
    svq_matrix = svq_membership.loc[nos].values
    # pathways this NOS is linked to = non-missing entries among the pathway columns
    indices = ~np.isnan(svq_matrix)
    if np.isnan(svq_matrix).sum() != len(svq_cols):
        # average, across those pathways, the corresponding rows of the count matrix
        nos_eng_counts[ix] = np.nanmean(count_matrix[indices,:].toarray(), axis = 0)
print('Done. Count matrix shape:', nos_eng_counts.shape)


In [None]:
# compute cosine distances between NOS based on the svq pathways they are associated with
# (condensed form, as returned by pdist; rows of nos_eng_counts that are all NaN give NaN distances)
nos_eng_count_dist = distance.pdist(nos_eng_counts, metric = 'cosine')
print(nos_eng_count_dist.shape)

In [None]:
# plot cosine similarities (higher = more similar). NOS not in SVQ frameworks should have nans
# Expand to the square form BEFORE converting to similarity: squareform always puts zeros on
# the diagonal, so converting first would show every NOS as completely dissimilar from itself.
sns.heatmap(1 - distance.squareform(nos_eng_count_dist))

In [None]:
'''# select NOS in super-suites of interest
df_nos_select = df_nos[~(df_nos['supersuite']=='other')]
print('Nb of NOS in all supersuites' ,len(df_nos_select))
df_nos_eng = df_nos_select[df_nos_select['supersuite']=='engineering']

'''#%%
'''
# ## Get raw data and tokenize

# ## Choosing parameters for features extraction
# 
# ngrams : uni/bi/tri
# 
# tfidf thresholds: min and max percentage
# 
# which parts of speech were selected before
# 
# whether we are working at the level of suites or of individual NOS, 
# and how we aggregate NOS to form the suite level
# 
'''

#

'''
# First, create your TFidfVectorizer model. This doesn't depend on whether 
it's used on suites or NOS. However,
it does require that the docs collection is already given as a collection of
tokens (tokenizer=tokenize_asis)

#Since we now have not just long strings in our documents, but lists of terms, 
we will use a different tokenizer
'''

# parameters of the tf-idf model
paramsn = {
    'ngrams': 'uni',
    'pofs': pofs, #'nv'
    'tfidf_min': 3,
    'tfidf_max': 0.5,
    'bywhich': 'docs', #'docs' #'suites'
    'mode': 'tfidf', #'tfidf' #'meantfidf' #'combinedtfidf' #'meantfidf'
}
# define the transform: this one can easily be the same for both 
# keywords and the clustering
tfidf_n = define_tfidf(paramsn, stopwords0)

# get the transform either from the whole NOS corpus or from the engineering NOS only
FULL_CORPUS = True
corpus_for_fit = df_nos2 if FULL_CORPUS else df_nos_eng2
_, feature_names_n, tfidf_n, _ = get_tfidf_matrix(
        paramsn, corpus_for_fit, tfidf_n, col = 'pruned_lemmas')


print('Number of features: {}'.format(len(feature_names_n)))
# show every third feature in a window of 100 starting at position N
N = 2000
print('Some features:')
print(feature_names_n[N:N+100:3])
print('*'*70)



# In[77]:

# first transform via tfidf all the NOS in one supersuite because you need the top keywords
# (the engineering NOS are transformed with the model obtained above; nothing is re-fitted here)
textfortoken = df_nos_eng2['pruned_lemmas']
tfidfm = tfidf_n.transform(textfortoken)


In [None]:
# The two important collections of vectors are: df_nos_eng_counts (counts based on pathways presence) and
# tfidfm (tfidfm transform) of each engineering NOS
# Both are wrapped in dataframes indexed like df_nos_eng2, with numbered column names.
tfidf_dense = tfidfm.toarray()
tfidf_cols = ['tfidf {}'.format(t) for t in range(tfidf_dense.shape[1])]
df_nos_eng_tfidfm = pd.DataFrame(tfidf_dense, columns = tfidf_cols, index = df_nos_eng2.index)

counts_cols = ['counts {}'.format(t) for t in range(nos_eng_counts.shape[1])]
df_nos_eng_counts = pd.DataFrame(nos_eng_counts, columns = counts_cols, index = df_nos_eng2.index)

In [None]:

# Now load NOS that I have to remove/merge before producing pathways
# load transferable NOS: the first (unnamed) CSV column becomes the index and every
# NOS listed in the file is flagged as transferable
# NOTE(review): absolute, user-specific path - consider a configurable results directory.
transferable_file = ('/Users/stefgarasto/Google Drive/Documents/results/NOS/nlp_analysis/'
                     'transferable_nos_n_postjoining_final_no_dropped/estimated_transferable_nos.csv')
transferable_nos = pd.read_csv(transferable_file).set_index('Unnamed: 0')
transferable_nos['transferable'] = True


In [None]:
# load duplicated NOS
# (groups of near-duplicate NOS; the cells below use the columns 'Unnamed: 0' as the group id,
# 'Avg group similarity' as the group score and the remaining columns as the group members)
# NOTE(review): absolute, user-specific path - consider a configurable results directory.
lshduplicate_file = ''.join(['/Users/stefgarasto/Google Drive/Documents/results/NOS/nlp_analysis/',
         'LSH_results_grouped_with_score_postjoining_final_no_dropped_th0.8.csv'])
lshduplicate_nos = pd.read_csv(lshduplicate_file)

In [None]:
'''# TODO: find a way to speed this up, there has to be one
t0 = time.time()
df_nos_eng_lsh = df_nos_eng2[['NOS Title']]
df_nos_eng_lsh['lsh_group'] = 'empty'
df_nos_eng_lsh['lsh_simil'] = 0
for ix, row in enumerate(lshduplicate_nos.index):
    if ix%200 == 199:
        print('got to group number {}'.format(ix))
    if lshduplicate_nos.loc[row][0] == 'group 1':
        continue
    for c in lshduplicate_nos.loc[row][2:]:
        if isinstance(c, str):
            c = c.replace('(','').replace(')','').replace('\'','')
            t = [i.strip() for i in c.split(',')]
            if t[-1] in df_nos_eng_lsh.index:
                df_nos_eng_lsh.loc[t[-1]]['lsh_group'] = lshduplicate_nos.loc[row][0]
                df_nos_eng_lsh.loc[t[-1]]['lsh_simil'] = lshduplicate_nos.loc[row][1]
print_elapsed(t0, 'assigning group ID')'''
print('-')

In [None]:
def split_nos_in_groups(x):
    '''
    Extract the last element of a tuple that was saved as text, e.g. "('a', 'b')" -> 'b'.
    Parentheses and single quotes are removed, the text is split on commas and the last
    piece is returned without surrounding whitespace. Non-string values are returned unchanged.
    '''
    if not isinstance(x,str):
        return x
    cleaned = x
    for symbol in (')', '(', '\''):
        cleaned = cleaned.replace(symbol, '')
    return cleaned.split(',')[-1].strip()
# Reshape the LSH groups from wide (one row per group, one column per member) to long
# (one row per group/member pair). The member always ends up in the column named '1'.
tmp0 = lshduplicate_nos.applymap(split_nos_in_groups)
t0 = time.time()
lsh_id_cols = ['Unnamed: 0','Avg group similarity']
# every column other than the group id and the group score holds one member of the group;
# taking them all (instead of a hard-coded numeric range) makes sure the last one is not skipped
lsh_member_cols = [c for c in tmp0.columns if c not in lsh_id_cols]
lsh_long_parts = []
for member_col in lsh_member_cols:
    tmp = tmp0[lsh_id_cols + [member_col]].rename(columns = {member_col:'1'})
    # groups with fewer members have missing values in the trailing columns
    lsh_long_parts.append(tmp[tmp['1'].notna()])
# concatenate once, rather than growing the dataframe inside the loop
df_nos_eng_lsh = pd.concat(lsh_long_parts)
print_elapsed(t0, 'concatenating first 10 rows')
# NOTE: these are all the groups, not just engineering NOS
df_nos_eng_lsh.sample(n=5)

In [None]:
# join all NOS with their LSH group and the transferable NOS
# NOTE(review): df_nos2 is rebuilt from df_nos here, so the SVQ columns joined earlier are
# not part of this new df_nos2 - confirm that this is intended.
lsh_membership_all = df_nos_eng_lsh.rename(columns = {'Unnamed: 0': 'lsh_group', 
                                                      'Avg group similarity': 'lsh_simil',
                                                      '1':'index'}).set_index('index')
df_nos2 = df_nos.join(lsh_membership_all, how = 'left')
df_nos2 = df_nos2.join(transferable_nos[['transferable', 'avg similarity2','engineeringness',
                                                 'centralities2','we_spread']], how = 'left')
# The columns are re-assigned explicitly: an in-place mask on df[col] acts on an intermediate
# object and is not guaranteed to reach the dataframe (it does not under pandas Copy-on-Write).
# NOS that belong to no LSH group get the placeholder 'na'
df_nos2['lsh_group'] = df_nos2['lsh_group'].mask(df_nos2['lsh_group'].isnull(), 'na')
# remove LSH groups with a low overall similarity
th_lsh = 0.7
df_nos2['lsh_group'] = df_nos2['lsh_group'].mask(df_nos2['lsh_simil']<th_lsh, 'na')
# NOS absent from the transferable list are not transferable
df_nos2['transferable'] = df_nos2['transferable'].mask(df_nos2['transferable'].isnull(), False)
## join with tfidfm and counts
#df_nos2 = df_nos2.join(df_nos_eng_tfidfm).join(df_nos_eng_counts)
df_nos2.sample(n=3)

In [None]:
# join with group ID and transferable ID
lsh_membership_eng = df_nos_eng_lsh.rename(columns = {'Unnamed: 0': 'lsh_group', 
                                                      'Avg group similarity': 'lsh_simil',
                                                      '1':'index'}).set_index('index')
df_nos_eng3 = df_nos_eng2.join(lsh_membership_eng, how = 'left')
df_nos_eng3 = df_nos_eng3.join(transferable_nos[['transferable', 'avg similarity2','engineeringness',
                                                 'centralities2','we_spread']], how = 'left')
# The columns are re-assigned explicitly: an in-place mask on df[col] acts on an intermediate
# object and is not guaranteed to reach the dataframe (it does not under pandas Copy-on-Write).
# NOS that belong to no LSH group get the placeholder 'na'
df_nos_eng3['lsh_group'] = df_nos_eng3['lsh_group'].mask(df_nos_eng3['lsh_group'].isnull(), 'na')
# remove LSH groups with a low overall similarity
th_lsh = 0.7
df_nos_eng3['lsh_group'] = df_nos_eng3['lsh_group'].mask(df_nos_eng3['lsh_simil']<th_lsh, 'na')
# NOS absent from the transferable list are not transferable
df_nos_eng3['transferable'] = df_nos_eng3['transferable'].mask(
    df_nos_eng3['transferable'].isnull(), False)

# join with tfidfm and counts
df_nos_eng3 = df_nos_eng3.join(df_nos_eng_tfidfm).join(df_nos_eng_counts)

In [None]:
df_nos_eng3[df_nos_eng3['NOS Title'].map(lambda x: 'e-mail' in x)][['NOS Title','SOC4','transferable']]

In [None]:
# first remove legacy and nos without socs
# a NOS is legacy if its title says so or if its file name ends in 'l.pdf'
title_not_legacy = df_nos_eng3['NOS Title'].map(lambda x: 'legacy' not in x)
# use `not` on the Python bool: `~True` is -2 and `~False` is -1, which are both truthy
file_not_legacy = np.array([not x.endswith('l.pdf') for x in df_nos_eng3.index], dtype = bool)
not_legacy_flag = title_not_legacy & file_not_legacy
with_soc_flag = df_nos_eng3['SOC4'].notna()
df_nos_eng4 = df_nos_eng3[not_legacy_flag & with_soc_flag]
# now remove transferable NOS too
df_nos_eng4 = df_nos_eng4[df_nos_eng4['transferable'] == False]

In [None]:
# then aggregate by lsh groups and combine by averaging/selecting (so that I don't have to create complicated
# rules based on legacies, missing socs and transferability)
def aggregate_floats_lsh(x):
    '''
    Collapse the values of a group to their distinct values.
    Returns the value itself when all the elements are equal, otherwise the list of
    distinct values (in no particular order; an empty input gives an empty list).
    '''
    distinct = [*{*x}]
    return distinct[0] if len(distinct) == 1 else distinct

def aggregate_suites_lsh(x):
    '''
    Join the strings in x with ', ' and wrap the result in curly brackets.
    The input is also printed (prefixed by '!') as a debugging aid.
    '''
    print('!',x)
    joined = ', '.join(x)
    return '{' + joined + '}'

def aggregate_titles_lsh(x):
    '''
    Merge several ';'-separated strings into a single ';'-separated string without duplicates.

    x: iterable of strings (e.g. the titles or the suites of the NOS in one LSH group).
    Returns the distinct entries joined by ';', in order of first appearance, so that the
    result is the same at every run (de-duplicating with a set made the order depend on
    the string hash seed of the Python session).
    '''
    # join the strings, then split again so that already merged entries are separated too
    entries = ';'.join(x).split(';')
    # remove duplicates (dict keys keep the insertion order) and join again
    return ';'.join(dict.fromkeys(entries))

def aggregate_by_first(x):
    '''Return the first element (by position) of the pandas object x.'''
    first_value = x.iloc[0]
    return first_value

'''
# questions: 
1. some LSH groups don't really have a similarity higher than 0.8. Do I only want to merge those that do?
Probably yes, but relax the threshold a bit because it's an average similarity for groups with > 2 NOS (0.75?)
2. What to do with groups with more than one SOC when assigning requirements?
3. How to assign skills cluster to groups? Just using the concatenated lemmas?

# TOREMEMBERs: 
2D. check whether any SOC is not the same (g.SOC4.map(lambda x: len(x)).value_counts()): 3 rows have 2 SOCs.
3D. print which rows have two socs
'''

#print('-')


# group details about engineering nos by custom/specific functions
# Overall: NOS that belong to an LSH group are collapsed into one row per group, NOS without
# a group are kept as they are, and the two sets are stacked into df_nos_eng5.
cols_of_interest = ['NOS Title','supersuite','One_suite','SOC4','SOC3','SOC2','SOC1','pruned_lemmas', 
                    'URN', 'lsh_group', 'lsh_simil', 'transferable'] + list(svq_cols) + tfidf_cols + counts_cols

# separate singletons NOS from grouped NOS
df_nos_eng_singles = df_nos_eng4[df_nos_eng4['lsh_group']=='na'][cols_of_interest]

# reset_index turns the NOS index into an 'index' column so that it can be aggregated too
df_nos_eng_grouped = df_nos_eng4[df_nos_eng4['lsh_group']!='na'][cols_of_interest].reset_index().groupby('lsh_group')
# how each metadata column is combined within a group
# (`sum` on 'pruned_lemmas' presumably concatenates lists of lemmas - TODO confirm the column type)
agg_of_interest= {'URN': aggregate_floats_lsh, 'supersuite': aggregate_by_first, 
                 'SOC4': aggregate_floats_lsh, 'index': aggregate_floats_lsh,
                 'SOC3': aggregate_floats_lsh, 'SOC2': aggregate_floats_lsh,
                 'SOC1': aggregate_floats_lsh, 'pruned_lemmas': sum,
                  'transferable': aggregate_by_first, #'lsh_group': aggregate_by_first,
                'lsh_simil': np.mean}
# NOTE(review): transform_of_interest is not used anywhere in this cell
transform_of_interest = {'NOS Title': ','.join, 'One_suite': ','.join}

t0 = time.time()
# numeric blocks (pathway columns, tf-idf vectors, count vectors): NaN-ignoring mean per group
gm1 = df_nos_eng_grouped[list(svq_cols)].agg(np.nanmean)
g0 = df_nos_eng_grouped[tfidf_cols].agg(np.nanmean)
g1 = df_nos_eng_grouped[counts_cols].agg(np.nanmean)
# metadata columns: one aggregation function per column
g2 = df_nos_eng_grouped[list(agg_of_interest.keys())].agg(agg_of_interest)#.reset_index()
# suites and titles: de-duplicated and joined by ';'
g3 = df_nos_eng_grouped['One_suite'].apply(aggregate_titles_lsh)#.reset_index()
g4 = df_nos_eng_grouped['NOS Title'].apply(aggregate_titles_lsh)#.reset_index()
# put the pieces back together; from here on df_nos_eng_grouped is a dataframe indexed by
# 'lsh_group' and no longer a groupby object
df_nos_eng_grouped = g2.join(g4, on = 'lsh_group').join(g3, on = 'lsh_group').join(
        g1, on='lsh_group').join(g0, on = 'lsh_group').join(gm1, on = 'lsh_group')
print_elapsed(t0,'aggregating')

# extract the columns of interest (minus lsh group) and concatenate single NOS and groups
# (single NOS keep their own index label, grouped NOS are indexed by their LSH group id)
cols_of_interest = ['NOS Title','supersuite','One_suite','SOC4','SOC3','SOC2','SOC1','pruned_lemmas', 
                    'URN', 'lsh_simil', 'transferable'] + list(svq_cols) + tfidf_cols + counts_cols
df_nos_eng5 = pd.concat([df_nos_eng_singles[cols_of_interest], df_nos_eng_grouped[cols_of_interest]])

print('nb NOS x nb columns: ', df_nos_eng5.shape)

In [None]:
# print rows with more than one SOC
# (aggregate_floats_lsh returns a list when the NOS of a group disagree, a single value otherwise)
df_nos_eng5[df_nos_eng5['SOC4'].map(lambda x: type(x)==list)]

In [None]:
# compute similarity for NOS keywords and SVQ counts
# Both are condensed pairwise cosine DISTANCES between the rows of df_nos_eng5: one from the
# tf-idf columns, one from the pathway count columns. Rows whose count columns are all NaN
# give NaN distances (the clustering cell further down checks for them with np.isnan).
eng_keyword_distance = distance.pdist(df_nos_eng5[tfidf_cols].values, metric = 'cosine')
eng_svq_distance = distance.pdist(df_nos_eng5[counts_cols].values, metric = 'cosine')


In [None]:
_ = plt.hist(eng_svq_distance)

In [None]:
# heatmap of the keyword (tf-idf) cosine distances between the first 100 rows of df_nos_eng5,
# with the NOS titles as row labels
plt.figure(figsize = (20,20))
_ = sns.heatmap(distance.squareform(eng_keyword_distance)[:100,:100], yticklabels = df_nos_eng5['NOS Title'][:100])
#.shape, distance.squareform(eng_svq_distance).shape

In [None]:
#We calculate cosine distance between tf-idf vectors of the documents
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial import distance
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster 

def do_hierarch_clustering2(tfidfm, get_distances = True, method='average', metric = 'cosine', DOPLOTS = True):
    '''
    Hierarchical clustering of a set of vectors, or of precomputed distances.

    Parameters
    ----------
    tfidfm : feature matrix with one row per item (dense, or sparse with a `todense`
        method) if get_distances is True; otherwise a condensed distance vector, as
        returned by scipy's pdist, which is used as it is.
    get_distances : whether the pairwise distances have to be computed from tfidfm.
    method : linkage method passed to scipy.cluster.hierarchy.linkage.
    metric : distance metric used to compute the pairwise distances.
    DOPLOTS : if True, show the matrix of the distances and that of the cophenetic distances.

    Returns
    -------
    distances : condensed pairwise distances
    linkage_matrix : linkage matrix of the hierarchical clustering
    c : cophenetic correlation coefficient
    coph_dists : condensed cophenetic distances

    Timing notes from earlier runs: 4000 x 4000 distances take approximately 110 seconds
    (double for the cophenet), so about 22500 items are expected to take approx 4 hours.
    '''
    t0 = time.time()

    if get_distances:
        # sparse matrices have to be densified for pdist; anything else is used directly.
        # The requested metric is honoured in both cases.
        if hasattr(tfidfm, 'todense'):
            features = np.asarray(tfidfm.todense())
        else:
            features = tfidfm
        distances = distance.pdist(features, metric = metric)
    else:
        distances = tfidfm
    print_elapsed(t0, 'calculating cosine distances of tfidf vectors')

    #We then build linkage matrix using the distances and specifying the method. For euclidean distances typically
    # 'Ward' produces best results. For cosine we can only use 'average' and 'single'.
    # (the input is already a condensed distance vector, so `metric` does not recompute anything here)
    linkage_matrix = scipy.cluster.hierarchy.linkage(distances,
                                                     method = method,
                                                     metric = metric)
    print_elapsed(t0, 'hierarchical clustering of cosine distances')
    #We can test how well the groupings reflect actual distances. If c > 0.75 this is considered to be sufficiently
    #good representation
    c, coph_dists = cophenet(linkage_matrix, distances)

    print_elapsed(t0, 'computing the cophenetic correlation')

    if DOPLOTS:
        # original distances
        fig, ax =plt.subplots(figsize = (5,5))
        plt.imshow(scipy.spatial.distance.squareform(distances))
        plt.title('cosine distances between suites')
        plt.colorbar()

        # distances implied by the dendrogram
        fig, ax = plt.subplots(figsize = (5,5))
        tmp = plt.imshow(scipy.spatial.distance.squareform(coph_dists))
        plt.colorbar()
    print('The cophenetic coefficient is {:.4f}'.format(c))
    return distances, linkage_matrix, c, coph_dists



In [None]:
output_dir

In [None]:
# perform clustering
# configuration flags and the names used for titles / file names
SAVEHC = False
STRATEGY = 'tfidf' #'we' or 'tfidf'
SELECT_MODE = 'engineering'
cluster_name = SELECT_MODE
cluster_name_figs = 'Engineering (using SVQ)'
cluster_name_save = 'engineering_svq'
#print('Computing clusters for {}'.format(cluster_name_figs))

# suite (as stored in 'One_suite') -> titles of the rows of df_nos_eng5 in that suite
suites_in_clus = {}
groups_clus = df_nos_eng5.groupby('One_suite')
for name, group in groups_clus:
    suites_in_clus[name] = list(group['NOS Title'].values)

'''if STRATEGY == 'tfidf':
    # this is to get the restricted corpus (to transform, not for fitting)
    textfortoken = df_nos_n['pruned_lemmas']
    tfidfm_n = tfidf_n.transform(textfortoken)
elif STRATEGY == 'we':
    tfidfm_n = df_nos_n[all_we_cols].values
else:
    raise ValueError
'''
# get labels
# standard_labels_n2: always the (possibly merged) NOS titles
standard_labels_n2 = list(df_nos_eng5['NOS Title'].values)
# use NOS titles for single NOS and group ID for grouped ones?
# rows whose index label contains 'group' are labelled with that index label instead of the title
cond = np.array(df_nos_eng5.index.map(lambda x: 'group' in x).values).astype(bool)
tmp = df_nos_eng5['NOS Title'].mask(cond, df_nos_eng5.index.values)
standard_labels_n = list(tmp.values)
#standard_labels_n = list(df_nos_eng5['NOS Title'].mask(
#    df_nos_eng5.index.map(lambda x: 'group' in x), df_nos_eng5.index).values)

for ix,t in enumerate(standard_labels_n):
    if len(t)>500:
        # manual correction because of pdf extraction
        # NOTE(review): labels longer than 500 characters are cut to 50 characters - confirm
        # that the two different numbers are intended
        standard_labels_n[ix] = standard_labels_n[ix][:50]

# use the two similarity matrices into a clustering input
# Both inputs are condensed cosine distances between the rows of df_nos_eng5. Two strategies:
#   'multiply'  : keyword distance x SVQ distance, with missing SVQ distances set to 1
#   otherwise   : keyword distance, replaced by the SVQ distance wherever the latter is available
WARD = 'substitute'
if WARD == 'multiply':
    clustering_input = copy.deepcopy(eng_svq_distance)
    clustering_input[np.isnan(clustering_input)] = 1 # replace nans with the max distance
    clustering_input= eng_keyword_distance * clustering_input #multiply by the usual NOS distances
else:
    clustering_input = copy.deepcopy(eng_keyword_distance)
    clustering_input[~np.isnan(eng_svq_distance)] = eng_svq_distance[~np.isnan(eng_svq_distance)]
# perform hierarchical clustering
# (get_distances = False: clustering_input is passed on as precomputed distances; only the
# linkage matrix and the cophenetic coefficient are kept)
_, linkage_matrix_n, c_n, _ = do_hierarch_clustering2(clustering_input, get_distances = False, 
                                                               method = 'average',
                                                               metric = 'cosine',
                                                               DOPLOTS= False)


# Plotting the distance between successive clusters: is there a knee?
# z: third column of the linkage matrix (the merge distances), in reverse order
z = linkage_matrix_n[::-1,2]
# second-order difference of z: its peaks are the candidate knees
knee = np.diff(z, 2)

#fig = plt.figure(figsize = (6,6))
fig, ax1 = plt.subplots(figsize = (12,6))

# the merge distances (green, left axis) and their second difference (blue, right axis)
ax2 = ax1.twinx()
ax1.plot(range(1, len(z)+1), z, 'g-')
ax2.plot(range(2, len(linkage_matrix_n)), knee, 'b-')
plt.xlim([0,500])
ax1.set_ylabel('cluster distance', color='g')
ax2.set_ylabel('derivative', color='b')

plt.title(cluster_name_figs)
# Alternative criterion: for each split point i, fit one straight line to the first i values of z
# and another to the remaining ones; the goodness of the split is the sum of the two scores
goodness = []
for i in range(3,100): #len(z)-2):
    lr = LinearRegression(normalize = True)
    lr = lr.fit(np.arange(1,i+1).reshape(-1, 1), z[:i].reshape(-1, 1))
    a1 = lr.score(np.arange(1,i+1).reshape(-1, 1), z[:i].reshape(-1, 1))
    lr = LinearRegression(normalize = True)
    lr = lr.fit(np.arange(i, len(z)).reshape(-1, 1), z[i:].reshape(-1, 1))
    a2 = lr.score(np.arange(i, len(z)).reshape(-1, 1), z[i:].reshape(-1, 1))
    goodness.append(np.around(a1 + a2,4))

plt.figure(figsize = (12,6))
#print(goodness)
plt.plot(np.arange(3,100), goodness)#len(z)-2),goodness)
plt.title(cluster_name_figs)
# split point with the highest goodness (+3 because the search starts at i = 3)
ixg = np.array(goodness).argmax()+3
print('best t-point: ',ixg)

# Rule-of-thumb target: one cluster per ten rows of df_nos_eng5 (printed only).
num_ideal = np.ceil(len(df_nos_eng5)/10)
print('The ideal number of clusters would be: ',num_ideal)
# Positions of the two largest values of knee, each shifted by 2.
# The largest value is zeroed in place so the second argmax finds the runner-up.
num_clust1 = knee.argmax() + 2
knee[knee.argmax()] = 0
num_clust2 = knee.argmax() + 2

# For the 'engineering' selection the number of clusters is fixed by hand;
# otherwise it is taken from the two knee peaks, ignoring a peak equal to 2.
if SELECT_MODE == 'engineering':
    num_clust = 50 #170 #ixg #max([num_clust1,num_clust2]) #clusters2use[SELECT_MODE][2]
else:
    if num_clust1 == 2:
        num_clust = num_clust2 #2000
    elif num_clust2 == 2:
        num_clust = num_clust1 #2000
    else:
        num_clust = min([num_clust1,num_clust2])

print('The two peaks are, in order: ',num_clust1, num_clust2)
print('The selected num clust is ',num_clust)
#num_clust = max([num_clust1,num_clust2])

# Scan distance thresholds 0, 0.05, ... and stop at the first one that yields
# at most num_clust flat clusters.
for t in np.arange(0,1,0.05):
    labels_n = fcluster(linkage_matrix_n, t, criterion='distance')
    n_clust = len(set(labels_n))
    if n_clust <= num_clust:
        cutting_th_n = t
        break
# set the threshold manually
# NOTE(review): this overwrites whatever the scan above found, so the number of
# clusters obtained at 0.79 is not guaranteed to equal num_clust.
cutting_th_n = 0.79
print('cutting threshold: {}'.format(cutting_th_n))       

#Plot the dendrogram (cutting at threshold)
#cutting_th_n = 0.6
# Figure height grows with the number of rows in df_nos_eng5.
h = .05*len(df_nos_eng5)
fig, ax = plt.subplots(figsize=(28, h)) # set size
# Links below cutting_th_n are coloured per cluster; the tree is truncated to
# 20 levels. Note that `ax` is rebound to the value returned by dendrogram.
ax = dendrogram(linkage_matrix_n, 
                labels = [t.capitalize() for t in standard_labels_n], 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = cutting_th_n,
               truncate_mode = 'level', p =20)#,
               #above_threshold_color = 'k');

# NOTE(review): labelsize = 24 on the y axis presumably overrides
# leaf_font_size=6 for the leaf labels -- confirm which one is intended.
plt.tick_params(axis= 'y',
                labelsize = 24)
plt.title('Hierarchical clustering for {}'.format(cluster_name_figs), fontsize = 30)
#              'Hierarchical Clustering Dendrogram of Selected NOS', fontsize = 20)
plt.xlabel('Distance', fontsize = 30)
plt.ylabel('NOS title',fontsize = 30)

plt.tight_layout()
# The file name encodes the cluster name, qualifier, n-gram setting, strategy
# and combination mode; the figure is written only when SAVEHC is set.
if SAVEHC:
    plt.savefig(os.path.join(output_dir, 
                             'all_nos_cut_dendrogram_in_{}_{}_{}_{}_{}.png'.format(
        cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)), 
                bbox_inches = "tight")   

# now get and save the clusters
# Cut the tree at cutting_th_n; labels_n holds one flat-cluster id per row.
labels_n = fcluster(linkage_matrix_n, cutting_th_n, criterion='distance')
print('The actual number of clusters is {}'.format(np.unique(labels_n).size))
# Compact table (id, title, suite, supersuite) with the cluster id attached in
# the 'hierarchical' column, re-indexed by the original index values.
short_df_n = df_nos_eng5.reset_index()[['index','NOS Title', 'One_suite','supersuite']]

short_df_n['hierarchical'] = labels_n
short_df_n = short_df_n.set_index('index')
if SAVEHC:
    short_df_n.to_csv(os.path.join(output_dir, 
                             'all_nos_cut_labels_in_{}_{}_{}_{}_{}.csv'.format(
        cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)))

# print the result of the cut dendrogram
# For every cluster with at least three NOS collect: its unique titles
# (hierarchical_dict), its size (L) and the mean of its strictly positive
# pairwise distances (D).
hierarchical_dict= {}
L = {}
D = {}
# Expand the condensed distances once instead of once per cluster.
square_dist = distance.squareform(clustering_input)
# Iterate over the cluster ids actually present in the cut. The cut uses a
# manually chosen threshold, so the ids are not guaranteed to be 1..num_clust;
# looping over range(1, num_clust+1) would silently drop any higher id.
for ic in np.unique(short_df_n['hierarchical'].values):
    in_cluster = (short_df_n['hierarchical']==ic).values
    tmp_local = short_df_n['NOS Title'][in_cluster].values
    if len(tmp_local)<3:
        continue
    hierarchical_dict['{}'.format(ic)] = list(np.unique(tmp_local))
    # Within-cluster distance sub-matrix.
    A = square_dist[in_cluster,:][:,in_cluster]
    if A.sum()>0:
        # Keep each pair once (upper triangle) and ignore zero distances.
        A = np.triu(A)
        A = A[A>0]
    else:
        # All members are at distance zero: the average is reported as 1.
        A = np.ones(1)
    D['{}'.format(ic)] = np.around(np.mean(A),3)
    L['{}'.format(ic)] = in_cluster.sum()
print('number of clusters with at least three nos is {}'.format(len(hierarchical_dict)))
# 'lenght' (sic) is kept because it is the column name written to the CSV.
L = pd.DataFrame.from_dict(L, orient = 'index', columns = ['lenght'])
D = pd.DataFrame.from_dict(D, orient = 'index', columns = ['avg dist'])
L = L.join(D)
if SAVEHC:
    # One column per cluster, ordered from the tightest to the loosest.
    L.join(pd.DataFrame.from_dict(hierarchical_dict, orient = 'index')).sort_values(
        by = 'avg dist', ascending = True).T.to_csv(os.path.join(output_dir,
                            'all_nos_cut_clusters_in_{}_{}_{}_{}_{}.csv'.format(
                            cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)))

In [None]:
# Number of rows of df_nos_eng5 with at least one non-missing value among the svq_cols columns.
len(df_nos_eng5[df_nos_eng5[list(svq_cols)].notna().sum(axis = 1)>0])

In [None]:
# perform clustering
# Run configuration: SAVEHC switches file output on/off; STRATEGY, the cluster
# names and WARD (set further down) only feed titles and output file names in
# this cell.
SAVEHC = False
STRATEGY = 'tfidf' #'we' or 'tfidf'
SELECT_MODE = 'engineering'
cluster_name = SELECT_MODE
cluster_name_figs = 'Engineering (using SVQ)'
cluster_name_save = 'engineering_svq'
#print('Computing clusters for {}'.format(cluster_name_figs))

# Keep only the rows with at least one non-missing value among the svq_cols columns.
df_nos_eng6 = df_nos_eng5[df_nos_eng5[list(svq_cols)].notna().sum(axis = 1)>0]

# Map every 'One_suite' value to the list of NOS titles it contains.
suites_in_clus = {}
groups_clus = df_nos_eng6.groupby('One_suite')
for name, group in groups_clus:
    suites_in_clus[name] = list(group['NOS Title'].values)

'''if STRATEGY == 'tfidf':
    # this is to get the restricted corpus (to transform, not for fitting)
    textfortoken = df_nos_n['pruned_lemmas']
    tfidfm_n = tfidf_n.transform(textfortoken)
elif STRATEGY == 'we':
    tfidfm_n = df_nos_n[all_we_cols].values
else:
    raise ValueError
'''
# get labels
# standard_labels_n2: the plain NOS titles, one per row of df_nos_eng6.
standard_labels_n2 = list(df_nos_eng6['NOS Title'].values)
# use NOS titles for single NOS and group ID for grouped ones?
# cond is True for rows whose index label contains the substring 'group';
# for those rows the label becomes the index value instead of the title.
cond = np.array(df_nos_eng6.index.map(lambda x: 'group' in x).values).astype(bool)
tmp = df_nos_eng6['NOS Title'].mask(cond, df_nos_eng6.index.values)
standard_labels_n = list(tmp.values)
#standard_labels_n = list(df_nos_eng5['NOS Title'].mask(
#    df_nos_eng5.index.map(lambda x: 'group' in x), df_nos_eng5.index).values)

# Any label longer than 500 characters is cut down to its first 50 characters.
for ix,t in enumerate(standard_labels_n):
    if len(t)>500:
        # manual correction because of pdf extraction
        standard_labels_n[ix] = standard_labels_n[ix][:50]


# Build the clustering input from a single source: the pairwise cosine
# distances (condensed form, as returned by pdist) between the counts_cols
# columns of df_nos_eng6. Any other WARD value is rejected.
WARD = 'svq_only'
if WARD == 'svq_only':
    clustering_input = distance.pdist(df_nos_eng6[counts_cols].values, metric = 'cosine')
else:
    raise ValueError
# perform hierarchical clustering
# do_hierarch_clustering2 is defined elsewhere in the notebook; only its second
# and third return values are kept. linkage_matrix_n is later passed to
# fcluster/dendrogram; the meaning of c_n is not visible here (TODO confirm).
_, linkage_matrix_n, c_n, _ = do_hierarch_clustering2(clustering_input, get_distances = False, 
                                                               method = 'average',
                                                               metric = 'cosine',
                                                               DOPLOTS= False)


# Plotting the distance between successive clusters: is there a knee?
# z: third column of the linkage matrix (the merge distances), read from the
# last merge to the first. knee: its second-order discrete difference.
z = linkage_matrix_n[::-1,2]
knee = np.diff(z, 2)

fig, ax1 = plt.subplots(figsize = (12,6))

# Merge distance (green, left axis) and its second difference (blue, right axis).
ax2 = ax1.twinx()
ax1.plot(range(1, len(z)+1), z, 'g-')
ax2.plot(range(2, len(linkage_matrix_n)), knee, 'b-')
plt.xlim([0,500])
ax1.set_ylabel('cluster distance', color='g')
ax2.set_ylabel('derivative', color='b')

plt.title(cluster_name_figs)

# Two-segment linear fit: for every split point i, fit one line to z[:i] and
# one to z[i:], and score the split by the sum of the two R^2 values.
# The scan covers i = 3..99 as before, but is capped so that the second segment
# always keeps at least two points; this matters here because df_nos_eng6 is a
# subset and may produce fewer than ~100 merges, in which case the fit on an
# empty z[i:] would raise.
max_split = min(100, len(z) - 1)
goodness = []
for i in range(3, max_split):
    x_head = np.arange(1,i+1).reshape(-1, 1)
    y_head = z[:i].reshape(-1, 1)
    lr = LinearRegression(normalize = True)
    lr = lr.fit(x_head, y_head)
    a1 = lr.score(x_head, y_head)
    x_tail = np.arange(i, len(z)).reshape(-1, 1)
    y_tail = z[i:].reshape(-1, 1)
    lr = LinearRegression(normalize = True)
    lr = lr.fit(x_tail, y_tail)
    a2 = lr.score(x_tail, y_tail)
    goodness.append(np.around(a1 + a2,4))

plt.figure(figsize = (12,6))
plt.plot(np.arange(3, max_split), goodness)
plt.title(cluster_name_figs)
# Split point with the best combined fit; None when the linkage is too short
# for any split to be evaluated.
if goodness:
    ixg = np.array(goodness).argmax()+3
else:
    ixg = None
print('best t-point: ',ixg)

# Rule-of-thumb target: one cluster per ten rows of df_nos_eng6 (printed only).
num_ideal = np.ceil(len(df_nos_eng6)/10)
print('The ideal number of clusters would be: ',num_ideal)
# Positions of the two largest values of knee, each shifted by 2.
# The largest value is zeroed in place so the second argmax finds the runner-up.
num_clust1 = knee.argmax() + 2
knee[knee.argmax()] = 0
num_clust2 = knee.argmax() + 2

# For the 'engineering' selection the number of clusters is fixed by hand;
# otherwise it is taken from the two knee peaks, ignoring a peak equal to 2.
if SELECT_MODE == 'engineering':
    num_clust = 22 #170 #ixg #max([num_clust1,num_clust2]) #clusters2use[SELECT_MODE][2]
else:
    if num_clust1 == 2:
        num_clust = num_clust2 #2000
    elif num_clust2 == 2:
        num_clust = num_clust1 #2000
    else:
        num_clust = min([num_clust1,num_clust2])

print('The two peaks are, in order: ',num_clust1, num_clust2)
print('The selected num clust is ',num_clust)
#num_clust = max([num_clust1,num_clust2])

# Scan distance thresholds 0, 0.05, ... and stop at the first one that yields
# at most num_clust flat clusters.
for t in np.arange(0.0,1.0,0.05):
    labels_n = fcluster(linkage_matrix_n, t, criterion='distance')
    n_clust = len(set(labels_n))
    if n_clust <= num_clust:
        cutting_th_n = t
        break
else:
    # No scanned threshold qualified. Fail loudly instead of continuing with an
    # undefined cutting_th_n, or with a stale value left in the kernel by an
    # earlier cell.
    raise ValueError(
        'No cutting threshold in [0, 1) gives at most {} clusters'.format(num_clust))
print('cutting threshold: {}'.format(cutting_th_n))

#Plot the dendrogram (cutting at threshold)
#cutting_th_n = 0.6
# Figure height grows with the number of rows in df_nos_eng6.
h = .07*len(df_nos_eng6)
fig, ax = plt.subplots(figsize=(22, h)) # set size
# Links below cutting_th_n are coloured per cluster; the tree is truncated to
# 20 levels. Note that `ax` is rebound to the value returned by dendrogram.
ax = dendrogram(linkage_matrix_n, 
                labels = [t.capitalize() for t in standard_labels_n], 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = cutting_th_n,
               truncate_mode = 'level', p =20)#,
               #above_threshold_color = 'k');

# NOTE(review): labelsize = 24 on the y axis presumably overrides
# leaf_font_size=6 for the leaf labels -- confirm which one is intended.
plt.tick_params(axis= 'y',
                labelsize = 24)
plt.title('Hierarchical clustering for {}'.format(cluster_name_figs), fontsize = 30)
#              'Hierarchical Clustering Dendrogram of Selected NOS', fontsize = 20)
plt.xlabel('Distance', fontsize = 30)
plt.ylabel('NOS title',fontsize = 30)

plt.tight_layout()
# The file name encodes the cluster name, qualifier, n-gram setting, strategy
# and combination mode; the figure is written only when SAVEHC is set.
if SAVEHC:
    plt.savefig(os.path.join(output_dir, 
                             'svq_nos_cut_dendrogram_in_{}_{}_{}_{}_{}.png'.format(
        cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)), 
                bbox_inches = "tight")   

# now get and save the clusters
# Cut the tree at cutting_th_n; labels_n holds one flat-cluster id per row.
labels_n = fcluster(linkage_matrix_n, cutting_th_n, criterion='distance')
print('The actual number of clusters is {}'.format(np.unique(labels_n).size))
# Compact table (id, title, suite, supersuite) with the cluster id attached in
# the 'hierarchical' column, re-indexed by the original index values.
short_df_n = df_nos_eng6.reset_index()[['index','NOS Title', 'One_suite','supersuite']]

short_df_n['hierarchical'] = labels_n
short_df_n = short_df_n.set_index('index')
if SAVEHC:
    short_df_n.to_csv(os.path.join(output_dir, 
                             'svq_nos_cut_labels_in_{}_{}_{}_{}_{}.csv'.format(
        cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)))

# print the result of the cut dendrogram
# For every cluster collect: its titles (hierarchical_dict), its size (L) and
# the mean of its strictly positive pairwise distances (D).
hierarchical_dict= {}
L = {}
D = {}
# Expand the condensed distances once instead of once per cluster.
square_dist = distance.squareform(clustering_input)
# Iterate over the cluster ids actually present in the cut. The threshold scan
# only guarantees "at most num_clust" clusters, so range(1, num_clust+1) can
# contain ids that do not exist; those used to be written out with length 0
# and a made-up average distance of 1 (the old `len(...) < 0` guard could
# never fire).
for ic in np.unique(short_df_n['hierarchical'].values):
    in_cluster = (short_df_n['hierarchical']==ic).values
    tmp_local = short_df_n['NOS Title'][in_cluster].values
    hierarchical_dict['{}'.format(ic)] = list(tmp_local)
    # Within-cluster distance sub-matrix.
    A = square_dist[in_cluster,:][:,in_cluster]
    if A.sum()>0:
        # Keep each pair once (upper triangle) and ignore zero distances.
        A = np.triu(A)
        A = A[A>0]
    else:
        # Singletons or all-zero distances: the average is reported as 1.
        A = np.ones(1)
    D['{}'.format(ic)] = np.around(np.mean(A),3)
    L['{}'.format(ic)] = in_cluster.sum()
# 'lenght' (sic) is kept because it is the column name written to the CSV.
L = pd.DataFrame.from_dict(L, orient = 'index', columns = ['lenght'])
D = pd.DataFrame.from_dict(D, orient = 'index', columns = ['avg dist'])
L = L.join(D)
if SAVEHC:
    # One column per cluster, ordered from the tightest to the loosest.
    L.join(pd.DataFrame.from_dict(hierarchical_dict, orient = 'index')).sort_values(
        by = 'avg dist', ascending = True).T.to_csv(os.path.join(output_dir,
                            'svq_nos_cut_clusters_in_{}_{}_{}_{}_{}.csv'.format(
                            cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)))

In [None]:
# Hand-written names for the semi-manual SVQ groups. The list is positional:
# entry k names the k-th list of files in svq_semi_manual_groups below.
# NOTE(review): 'mechanical manifacturing' is misspelled, but the names are
# used as dictionary keys (and end up as CSV column headers) in the next cell,
# so correct it only together with any consumer of those files.
svq_semi_manual_names = ['power sector', 'bus and coach',
                        'construction and civil engineering', 'conductor engineering', 'land based',
                        'construction site', 'signals', 'electrification', 'process engineering maintenance',
                        'fabrication and welding', 'performing engineering operations','mechanical manifacturing',
                        'marine engineering','engineering maintenance','engineering tech support',
                         'engineering manufacture',
                         'automotive engineering', 'aeronautical engineering','food and drink','electrical power',
                        'engineering surveying','building services']

# One list of spreadsheet file names per group, in the same order as the names
# above. The strings are used verbatim as row labels of svq_data in the next
# cell, so spacing and the '\xa0' characters must stay exactly as they are.
svq_semi_manual_groups = [
    ['20181212 R227 04 EPE Substation Plant.xlsx',
 '20181212 R229 04 EPE Overhead Lines.xlsx',
 '20181212 R228 04 EPE Underground Cables.xlsx'] ,
['SVQ 2   Bus and Coach Engineering and Maintenance- Electrical at SCQF Level 5.xlsx',
 'SVQ 2 Bus and Coach Engineering and Maintenance- Body Cladding at SCQF Level 5.xlsx',
 'SVQ 2 \xa0 Bus and Coach Engineering and Maintenance- Mechanical at SCQF Level 5.xlsx',
 'SVQ 3   Bus and Coach Engineering and Maintenance- Electrical at SCQF Level 6.xlsx',
 'SVQ 3   Bus and Coach Engineering and Maintenance- Mechanical at SCQF Level 6.xlsx',
 'SVQ 3 Bus and Coach Engineering and Maintenance- Body Cladding at SCQF Level 6.xlsx',
 'SVQ 3 \xa0 Bus and Coach Engineering and Maintenance- Electrical-Mechanical at SCQF Level 6.xlsx'] ,
['20181212 GP36 45 QS SVQ C&CEO.xlsx',
 'GM84 22 Construction and Civil Engineering Operations SCQF L5.xlsx',
 '20170703 GM83 21 QS SVQ Construction CES L4.xlsx'] ,
['GM34 22 QS Reaccred Conductor Engineering.xlsx'] ,
['GK3G23_Landbased_eng_ops_agriculture_April.xlsx',
 'GK3H23_Landbased_eng_ops_arboriculture_forestry_April.xlsx',
 'GK3J23_Landbased_eng_ops_ground_care_April.xlsx',
 'GK42 22  Land-based Engineering (ArboricultureForestry) 1.xlsx',
 'GK43 22  Land-based Engineering (Ground Care) 1.xlsx',
 'GK44 22 Land-based Engineering (Agriculture) 1.xlsx',
 'GK45 23 Land-based Engineering (ArboricultureForestry) 1.xlsx',
 'GK46 23  Land-based Engineering (Ground Care) 1.xlsx',
 'GK47 23  Land-based Engineering (Agriculture) 1.xlsx',
 'GP6H 22 SVQ in LB Eng (Ground Care) at SCQF Level 5.xlsx',
 'GP6J 23 SVQ in LB Eng (Ground Care) at SCQF Level 6.xlsx',
 'GP6K 22 SVQ in LB Eng (Agriculture) at SCQF Level 5.xlsx',
 'GP6L 23 SVQ in LB Eng (Agriculture) at SCQF Level 6.xlsx',
 'GP6M 22 SVQ in LB Eng (Arb and For) at SCQF level 5.xlsx',
 'GP6N 23 SVQ in LB Eng (Arb.Forestry) at SCQF Level 6.xlsx'] ,
['GJ19 24 SVQ Cons Site Mgmt Bld and Civ Eng.xlsx',
 'GJ1C 23 SVQ Cons Site Superv Build & Civ Eng.xlsx'] ,
['201.02.26 SVQ2 Signal and Telecoms.xlsx',
 '201.02.26 SVQ3 Signal and Telecoms.xlsx'] ,
['2015.02.26 SVQ 2  Electrification.xlsx',
 '2015.02.26 SVQ 3  Electrification.xlsx'] ,
['GD0J 23 PEM Instrument & Control SCQF L7.xlsx',
 'GP75 47 SVQ in Process Eng Maint Instrument and Control at SCQF level 7.xlsx',
 'GD0D 22 SVQ in Process Engineering Maintenance (Electrical) at SCQF level 5.xlsx',
 'GD0E 22 PEM Instrument & Control SCQF L5.xlsx',
 'GD0F 22 SVQ in Process Engineering Maintenance (Mechanical) at SCQF level 5.xlsx',
 'GD0G 23 SVQ_PEM_Extension_SCQFL7 (SQA).xlsx',
 'GD0H 23 SVQ in Process Engineering Maintenance (Mechanical) at SCQF level 7.xlsx',
 'GP74 47 SVQ in Process Engineering Maintenance Electrical at SCQF level 7.xlsx',
 'GP76 47 SVQ in Process Engineering Maintenance Mechanical at SCQF level 7.xlsx'] ,
['2016.03.30 GL2N 22 SVQ 2 Fabrication and Welding Engineering QS (EAL).xlsx',
 '2016.04.20 GL3N 23 Reaccred SVQ Fabrication & Welding (EAL).xlsx',
 '20160608 QS Reaccred SVQ F&WE (SQA) GL6F 23.xlsx',
 '20180516 QS Accred SVQ Fab&Weld (QFI) GP07 46.xlsx'],
['GL6E 22 SVQ Performing Engineering Oprations at SCQF L5.xlsx',
 'GM0V 21 SVQ in Performing Engineering Operations at SCQF L4.xlsx',
 'SVQ PEO2 Amend SCQF L5 AC2 EAL.xlsx'] ,
['GM1V 46 Mechanical Manufacturing Engineering Level 6.xlsx'] ,
['2016.04.20 GL3P 23 Reaccred SVQ Marine Engineering (EAL).xlsx'] ,
['2016.03.30 GL2M 23 SVQ 3 Engineering Maintenance Reaccred (EAL).xlsx'] ,
['2016.04.20 GL3M 23 Reaccred SVQ Engineering Tech Support (EAL).xlsx'] ,
['GJ9D 24 Engineering Manufacture.xlsx'] ,
['GJ9C 23 Automotive Engineering.xlsx'] ,
['20180613 QS SVQ Aeronautical Engineering L6 (EAL).xlsx'] ,
['2016.03.09  SVQ 3 Food and Drink Ops Eng Maint (GL46 23) (SQA)v2.xlsx'] ,
['R094 04 Dip Elec Power Eng - Wind Turb Ops and Maint SCQF 6.xlsx',
 'Certificate in Electrical Power Engineering - Distribution and Transmission (Technical Knowledge) at SCQF Level 5.xlsx',
 'R20104 Diploma in Electrical Power Engineering.xlsx',
 'R095 04 Dip Elec Power Eng - Wind Turb Maint (Tech Kno) SCQF 6.xlsx'] ,
['2015.04.01  SVQ 2 Eng Surv Ops.xlsx'] ,
['2015.07.08 GK9M 23 SVQ3 SMCBSE SQA.xlsx']]
# Sanity check by eye: the two printed lengths must be equal.
print(len(svq_semi_manual_names))
print(len(svq_semi_manual_groups))

In [None]:
# Build two look-ups keyed by the manual group name:
#   svq_semi_groups_dict  : the unique units listed by the group's SVQ files
#   svq_semi_groups_dict2 : the unique NOS those units map to via
#                           units_to_nos_last (units without a mapping are
#                           skipped)
# and write each as a CSV with one column per group.
if len(svq_semi_manual_names) != len(svq_semi_manual_groups):
    # The two lists are paired by position, so a length mismatch would either
    # raise an IndexError half-way through or silently ignore trailing names.
    raise ValueError('svq_semi_manual_names and svq_semi_manual_groups differ in length')

svq_semi_groups_dict = {}
svq_semi_groups_dict2 = {}
for ix,g in enumerate(svq_semi_manual_groups):
    group_name = svq_semi_manual_names[ix]
    # save units as they are (computed once and reused for the crosswalk)
    group_units = list(np.unique(flatten_lol(svq_data.loc[g]['units'].values)))
    svq_semi_groups_dict[group_name] = group_units
    # crosswalk to NOS based on joining above
    tmp = [units_to_nos_last[i] for i in group_units if i in units_to_nos_last]
    svq_semi_groups_dict2[group_name] = list(np.unique(tmp))
pd.DataFrame.from_dict(svq_semi_groups_dict, orient = 'index').T.to_csv(
    os.path.join(output_dir, 'svq_semi_manual_groups_units.csv'))
pd.DataFrame.from_dict(svq_semi_groups_dict2, orient = 'index').T.to_csv(
    os.path.join(output_dir, 'svq_semi_manual_groups_crosswalked_nos.csv'))


## Apply HDBSCAN to the similarity matrix overridden with the similarity from the pathways

In [None]:
# perform clustering
# Run configuration for the HDBSCAN section. In the code visible here these
# values are only used to build output file names.

STRATEGY = 'tfidf' #'we' or 'tfidf'
SELECT_MODE = 'engineering'
cluster_name = SELECT_MODE
cluster_name_figs = 'Engineering (using SVQ)'
cluster_name_save = 'engineering_svq'
#print('Computing clusters for {}'.format(cluster_name_figs))

# Reset to an empty dict; the code that used to fill it is disabled below.
suites_in_clus = {}
#groups_clus = df_nos_eng5.groupby('One_suite')
#for name, group in groups_clus:
#    suites_in_clus[name] = list(group['NOS Title'].values)

'''if STRATEGY == 'tfidf':
    # this is to get the restricted corpus (to transform, not for fitting)
    textfortoken = df_nos_n['pruned_lemmas']
    tfidfm_n = tfidf_n.transform(textfortoken)
elif STRATEGY == 'we':
    tfidfm_n = df_nos_n[all_we_cols].values
else:
    raise ValueError
'''
# get labels
# standard_labels_hdb2: the plain NOS titles, one per row of df_nos_eng5.
standard_labels_hdb2 = list(df_nos_eng5['NOS Title'].values)
# use NOS titles for single NOS and group ID for grouped ones?
# cond is True for rows whose index label contains the substring 'group';
# for those rows the label becomes the index value instead of the title.
cond = np.array(df_nos_eng5.index.map(lambda x: 'group' in x).values).astype(bool)
tmp = df_nos_eng5['NOS Title'].mask(cond, df_nos_eng5.index.values)
standard_labels_hdb = list(tmp.values)
#standard_labels_n = list(df_nos_eng5['NOS Title'].mask(
#    df_nos_eng5.index.map(lambda x: 'group' in x), df_nos_eng5.index).values)

# Any label longer than 500 characters is cut down to its first 50 characters.
for ix,t in enumerate(standard_labels_hdb):
    if len(t)>500:
        # manual correction because of pdf extraction
        standard_labels_hdb[ix] = standard_labels_hdb[ix][:50]


# use the two similarity matrices into a clustering input
# WARD selects how eng_svq_distance and eng_keyword_distance are combined:
#   'multiply'  : NaNs in the SVQ distances are replaced by 1 and the result is
#                 multiplied element-wise by the keyword distances.
#   otherwise   : start from the keyword distances and overwrite every entry
#                 for which an SVQ distance exists (is not NaN).
WARD = 'substitute'
if WARD == 'multiply':
    clustering_input = copy.deepcopy(eng_svq_distance)
    clustering_input[np.isnan(clustering_input)] = 1 # replace nans with the max distance
    clustering_input= eng_keyword_distance * clustering_input #multiply by the usual NOS distances
else:
    clustering_input = copy.deepcopy(eng_keyword_distance)
    clustering_input[~np.isnan(eng_svq_distance)] = eng_svq_distance[~np.isnan(eng_svq_distance)]

# Fit HDBSCAN with default parameters on the precomputed distances, expanded
# to square form by squareform.
clusterer = hdbscan.HDBSCAN(metric = 'precomputed')#, prediction_data = True)
clusterer = clusterer.fit(distance.squareform(clustering_input))


In [None]:
# Summary of the HDBSCAN fit: (number of clusters, number of points labelled
# -1, total number of points).
# Cluster labels start at 0 and -1 marks unassigned points, so the number of
# clusters is the largest label plus one (0 when every point is labelled -1);
# the largest label alone under-reports it by one.
print('Nb of clusters, ', 'nb of singletons, ', 'nb of data points')
clusterer.labels_.max() + 1, np.sum(clusterer.labels_ == -1), clusterer.labels_.size

In [None]:
# Histogram of clusterer.probabilities_ (one value per NOS).
plt.hist(clusterer.probabilities_)
plt.ylabel('NOS counts')
plt.xlabel('Certainty of assignment')

In [None]:
# save the results of the hdbscan
# NOTE(review): unlike SAVEHC in the hierarchical cells, this switch is on, so
# running the cell writes files into output_dir.
SAVEHDB = True

# Compact table (id, title, suite, supersuite) with the HDBSCAN label attached.
# The column is still called 'hierarchical', as in the hierarchical-clustering
# cells.
short_df_hdb = df_nos_eng5.reset_index()[['index','NOS Title', 'One_suite','supersuite']]

short_df_hdb['hierarchical'] = clusterer.labels_
short_df_hdb = short_df_hdb.set_index('index')
if SAVEHDB:
    short_df_hdb.to_csv(os.path.join(output_dir, 
                             'hdbscan_nos_cut_labels_in_{}_{}_{}_{}_{}.csv'.format(
        cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)))


# For every HDBSCAN cluster (label -1 is skipped) collect: its sorted titles
# (hdb_dict), its size (L) and the mean of its strictly positive pairwise
# distances (D).
hdb_dict= {}
L = {}
D = {}
# Expand the condensed distances once instead of once per cluster.
square_dist = distance.squareform(clustering_input)
for ic in np.unique(clusterer.labels_):
    if ic<0:
        continue
    # Rows of the cluster; the mask is computed once and reused below.
    in_cluster = (short_df_hdb['hierarchical']==ic).values
    # get titles of the nos inside the cluster. Labels come from np.unique, so
    # every cluster has at least one member and no emptiness check is needed
    # (the former `len(...) < 0` test could never be true).
    tmp_local = [t for t, lab in zip(standard_labels_hdb2, clusterer.labels_) if lab == ic]
    hdb_dict['{}'.format(ic)] = sorted(tmp_local)
    # Within-cluster distance sub-matrix.
    A = square_dist[in_cluster,:][:,in_cluster]
    if A.sum()>0:
        # Keep each pair once (upper triangle) and ignore zero distances.
        A = np.triu(A)
        A = A[A>0]
    else:
        # All members are at distance zero: the average is reported as 1.
        A = np.ones(1)
    D['{}'.format(ic)] = np.around(np.mean(A),3)
    L['{}'.format(ic)] = len(tmp_local)
# 'lenght' (sic) is kept because it is the column name written to the CSV.
L = pd.DataFrame.from_dict(L, orient = 'index', columns = ['lenght'])
D = pd.DataFrame.from_dict(D, orient = 'index', columns = ['avg dist'])
L = L.join(D)
if SAVEHDB:
    # One column per cluster, ordered from the tightest to the loosest.
    L.join(pd.DataFrame.from_dict(hdb_dict, orient = 'index')).sort_values(
        by = 'avg dist', ascending = True).T.to_csv(os.path.join(output_dir,
                            'hdbscan_nos_cut_clusters_in_{}_{}_{}_{}_{}.csv'.format(
                            cluster_name_save,qualifier,paramsn['ngrams'],STRATEGY,WARD)))
    
#hdb_dict = {}
#for i in np.unique(clusterer.labels_):
#    if i<0:
#        continue
#    hdb_dict[i]= [t for ix,t in enumerate(standard_labels_hdb2) if clusterer.labels_[ix] == i]
#pd.DataFrame.from_dict(hdb_dict,orient = 'index').T.to_csv(output_dir + 
#                                                           '/hdbscan_result_all_nos_tfidf_substitute.csv')


In [None]:
# use the two similarity matrices into a clustering input
# WARD selects how eng_svq_distance and eng_keyword_distance are combined:
#   'multiply'  : NaNs in the SVQ distances are replaced by 1 and the result is
#                 multiplied element-wise by the keyword distances.
#   otherwise   : start from the keyword distances and overwrite every entry
#                 for which an SVQ distance exists (is not NaN).
WARD = 'substitute'
if WARD == 'multiply':
    clustering_input = copy.deepcopy(eng_svq_distance)
    clustering_input[np.isnan(clustering_input)] = 1 # replace nans with the max distance
    clustering_input= eng_keyword_distance * clustering_input #multiply by the usual NOS distances
else:
    clustering_input = copy.deepcopy(eng_keyword_distance)
    clustering_input[~np.isnan(eng_svq_distance)] = eng_svq_distance[~np.isnan(eng_svq_distance)]

# get the labels information
# standard_labels_c2: the plain NOS titles, one per row of df_nos_eng5.
standard_labels_c2 = list(df_nos_eng5['NOS Title'].values)
# use NOS titles for single NOS and group ID for grouped ones?
# cond is True for rows whose index label contains the substring 'group';
# for those rows the label becomes the index value instead of the title.
cond = np.array(df_nos_eng5.index.map(lambda x: 'group' in x).values).astype(bool)
tmp = df_nos_eng5['NOS Title'].mask(cond, df_nos_eng5.index.values)
standard_labels_c = list(tmp.values)


In [None]:
# For each similarity threshold, link every pair of NOS whose similarity
# (1 - distance) exceeds the threshold, list the maximal cliques with more
# than four members, and write two CSVs: the titles per clique, and a boolean
# membership table with one column per clique.
all_nos_cliques = []
# The square similarity matrix does not depend on the threshold: build it once.
similarity_matrix = 1 - distance.squareform(clustering_input)
for th in np.arange(0.75,0.91,0.05):
    # Fresh table per threshold; .copy() makes it an independent frame so the
    # membership columns added below never touch a temporary.
    short_df_cliques = df_nos_eng5.reset_index()[['index','NOS Title', 'One_suite','supersuite']].copy()
    t0 = time.time()
    similarity_graph = nx.convert_matrix.from_numpy_array(similarity_matrix>th)
    print_elapsed(t0, 'building the graph')
    nos_cliques = list(nx.find_cliques(similarity_graph))
    nos_cliques = [t for t in nos_cliques if len(t)>4]
    all_nos_cliques.append(nos_cliques)
    print_elapsed(t0, 'finding cliques')
    print(th,len(all_nos_cliques[-1]))
    cliques_dict = {}
    for i, clique in enumerate(nos_cliques):
        cliques_dict[i] = sorted([standard_labels_c2[t] for t in clique])
        # Build the membership mask first and assign the column in one step.
        # The former `short_df_cliques[i].iloc[...] = True` is chained
        # assignment: it may write to a temporary copy and leave the column
        # all False.
        is_member = np.zeros(len(short_df_cliques), dtype = bool)
        is_member[clique] = True
        short_df_cliques[i] = is_member
    pd.DataFrame.from_dict(cliques_dict, orient = 'index').T.to_csv(
        os.path.join(output_dir, 'clusters_by_clique_th{:.2f}.csv'.format(th)))
    short_df_cliques.to_csv(
        os.path.join(output_dir, 'labels_by_clique_th{:.2f}.csv'.format(th)))


In [None]:
# Distribution of the pairwise similarities (1 - distance) and their 99th percentile.
_ = plt.hist((1 - clustering_input).ravel())
print(np.percentile((1 - clustering_input).ravel(),99))

In [None]:
# Shape of the clustering input after conversion with squareform.
distance.squareform(clustering_input).shape

In [None]:

#Loading a pre-trained glove model into gensim
# model should have already been loaded in bg_load_prepare_and_run. 
# If not, load it here    

# Switch: with False the whole block is skipped and model, vector_matrix,
# list_of_terms and lookup_terms must already exist in the kernel.
LOADGLOVE = False
if LOADGLOVE:
    print('Loading glove model')
    t0 = time.time()
    # load the glove model
    # The file is read in word2vec text format from glove_dir; WHICH_GLOVE
    # selects the file (both names are defined elsewhere).
    model = gensim.models.KeyedVectors.load_word2vec_format\
    (os.path.join(glove_dir, 'word2vec.{}.txt'.format(WHICH_GLOVE)))
    #model = api.load("glove-wiki-gigaword-100")  # load pre-trained word-vectors
    # from gensim-data
    #model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
    #word_vectors = model.wv
    print_elapsed(t0, 'loading the glove model')

    # Embedding matrix and the vocabulary list taken from the loaded model.
    vector_matrix = model.vectors
    list_of_terms = model.index2word

    # Vocabulary with underscores turned back into spaces.
    lookup_terms = [convert_from_undersc(elem) for elem in list_of_terms]


