In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


import os
import itertools
import json
import numpy as np
import pandas as pd
import pickle
import requests
import seaborn as sns
import collections
from collections import Counter
import scipy
import time
import copy
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle
import matplotlib.patches as mpatches

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.pca import PCA


import nltk
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
import gensim
import re
from fuzzywuzzy import process


In [None]:
from utils_nos import nesta_colours, nesta_colours_combos
print(nesta_colours, nesta_colours_combos)

In [None]:
# set up plot style
# Print the style sheets known to the installed matplotlib (for reference only).
print(plt.style.available)
# Several style sheets are combined in one call.
# NOTE(review): the bare 'seaborn-*' style names appear to have been renamed in newer
# matplotlib releases ('seaborn-v0_8-*') — confirm they exist in the target environment.
plt.style.use(['seaborn-darkgrid','seaborn-poster','ggplot'])

#### TODOs for data cleaning:

1. remove square brackets
2. make everything lower case


## Overview

This notebook contains a few functions and snippets of code that are useful for analysing text. Most of the techniques used are unsupervised. Functions are defined up front and then used in sections below.

This notebook applies the following techniques:
- Tokenizers (based on n-grams and 'as_is')
- Calculating distance
- Hierarchical clustering and plotting
- K-means clustering
- LSH

This specific instance of the notebook will be applied to the analysis of NOS


In [None]:
# flatten lists of lists
def flatten_lol(t):
    """Flatten one level of nesting: an iterable of iterables becomes a single list."""
    return [item for inner in t for item in inner]
flatten_lol([[1,2],[3],[4,5,6]])


In [None]:
#These two functions are useful for analysing bi and tri-grams with w2v models in gensim

def convert_to_undersc(skill):
    '''
    Convert the spaces in a skill phrase into underscores, to use with a trained
    w2v model.

    This is the inverse of convert_from_undersc. The previous implementation
    joined the words with '-' rather than '_', so the result could not be
    converted back by convert_from_undersc.

    >>> convert_to_undersc('machine learning')
    'machine_learning'
    >>> convert_to_undersc('python')
    'python'
    '''
    # str.replace leaves phrases without spaces untouched, so no special case is needed
    return skill.replace(' ', '_')

def convert_from_undersc(skill):
    '''
    Turn the underscores that separate the terms of a skill phrase back into spaces.
    Phrases without underscores are returned unchanged.
    '''
    return skill.replace('_', ' ')


In [None]:
#A few functions for tidying up text
def tag_for_lemmatise(s):
    """
    Takes a single token (string) as input.
    Tags it with nltk.pos_tag and maps the resulting Penn Treebank tag to the
    one-letter part-of-speech code used by the WordNet lemmatiser
    ('a', 'r', 'n' or 'v').
    Returns 'n' when the tag is not in the mapping or when tagging fails.
    """
    pos_to_wornet_dict = {
        'JJ': 'a',
        'JJR': 'a',
        'JJS': 'a',
        'RB': 'r',
        'RBR': 'r',
        'RBS': 'r',
        'NN': 'n',
        'NNP': 'n',
        'NNS': 'n',
        'NNPS': 'n',
        'VB': 'v',
        'VBG': 'v',
        'VBD': 'v',
        'VBN': 'v',
        'VBP': 'v',
        'VBZ': 'v',
    }
    try:
        tag = nltk.pos_tag([s])[0][1]
    except Exception:
        # best effort: any tagging failure falls back to 'noun'. Catching Exception
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return 'n'
    # tags outside the table (e.g. determiners, prepositions) also default to 'noun'
    return pos_to_wornet_dict.get(tag, 'n')
    
def lemmatise(title_terms):
    """
    Takes a list of terms as input.
    Lemmatises every term with nltk's WordNetLemmatizer (no part of speech is
    passed), mainly to convert plural forms into singular ones.
    Terms listed in keep_asis (a manually curated list of plural nouns) are
    passed through untouched.
    Returns a list.
    >>> lemmatise(['teachers'])
    ['teacher']
    >>> lemmatise(['analytics'])
    ['analytics']
    """
    keep_asis = ['sales', 'years', 'goods', 'operations', 'systems',
                    'communications', 'events', 'loans', 'grounds',
                    'lettings', 'claims', 'accounts', 'relations',
                    'complaints', 'services']
    wnl = nltk.WordNetLemmatizer()
    processed_terms = []
    for term in title_terms:
        if term in keep_asis:
            processed_terms.append(term)
        else:
            processed_terms.append(wnl.lemmatize(term))
    return processed_terms

def lemmatise_with_pos(title_terms):
    """
    Takes a list of (token, POS tag) pairs as input, where the tags are Penn
    Treebank tags such as those found in the 'tagged_tokens' column.
    Lemmatises every token with nltk's WordNetLemmatizer, using the tag to pick
    the WordNet part of speech ('a', 'r', 'n' or 'v').
    Tokens listed in keep_asis (a manually curated list of plural nouns) are
    passed through untouched.
    Tags that are not in the mapping fall back to 'n' (noun), the same fallback
    used by tag_for_lemmatise, instead of raising a KeyError.
    Returns a list of lemmatised tokens (the tags are dropped).
    """
    pos_to_wornet_dict = {
        'JJ': 'a',
        'JJR': 'a',
        'JJS': 'a',
        'RB': 'r',
        'RBR': 'r',
        'RBS': 'r',
        'NN': 'n',
        'NNP': 'n',
        'NNS': 'n',
        'NNPS': 'n',
        'VB': 'v',
        'VBG': 'v',
        'VBD': 'v',
        'VBN': 'v',
        'VBP': 'v',
        'VBZ': 'v',
    }
    keep_asis = ['sales', 'years', 'goods', 'operations', 'systems',
                    'communications', 'events', 'loans', 'grounds',
                    'lettings', 'claims', 'accounts', 'relations',
                    'complaints', 'services']
    wnl = nltk.WordNetLemmatizer()
    processed_terms = []
    for token, tag in title_terms:
        if token in keep_asis:
            processed_terms.append(token)
        else:
            # .get avoids a KeyError for tags outside the table (e.g. 'MD', 'IN')
            processed_terms.append(wnl.lemmatize(token, pos_to_wornet_dict.get(tag, 'n')))
    return processed_terms

def lemmatise_pruned(x, pofs = 'nv'):
    """
    Keep only the (token, tag) pairs of x whose tag starts with an allowed letter,
    then lemmatise them with lemmatise_with_pos.
    pofs = 'nv' keeps tags starting with 'N' or 'V'; pofs = 'n' keeps tags starting
    with 'N' only. Any other value raises ValueError.
    """
    if pofs == 'n':
        wanted_initials = ('N',)
    elif pofs == 'nv':
        wanted_initials = ('V', 'N')
    else:
        raise ValueError
    selected = []
    for token, tag in x:
        if tag[:1] in wanted_initials:
            selected.append((token, tag))
    return lemmatise_with_pos(selected)

def remove_digits(s):
    """
    Takes a string as input.
    Drops every character for which str.isdigit() is true.
    Returns a string.
    >>> remove_digits('2 recruitment consultants')
    ' recruitment consultants'
    """
    kept = []
    for ch in s:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)

def remove_list_enumeration(s):
    '''
    NOS-specific clean-up: the documents contain lists enumerated by strings
    like K+number or P+number. Once the text has been lower-cased and the digits
    removed, these survive as stand-alone " k " and " p " tokens, which are
    replaced here by a single space.
    (It might not be necessary if 'k' and 'p' are added to the stopwords.)
    '''
    result = s
    # 'k' is handled before 'p', as in the original two-step substitution
    for letter in ('k', 'p'):
        result = re.sub('( ' + letter + ' )+', ' ', result)
    return result

# ASCII punctuation to strip; the apostrophe (') is deliberately kept.
# The backslash is written as '\\' throughout: the previous literal contained '\(',
# an invalid escape sequence that newer Python versions warn about.
select_punct = set('!"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~')
# additional non-ASCII symbols (and some accented letters) that are also stripped
extra_chars = set('–-•’”“µ¾âãéˆﬁ[€™¢±ï…˜')
all_select_chars = select_punct.union(extra_chars)
def replace_punctuation(s):
    """
    Takes string as input.
    Replaces every character found in all_select_chars (select_punct plus
    extra_chars) with a single space. Characters are replaced one-for-one, so
    runs of punctuation become runs of spaces.
    Returns a string.
    >>> replace_punctuation('sales executives/ - london')
    'sales executives    london'
    """
    # one pass over the string instead of one str.replace call per character;
    # the table is built at call time so later changes to all_select_chars are honoured
    return s.translate({ord(ch): ' ' for ch in all_select_chars})

def tidy_desc(desc):
    """
    Clean a free-text description and return it as a single space-separated string.
    Steps, in order: delete '\\r\\n' sequences and non-breaking spaces, lower-case,
    remove digits, replace punctuation with spaces, split on whitespace and
    lemmatise each word.
    NOTE(review): '\\r\\n' and '\\xa0' are deleted rather than replaced by a space,
    so words separated only by those characters get glued together — confirm this
    is intended.
    """
    text = desc.replace('\r\n', '')
    text = text.replace('\xa0', '')
    text = replace_punctuation(remove_digits(text.lower()))
    #text = remove_list_enumeration(text)
    return ' '.join(lemmatise(text.split()))

def tokenize(text):
    """
    Takes string as input.
    Returns the list of tokens produced by nltk.word_tokenize. Intended to be
    passed as the tokenizer argument of TfidfVectorizer.
    >>> tokenize('some job title')
    ['some', 'job', 'title']
    """
    return nltk.word_tokenize(text)

def tokenize_asis(some_list):
    """
    Takes list as input.
    Returns a new list with every element converted to lower case; elements are
    not split any further. Intended to be passed as the tokenizer argument of
    TfidfVectorizer when the documents are already lists of terms.

    >>> tokenize_asis(['Accounting', 'Microsoft Excel'])
    ['accounting', 'microsoft excel']
    """
    tokens = []
    for elem in some_list:
        tokens.append(elem.lower())
    return tokens

In [None]:
#This set of functions is useful for identifying terms with highest tf-idf weights 
#in a single document or set of documents

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding
        feature names.

        row: 1-D array of tfidf values, indexed like features.
        features: sequence of feature names.
        Returns a DataFrame with columns 'feature' and 'tfidf', sorted by
        decreasing tfidf. An empty row yields an empty DataFrame with the same
        two columns (previously this raised when assigning the column names).'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # naming the columns in the constructor also works when top_feats is empty
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25, sparse_output = False):
    ''' Return the top n features that on average are most important
        amongst documents in rows
        identified by indices in grp_ids.

        Xtr: sparse tfidf matrix (must support row indexing and .toarray()).
        grp_ids: row indices to average over; None or an empty selection means
            all rows. Lists, numpy arrays and a single integer are accepted.
        min_tfidf: values below this threshold are set to 0 before averaging.
        Returns the DataFrame built by top_tfidf_feats, or that DataFrame wrapped
        in a scipy csr_matrix if sparse_output is True. '''
    # `if grp_ids:` was ambiguous for numpy arrays and false for the index 0;
    # test for "something was selected" explicitly instead
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    top_feats = top_tfidf_feats(tfidf_means, features, top_n)
    if sparse_output:
        # NOTE(review): top_feats contains a column of feature names (strings);
        # confirm that converting it to a csr_matrix is really what is wanted here.
        return scipy.sparse.csr_matrix(top_feats)
    return top_feats

def all_mean_feats(Xtr, grp_ids=None, min_tfidf=0.1):
    ''' Return the average tfidf value of every feature
        amongst documents in rows
        identified by indices in grp_ids.

        Xtr: sparse tfidf matrix (must support row indexing and .toarray()).
        grp_ids: row indices to average over; None or an empty selection means
            all rows. Lists, numpy arrays and a single integer are accepted.
        min_tfidf: values below this threshold are set to 0 before averaging.
        Returns a 1-D array with one mean per feature (column). '''
    # `if grp_ids:` was ambiguous for numpy arrays and false for the index 0;
    # test for "something was selected" explicitly instead
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return tfidf_means

def get_top_words_weights(desc, feature_names, vect, n = 25):
    """
    Transform the documents in desc with the fitted vectoriser vect and return
    the n features with the highest mean tfidf, together with their weights
    (the DataFrame produced by top_mean_feats).
    """
    return top_mean_feats(vect.transform(desc), feature_names, grp_ids = None, top_n = n)

def get_mean_tfidf(desc, vect):
    """
    Transform the documents in desc with the fitted vectoriser vect and return
    the mean tfidf value of every feature (see all_mean_feats, called with its
    default min_tfidf threshold).
    """
    return all_mean_feats(vect.transform(desc), grp_ids = None)

def get_top_words(desc, feature_names, vect, n = 25):
    """
    Same as get_top_words_weights, but return only the feature names (as an
    array), without their weights.
    """
    tfidf_of_desc = vect.transform(desc)
    return top_mean_feats(tfidf_of_desc, feature_names, grp_ids = None, top_n = n)['feature'].values

In [None]:
#Function to parse html

from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """HTML parser that collects the text found between tags."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance buffer. This used to be a class attribute, i.e. a single
        # list shared by every parser instance.
        self.lsData = []

    def handle_data(self, data):
        # called by HTMLParser for every chunk of text content
        self.lsData.append(data)

    def get_data(self):
        """Return all the text collected so far as one string."""
        return ''.join(self.lsData)


def strip_tags(some_html):
    """
    Takes string as input.
    Removes html tags.
    Returns a string.
    >>> strip_tags('<p>Hello <b>world</b></p>')
    'Hello world'
    """
    parser = MyHTMLParser()
    parser.feed(some_html)
    # close() makes the parser process any text it is still buffering (for instance
    # a trailing fragment containing '&'). The previous `s.reset` lacked the call
    # parentheses and therefore did nothing.
    parser.close()
    return parser.get_data()


In [None]:
def print_elapsed(t0_local, task = 'current task'):
    """
    Print how many seconds have passed since t0_local (a time.time() timestamp)
    for the named task, with four decimal places.
    The format spec used to be '{:4f}' (minimum width 4, default six decimals),
    most likely a typo for '{:.4f}', which is what the rest of the notebook uses.
    """
    print('Done with {}. Elapsed time: {:.4f}'.format(task, time.time() - t0_local))
    

In [None]:
# Tags used to build the names of the input and output files below.
# qualifier0 selects the base NLP input file; qualifier selects the pruned-tokens
# file, the stopwords file and the output file names. They currently hold the same value.
qualifier = 'postjoining_final_no_dropped'
qualifier0 = 'postjoining_final_no_dropped'
# parts of speech kept in the pruned text: 'nv' = nouns and verbs, 'n' = nouns only
# (see lemmatise_pruned)
pofs = 'nv'


In [None]:
# Directory where figures and csv results are saved.
# NOTE(review): absolute, user-specific path — must be changed to run on another machine.
output_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/nlp_analysis/'


In [None]:
# Directory holding the pickled input data (NOS dataframes and stopwords).
# NOTE(review): absolute, user-specific path — must be changed to run on another machine.
lookup_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/extracted/'


In [None]:
#Loading a pre-trained glove model into gensim
from gensim.scripts.glove2word2vec import glove2word2vec

# NOTE(review): absolute, user-specific path
glove_dir = '/Users/stefgarasto/Local-Data/wordvecs/'

# One-off conversion (kept for reference, disabled) to make the glove model files
# compatible with gensim.
# NOTE(review): the conversion below refers to the twitter.27B files, while the file
# loaded further down is 'word2vec.glove.6B.100d.txt' — confirm which one is intended.
#for dim in ['25','50','100','200']:
##    glove_file = os.path.join(glove_dir,'glove.twitter.27B.{}d.txt'.format(dim))
#    tmp_file = os.path.join(glove_dir, 'word2vec.glove.twitter.27B.{}d.txt'.format(dim) )
#    _ = glove2word2vec(glove_file, tmp_file)

# Switch: the word vectors are only loaded when this is set to True; with False,
# `model` is never defined by this cell.
LOADGLOVE = False
if LOADGLOVE:
    # load the glove model
    model = gensim.models.KeyedVectors.load_word2vec_format\
    (os.path.join(glove_dir, 'word2vec.glove.6B.100d.txt'))
    #model = api.load("glove-wiki-gigaword-100")  # load pre-trained word-vectors from gensim-data
    #model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
    #word_vectors = model.wv
print('Done')


In [None]:
#Get the NOS data (the API download below is disabled; a local pickle is read instead)
#r2 = requests.get("https://www.instituteforapprenticeships.org/api/fullstandards/")
#df_api= pd.DataFrame(r2.json())
# NOTE(review): read_pickle executes arbitrary code from the file — only load trusted files.
df_nos = pd.read_pickle(lookup_dir + 'all_nos_input_for_nlp_{}.zip'.format(qualifier0))

# load the cleaned and tokenised dataset and attach its columns, aligned on the index.
# Later cells read the columns 'pruned' and 'tagged_tokens'; presumably they come from
# this second file — verify.
df_nos = df_nos.join(pd.read_pickle(lookup_dir + 'all_nos_input_for_nlp_{}_pruned_{}.zip'.format(qualifier,pofs)))
print('Done')


In [None]:
# manually remove "k"s and "p"s from the pruned columns
def remove_pk(x):
    """Return the tokens of x without the stand-alone tokens 'k' and 'p'."""
    kept = []
    for token in x:
        if token in ['k', 'p']:
            continue
        kept.append(token)
    return kept
df_nos['pruned'] = df_nos['pruned'].map(remove_pk)

In [None]:
# create another column where the texts are lemmatised properly:
# each row of 'tagged_tokens' (a list of (token, POS tag) pairs) is filtered by part of
# speech and lemmatised using the tag (see lemmatise_pruned / lemmatise_with_pos)
t0 = time.time()
df_nos['pruned_lemmas'] = df_nos['tagged_tokens'].map(lambda x: lemmatise_pruned(x,pofs))
# elapsed time in seconds
print(time.time()-t0)


In [None]:
df_nos.sample(n=3)


In [None]:
# Load stopwords
# NOTE(review): pickle.load executes arbitrary code from the file — only load trusted files.
# The pickle holds three collections; more_stopwords is not used in the cells visible here.
with open(lookup_dir + 'stopwords_for_nos_{}_{}.pickle'.format(qualifier,pofs),'rb') as f:
    stopwords0, no_idea_why_here_stopwords, more_stopwords = pickle.load(f)
# presumably these are tuples (tuples are concatenated to them below) — verify
stopwords = stopwords0 + no_idea_why_here_stopwords 
stopwords += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„'])
# stopwords0 is the collection that is actually passed to define_tfidf further down;
# `stopwords` above is built but not used in the cells visible here
stopwords0 += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„',"'m", "'re", '£',
                    '&', '1', '@'])
# add every distinct value of the 'Developed By' column as a stopword.
# NOTE(review): the values are added verbatim (no lower-casing or splitting) — confirm
# they can actually match the lower-cased single tokens.
stopwords0 += tuple(set(list(df_nos['Developed By'])))
stopwords0 += tuple(['cosvr'])
# more stopwords from some suites of interest
stopwords0 += tuple(['pdf','dc','db','gov','auspex','december','wa','non','go','get','ask',
                    'thing','ha','hm'])


### Define more functions on how to create the TfIdf vectoriser and matrix

In [None]:
# First, create your TFidfVectorizer model. This doesn't depend on whether it's used on suites or NOS. However,
# it does require that the docs collection is already given as a collection of tokens (tokenizer=tokenize_asis)

#Since we now have not just long strings in our documents, but lists of terms, we will use a different tokenizer
def define_tfidf(params, stopwords):
    """
    Build an (unfitted) TfidfVectorizer for documents that are already lists of tokens.

    params: dict with keys
        'ngrams'    - 'bi' for uni+bigrams, 'tri' for uni+bi+trigrams; any other
                      value gives unigrams only
        'tfidf_max' - passed on as max_df
        'tfidf_min' - passed on as min_df
    stopwords: collection of stop words (list, tuple, set, ...), 'english' or None.

    The tokens are used as they are (tokenizer=tokenize_asis only lower-cases the
    elements of each list).
    """
    # the three branches differed only in the n-gram range
    ngram_ranges = {'bi': (1, 2), 'tri': (1, 3)}
    ngram_range = ngram_ranges.get(params['ngrams'], (1, 1))  # unigrams is the default

    # scikit-learn documents stop_words as 'english', a list or None; the notebook
    # builds its stop words as a tuple, so normalise any other collection to a list
    if stopwords is not None and not isinstance(stopwords, str):
        stopwords = list(stopwords)

    tfidf = TfidfVectorizer(tokenizer=tokenize_asis,
                            lowercase = False,
                            stop_words=stopwords,
                            ngram_range=ngram_range,
                            max_df = params['tfidf_max'],
                            min_df = params['tfidf_min'])
    return tfidf


In [None]:
# now, collect the text to transform
def combine_nos_text(df_nos, col = 'pruned'):
    """
    Concatenate the token lists of all the NOS that belong to the same suite.

    df_nos: DataFrame with a 'One_suite' column and a column `col` holding one list
        of tokens per row.
    Returns a DataFrame indexed by suite name (in groupby order) with a single
    column 'tokens' containing the concatenated list of tokens of that suite.
    """
    row_names = []
    all_joint_tokens = []
    # group by suites and concatenate all docs in it
    for name, group in df_nos.groupby('One_suite'):
        row_names.append(name)
        # iterate over the values directly: looking rows up by label (.loc) returns
        # a Series instead of a list when index labels are duplicated
        all_joint_tokens.append([token for doc_tokens in group[col] for token in doc_tokens])
    # return a dataframe
    return pd.DataFrame({'tokens': all_joint_tokens}, index = row_names)

def get_tfidf_matrix(params, df_nos, tfidf, col = 'pruned'):
    """
    Fit the tfidf vectoriser and return the tfidf matrix.

    params: dict with keys
        'bywhich' - 'docs' (one row per NOS) or 'suites' (one row per suite)
        'mode'    - only read when bywhich == 'suites':
                    'combinedtfidf': the tokens of all NOS in a suite are concatenated
                        and the vectoriser is fitted on the suites (so min_df/max_df
                        count suites, not individual NOS);
                    'meantfidf': the vectoriser is fitted on the individual NOS and
                        each suite row is the mean tfidf of its NOS (get_mean_tfidf).
    df_nos: DataFrame with the token lists in column `col` and a 'One_suite' column.
    tfidf: unfitted vectoriser (see define_tfidf); it is fitted in place.
    Returns (tfidfm, feature_names, tfidf, textfortoken). With bywhich == 'suites'
    the rows follow the order of df_nos.groupby('One_suite').
    Raises ValueError for an unsupported bywhich/mode combination.

    Note: this can simply be used to get the tfidf transform, by setting
    bywhich='docs' (mode is then ignored).
    TODO: allow for the alternative case, where the transform is computed on
    individual NOS and then applied to the joint tokens.
    """
    t0 = time.time()
    bywhich = params['bywhich']
    # 'mode' is only consulted at suite level, as before
    mode = params['mode'] if bywhich == 'suites' else None
    mean_over_suites = (bywhich == 'suites') and (mode == 'meantfidf')

    # first, get the collection of tokenised documents to fit on
    if bywhich == 'docs' or mean_over_suites:
        textfortoken = df_nos[col]
    elif bywhich == 'suites' and mode == 'combinedtfidf':
        textfortoken = combine_nos_text(df_nos, col)['tokens']
    else:
        # previously this fell through and failed later with an UnboundLocalError
        raise ValueError('Unsupported combination: bywhich={!r}, mode={!r}'.format(
            bywhich, params.get('mode')))

    # apply tfidf transform to the tokenised text
    tfidfm = tfidf.fit_transform(textfortoken)

    # The feature names are needed below, so get them right after fitting.
    # get_feature_names_out replaces get_feature_names in newer scikit-learn;
    # a list is returned either way so that callers can keep using .index().
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = list(tfidf.get_feature_names_out())
    else:
        feature_names = tfidf.get_feature_names()

    # if the average is needed, compute it and overwrite the matrix. Note that the
    # fit above is still needed to initialise the tfidf transform with the proper
    # features and stopwords
    if mean_over_suites:
        suite_groups = df_nos.groupby('One_suite')
        tfidfm = scipy.sparse.lil_matrix((suite_groups.ngroups, len(feature_names)),
                                         dtype = np.float32)
        # enumerate gives the row of each suite; the column `col` is used (not a
        # hard-coded 'pruned') so that the rows match the fitted vocabulary
        for igroup, (name, group) in enumerate(suite_groups):
            tfidfm[igroup] = get_mean_tfidf(group[col], tfidf)

    print_elapsed(t0, 'computing the feature vector')
    return tfidfm, feature_names, tfidf, textfortoken


### Load the file with the list of super-suites and match the suites listed inside

In [None]:
# Excel workbook with one sheet per super-suite; each sheet lists the NOS suites in it.
# NOTE(review): absolute, user-specific path
super_suites_files=  '/Users/stefgarasto/Google Drive/Documents/data/NOS_meta_data/NOS_Suite_Priority.xlsx'
super_suites_names = ['Engineering','Management','FinancialServices','Construction']
all_super_suites = {}
for which_super_suite in super_suites_names:
    sheet = pd.read_excel(super_suites_files, sheet_name = which_super_suite)
    # Normalise the suite names before fuzzy matching: drop parentheses, spell out '&',
    # trim and lower-case. The closing parenthesis is now removed too: the original
    # chain removed '(' twice and never ')'.
    # NOTE(review): this can shift fuzzy-match scores slightly for names containing ')';
    # re-check the match report printed by the matching cell.
    sheet['NOS Suite name'] = sheet['NOS Suite name'].map(
        lambda x: x.replace('(','').replace(')','').replace('&','and').strip().lower())
    all_super_suites[which_super_suite] = sheet


In [None]:
# Match every suite listed in the super-suite sheets to one of the suite names found in
# df_nos. Results, per super-suite:
#   all_matches[name]     -> list of positions in standard_labels
#   all_match_names[name] -> the corresponding suite names
standard_labels = list(df_nos.groupby('One_suite').groups.keys())
all_matches = {}
all_match_names = {}
#match_name = []
for which_super_suite in super_suites_names:
    all_matches[which_super_suite] = []
    for suite in all_super_suites[which_super_suite]['NOS Suite name'].values:
        # do manually some selected suites
        # (.index raises ValueError if 'general insurance' is not among the labels)
        if 'insurance claims' in suite:
            tmp = standard_labels.index('general insurance')
            all_matches[which_super_suite].append(tmp)
            continue
        # for the "management and leadership marketing 2013" both marketing and marketing 2013 would fit,
        # but I'm only taking the latter
        # find a fuzzy match between the sheet name and the suite labels:
        # out holds up to 3 (label, score) candidates, best first
        out = process.extract(suite, standard_labels, limit=3)
        if len(out) and out[0][1]>89:
            # note: most of them are above 96% similarity (only one is 90%)
            tmp = standard_labels.index(out[0][0])
            #print(suite, out[0])
            if tmp not in all_matches[which_super_suite]:
                all_matches[which_super_suite].append(tmp)
            else:
                # the best candidate has already been assigned to another sheet entry
                if suite == 'installing domestic fascia, soffit, and bargeboards':
                    # this suite is kind of a duplicate - I aggregated it in my suites list
                    continue
                # NOTE(review): this takes the THIRD candidate (out[2]), skipping out[1],
                # and does not check its score — confirm this is intended. It also raises
                # IndexError if fewer than three candidates were returned.
                tmp = standard_labels.index(out[2][0])
                all_matches[which_super_suite].append(tmp)
                print(out[0][0],',',out[1][0],',',out[2][0],',',suite)
        else:
            # no candidate scored above 89: report it and move on
            print(suite, ' not found')
            print(out)
            print('\n')
    # number of matched suites vs number of suites listed in the sheet
    print(len(all_matches[which_super_suite]),len(all_super_suites[which_super_suite]))
    all_match_names[which_super_suite] = [standard_labels[t] for t in all_matches[which_super_suite]]
    #print(super_suites['NOS Suite name'].values)


## Relationships between standards

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial import distance
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster 


In [None]:
# if we only want to cluster one suite
#steel_construction = df_nos[df_nos['One_suite'] == 'steelfix construction']


In [None]:
sns.set_style("whitegrid")


In [None]:
SAVEHC = False


In [None]:
#We calculate cosine distance between tf-idf vectors of the documents

def do_hierarch_clustering(tfidfm, DOPLOTS = True):
    """
    Average-linkage hierarchical clustering of the rows of a tfidf matrix, based
    on the pairwise cosine distances.

    tfidfm: sparse or dense matrix, one row per document/suite.
    DOPLOTS: if True, show the cosine distance matrix and the cophenetic distance
        matrix as images.
    Returns (distances, linkage_matrix, c, coph_dists): the condensed cosine
    distances, the linkage matrix, the cophenetic correlation coefficient and the
    condensed cophenetic distances.

    Author's timing notes: the pairwise distances grow quadratically with the number
    of rows; a 4000 x 4000 problem took approximately 110 seconds, so the full set
    of ~22500 individual NOS was estimated at several hours.
    """
    t0 = time.time()

    # pdist needs a dense array; test for sparsity explicitly instead of using a
    # bare except around .todense()
    sparse_flag = scipy.sparse.issparse(tfidfm)
    dense_tfidfm = tfidfm.toarray() if sparse_flag else tfidfm
    distances = distance.pdist(dense_tfidfm, metric = 'cosine')
    print_elapsed(t0, 'calculating cosine distances of tfidf vectors')

    #We then build linkage matrix using the distances and specifying the method. For euclidean distances typically
    # 'Ward' produces best results. For cosine we can only use 'average' and 'single'.
    # (distances is already a condensed distance vector, so scipy documents the metric
    # argument as ignored here; it is kept for readability)
    linkage_matrix = scipy.cluster.hierarchy.linkage(distances,
                                                     method = 'average',
                                                     metric = 'cosine')
    print_elapsed(t0, 'hierarchical clustering of cosine distances')
    #We can test how well the groupings reflect actual distances. If c > 0.75 this is considered to be sufficiently
    #good representation
    # reuse the distances computed above rather than recomputing them
    c, coph_dists = cophenet(linkage_matrix, distances)

    print_elapsed(t0, 'computing the cophenetic correlation')

    if DOPLOTS:
        fig, ax = plt.subplots(figsize = (5,5))
        im = ax.imshow(distance.squareform(distances))
        ax.set_title('cosine distances between suites')
        fig.colorbar(im, ax = ax)

        fig, ax = plt.subplots(figsize = (5,5))
        im = ax.imshow(distance.squareform(coph_dists))
        ax.set_title('cophenetic distances between suites')
        fig.colorbar(im, ax = ax)
    print('The cophenetic coefficient is {:.4f}'.format(c))
    return distances, linkage_matrix, c, coph_dists



## Choosing parameters for features extraction

ngrams : uni/bi/tri

tfidf thresholds: min and max percentage

which parts of speech were selected before

whether we are working at the level of suites or of individual NOS, and how we aggregate NOS to form the suite level


In [None]:
# change the parameters if needed
params = {}
params['ngrams'] = 'bi'        # 'bi' -> unigrams + bigrams (see define_tfidf)
params['pofs'] = 'nv'          # recorded here only; not read by define_tfidf / get_tfidf_matrix
params['tfidf_min'] = 3        # passed on as min_df
params['tfidf_max'] = 0.4      # passed on as max_df

params['bywhich'] = 'suites' #'docs' #'suites'
params['mode'] = 'combinedtfidf' #'tfidf' #'meantfidf' #'combinedtfidf' #'meantfidf'

# get the transform tfidf
tfidf = define_tfidf(params, stopwords0)

# get the matrix again (even though if the parameters stay the same, this one is the same still)
# get the features
tfidfm, feature_names, tfidf, textfortoken = get_tfidf_matrix(params, df_nos, tfidf, 
                                                              col = 'pruned_lemmas')

# get labels: one per row of tfidfm (suite names, or NOS titles when working by docs)
if params['bywhich'] == 'suites':
    standard_labels = list(df_nos.groupby('One_suite').groups.keys())
else:
    standard_labels = list(df_nos['NOS Title'].values)
    

In [None]:
# check best features in one suite:
# for each suite in suites2check, collect the 50 features with the highest tfidf value
# and save them all in one csv (one row per suite after the transpose).
# Entries starting with 'stop' are not suites: they produce a row of '-' that acts as a
# visual separator between groups of suites.
suites2check = ['pension scheme trusteeship','pension trustee board secretaryship',
    'secretary to the trustees of pension funds', 
    'trustee and management committee', 'stop1','investment operations',
    'investment strategy and management','providing financial advice and financial planning',
    'generic financial advice','providing advice on savings for retirement','stop2',
    'electricity network control engineer','gas network construction',
    'gas networks engineering management','leakage detection and control',
    'multi utility network construction','network construction operations',
    'utilities network planning and management','utility infrastructure management']
tmp_df = []
if True:
    # assumes tfidfm was computed by suites, so that its rows line up with these labels
    standard_labels = list(df_nos.groupby('One_suite').groups.keys())
    for i,suite0 in enumerate(suites2check):
        print(suite0)
        if suite0[:4] == 'stop':
            tmp_list = ['-']*50
        else:
            #standard_labels2 = textfortoken.index.values
            # row of this suite in the tfidf matrix (ValueError if the suite is unknown)
            s_idx = standard_labels.index(suite0) #'business continuity management 2013')
            # column vector with the tfidf values of this suite
            TF= tfidfm[s_idx,:].T.todense()
            # feature indices sorted by increasing tfidf, as a column; then reversed so
            # that the highest values come first
            sort_ix = np.array(np.argsort(TF.T).ravel()).T
            sort_ix = sort_ix[::-1]
            tmp_list = [feature_names[ix[0]] for ix in sort_ix[:50]]
            #print(tmp_list)
        # one column per suite, joined on the default 0..49 index
        if i == 0:
            tmp_df = pd.DataFrame(tmp_list, columns = [suite0])
        else:
            tmp_df = tmp_df.join(pd.DataFrame(tmp_list, columns = [suite0]))
        #for ix in sort_ix[:50]: #,t in enumerate(feature_names[sort_ix]):
        #    ix = ix[0]
        #    if TF[ix]>0.05:
        #        print(feature_names[ix])
        #        break
        print('*'*70)
    #for it in range(len(standard_labels)):
    #    print(standard_labels[it],',',standard_labels2[it])
#print(np.sort(TF,axis = 0))
#tmp_df = pd.merge(tmp_df)
# NOTE(review): hard-coded absolute path; it is the same directory as output_dir
tmp_df.T.to_csv('/Users/stefgarasto/Google Drive/Documents/results/NOS/nlp_analysis/tmp_best_keywords.csv')

In [None]:
feature_names.index('engineer'), len(feature_names), tfidfm.shape

In [None]:
#collections.Counter(textfortoken.loc['animal care v2'])

In [None]:
# perform hierarchical clustering
distances, linkage_matrix, c, _ = do_hierarch_clustering(tfidfm)


In [None]:
distances.shape


In [None]:
#Plot the dendrogram (horizontal, truncated to p=20 levels)
fig, ax = plt.subplots(figsize=(10, 80)) # set size
# note: `ax` is rebound to the dictionary returned by dendrogram
ax = dendrogram(linkage_matrix, 
                labels = [t.capitalize() for t in standard_labels], 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = 0,
               truncate_mode = 'level', p =20,
               above_threshold_color = 'k');

plt.tick_params(axis= 'y',
                labelsize = 13)
plt.title('Hierarchical Clustering Dendrogram of Standards', fontsize = 20)
plt.xlabel('Distance', fontsize = 20)
plt.ylabel('NOS suites',fontsize = 20)
# saving is disabled: `and False` makes this branch unreachable whatever SAVEHC is
if SAVEHC and False:
    plt.savefig(os.path.join(output_dir, 'nos_aggregated_dendrogram_{}_{}_{}.png'.format(qualifier,params['bywhich'],
                                                    params['mode'])), bbox_inches = "tight")   
    

In [None]:
#Re-plot the dendrogram on a taller figure and with more levels (p=43), then colour the
# leaf labels according to the super-suite they belong to
fig, ax = plt.subplots(figsize=(10, 160)) # set size
# note: `ax` is rebound to the dictionary returned by dendrogram
ax = dendrogram(linkage_matrix, 
                labels = [t.capitalize() for t in standard_labels], 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = 0,
               truncate_mode = 'level', p =43,
               above_threshold_color = 'k');

plt.tick_params(axis= 'y',
                labelsize = 12)
plt.title('Hierarchical Clustering Dendrogram of Standards', fontsize = 20)
plt.xlabel('Distance', fontsize = 20)
plt.ylabel('NOS suites',fontsize = 20)
# T[1] holds the y tick labels, i.e. the leaf labels of the dendrogram
T = plt.yticks()
super_suites_colours = {'Management': nesta_colours[1], 'Engineering': nesta_colours[3],
                        'FinancialServices':nesta_colours[6], 'Construction': nesta_colours[4]}
for t in T[1]:
    for which_super_suite in super_suites_names:
        # labels were capitalised for display, so lower-case them before the lookup;
        # the first super-suite (in super_suites_names order) that contains the suite wins
        if t.get_text().lower() in all_match_names[which_super_suite]:
            #print(t,',',which_super_suite)
            #plt.text(t,'r')
            t.set_color(super_suites_colours[which_super_suite])
            break

# saving is disabled: `and False` makes this branch unreachable whatever SAVEHC is
if SAVEHC and False:
    plt.savefig(os.path.join(output_dir, 'nos_aggregated_dendrogram_{}_{}_{}.pdf'.format(qualifier,params['bywhich'],
                                                    params['mode'])), bbox_inches = "tight")   
    

In [None]:
# Disabled debugging snippet: for every suite matched to the 'Engineering' super-suite,
# print the suite name followed by all the features whose tfidf value exceeds 0.08
if False:
    for t in all_matches['Engineering']:
        TF= tfidfm[t,:].T.todense()
        print(standard_labels[t])
        for ix,fn in enumerate(feature_names):
            if TF[ix]>0.08:
                print(fn)
        print('\n')

In [None]:
# extract the super-suites subset from the tfidf matrix and compute, for every pair of
# super-suites, summary statistics of the pairwise cosine distances between their suites.
# NOTE: despite the name, average_super_similarity holds cosine DISTANCES
# (cdist with metric='cosine'), stored as (mean, median, std), each rounded to 2 decimals.
average_super_similarity = {}
tfidf_super = {}
for which_super_suite in super_suites_names:
    # rows of the suites that belong to this super-suite
    tfidf_super[which_super_suite] = tfidfm[all_matches[which_super_suite],:]

for which_super_suite in super_suites_names:
    average_super_similarity[which_super_suite] = {}
    for which_super_suite2 in super_suites_names:
        A = distance.cdist(tfidf_super[which_super_suite].todense(),
                           tfidf_super[which_super_suite2].todense(), metric = 'cosine')
        N = A.shape
        if which_super_suite == which_super_suite2:
            # within one super-suite: only the strict upper triangle, so that every pair
            # is counted once and the zero self-distances are left out
            B = A[np.triu_indices(N[0],1)]
        else:
            # between two different super-suites: all the pairs
            B = A#[np.triu_indices(N[0],m=N[1])]
        average_super_similarity[which_super_suite][which_super_suite2] = (np.around(np.mean(B),2),
                                                                           np.around(np.median(B),2),
                                                                           np.around(np.std(B),2))
print('Done')


In [None]:
# check that the cosine distances I computed are correct
# Disabled sanity check: recomputes a cosine similarity by hand (dot product divided by
# the two norms) for suites of one super-suite and prints diagnostics.
# NOTE(review): because of the two `break` statements only the first outer iteration
# (ia = 2) runs, and its inner loop stops at ia2 == 10 — looks like leftover debugging code.
if False:
    s1='Construction'
    A = tfidfm[all_matches[s1],:].todense()
    print(A.shape)
    B = np.zeros((A.shape[0],A.shape[0]))
    Bl = []
    for ia in range(2,A.shape[0]):
        a1 = A[ia].T
        a1 = np.asarray(a1)
        for ia2 in range(ia+1,A.shape[0]):
            a2 = A[ia2].T
            a2 = np.asarray(a2)
            # norms of the two tfidf vectors
            den1 = np.sqrt((np.asarray(a1.T)*np.asarray(a1.T)).sum())
            den2 = np.sqrt((np.asarray(a2.T)*np.asarray(a2.T)).sum())
            # cosine similarity = dot product / (norm1 * norm2)
            B[ia,ia2] = ((np.asarray(a1.T)*np.asarray(a2.T)).sum())/den1/den2
            num = (np.asarray(a1.T)*np.asarray(a2.T)).sum()
            Bl.append(num/den1/den2)
            if ia2==10:
                print(np.corrcoef(a1,a2)[0,1])
                #plt.plot(a1,a2,'o')
                #plt.plot(a2)
                print(Bl[-1],num,den1,den2)
                print(standard_labels[all_matches[s1][ia2]])
                print([t for ix,t in enumerate(feature_names) if a2[ix]>.08])
                print(standard_labels[all_matches[s1][ia]])
                print([t for ix,t in enumerate(feature_names) if a1[ix]>.08])
                break
        break
    plt.matshow(B)
    plt.colorbar()
    # 1 - mean similarity, i.e. the mean cosine distance over the pairs visited
    print(1-np.mean(Bl))

In [None]:
print(average_super_similarity)
pd.DataFrame.from_dict(average_super_similarity)

If we wanted to cut the dendrogram at a certain distance threshold, we would use fcluster as shown below

In [None]:

# cut the dendrogram at a cosine distance of 0.8: one flat cluster id per row of the
# clustered matrix
labels_h = fcluster(linkage_matrix, 0.8, criterion='distance')

if params['bywhich']== 'suites':
    short_df = pd.DataFrame(standard_labels)
    short_df.columns = ['Suite_names']
else:
    # take a copy of the two columns: the previous bare `.iloc` returned an indexer
    # object rather than a DataFrame, so the column assignment below could not work
    short_df = df_nos[['NOS Title', 'One_suite']].copy()

short_df['hierarchical'] = labels_h

# number of distinct clusters obtained with this threshold
n_clusters = len(collections.Counter(labels_h))
print(n_clusters)


In [None]:
# collect (and optionally save) the result of the cut dendrogram:
# hierarchical_dict maps each cluster id (as a string, '1'..'n_clusters') to the array
# of suite names assigned to that cluster.
# NOTE(review): relies on the 'Suite_names' column, which short_df only has when
# params['bywhich'] == 'suites'.
hierarchical_dict= {}
for ic in range(1,n_clusters+1):
    hierarchical_dict['{}'.format(ic)] = short_df['Suite_names'][short_df['hierarchical']==ic].values
if SAVEHC:
    # one csv row per cluster; output_dir already ends with '/', so the resulting path
    # contains a double slash
    pd.DataFrame.from_dict(hierarchical_dict, orient = 'index').to_csv(output_dir +
                            '/Hierarchical_results_{}_{}_{}.csv'.format(qualifier,params['bywhich'],params['mode']))
    

## Perform hierarchical clustering on all suites that belong to at least one super-suite 

In [None]:
def assign_supersuite(x):
    """
    Return the lower-cased name of the first super-suite (in the order of
    all_match_names) whose list of matched suites contains x, or 'other' if the
    suite x belongs to none of them.
    """
    matched = next((supersuite for supersuite, suites in all_match_names.items()
                    if x in suites), None)
    if matched is None:
        # if no match has been found
        return 'other'
    return matched.lower()

def adjustsoccode(x):
    """
    Convert x to a string, take the first run of digits and/or apostrophes and return
    it without its first and last character (i.e. without the surrounding quotes when
    x looks like "['2136']").
    Returns np.nan when x contains no digit or apostrophe.
    NOTE(review): an unquoted code also loses its first and last character
    (2136 -> '13') — confirm that inputs are always quoted.
    """
    found = re.search(r"[\d']+", str(x))
    if found is None:
        return np.nan
    return found.group(0)[1:-1]

def _leading_number(x, n_chars=None):
    """Return float(x[:n_chars]) for a string x (n_chars=None uses the whole string);
    np.nan when x is not a string or the prefix is not a valid number."""
    if not isinstance(x, str):
        return np.nan
    try:
        return float(x[:n_chars])
    except ValueError:
        # only the conversion failure is expected here (was a bare except)
        return np.nan

def extract2digits(x):
    """First two characters of the code string x as a float, or np.nan."""
    return _leading_number(x, 2)

def extract3digits(x):
    """First three characters of the code string x as a float, or np.nan."""
    return _leading_number(x, 3)

def extract1digits(x):
    """First character of the code string x as a float, or np.nan."""
    return _leading_number(x, 1)

def extract4digits(x):
    """The whole code string x as a float, or np.nan.
    Note: despite the name, the string is not truncated to four characters."""
    return _leading_number(x)

In [None]:
# extract relevant dataset: keep only the NOS whose suite belongs to a super-suite
df_nos['supersuite'] = df_nos['One_suite'].apply(assign_supersuite)
# take an explicit copy: new columns are added to df_nos_select further down, and
# doing that on a filtered slice of df_nos triggers pandas' SettingWithCopyWarning
df_nos_select = df_nos[~(df_nos['supersuite']=='other')].copy()
print(len(df_nos_select))


In [None]:
# parameters for the tf-idf on the selected suites (change them here if needed)
params = {
    'ngrams': 'uni',
    'pofs': 'nv',
    'tfidf_min': 3,
    'tfidf_max': 0.6,
    'bywhich': 'suites',      # alternative: 'docs'
    'mode': 'combinedtfidf',  # alternatives: 'tfidf', 'meantfidf'
}

# build the tf-idf transform
tfidf_select = define_tfidf(params, stopwords0)

# compute the tf-idf matrix and the feature names for the selected NOS
# (the transform is returned again; the fourth returned value is not needed)
tfidfm_select, feature_names_select, tfidf_select, _ = get_tfidf_matrix(
    params, df_nos_select, tfidf_select, col= 'pruned_lemmas')

# row labels: suite names when working by suite, NOS titles otherwise
standard_labels_select = (
    list(df_nos_select.groupby('One_suite').groups.keys())
    if params['bywhich'] == 'suites'
    else list(df_nos_select['NOS Title'].values))
    

In [None]:
# compute again which suite is in which super-suite from this subsample:
# all_matches_select[super_suite] holds the positions in standard_labels_select of
# the labels that appear in all_match_names[super_suite]
tmp = np.arange(len(standard_labels_select))
all_matches_select = {}
# NOTE(review): the super-suite names are hard-coded here, while other cells iterate
# over super_suites_names -- presumably the same four names, confirm
for super_suite in ['Engineering','Management','Construction','FinancialServices']:
    # the list of booleans acts as a mask over the positions in tmp
    all_matches_select[super_suite] = tmp[[t in all_match_names[super_suite] for t in standard_labels_select]]


In [None]:
# check best features in one suite
# (disabled: the `if False` guard means this block never runs; switch it to True to
# print the features whose tf-idf weight is above 0.05 for the row s_idx)
if False:
    #standard_labels_select = list(df_nos_select.groupby('One_suite').groups.keys())
    #standard_labels2 = textfortoken.index.values
    s_idx = 0#standard_labels_select.index('business continuity management 2013')
    # column vector with the tf-idf weights of the chosen row
    TF= tfidfm_select[s_idx,:].T.todense()
    print(standard_labels_select[s_idx])
    for ix,t in enumerate(feature_names_select):
        if TF[ix]>0.05:
            print(t)
    #for it in range(len(standard_labels)):
    #    print(standard_labels[it],',',standard_labels2[it])

In [None]:
# quick check: position of the feature 'engineer' (raises ValueError if it is not in
# the vocabulary), number of features and shape of the tf-idf matrix
feature_names_select.index('engineer'), len(feature_names_select), tfidfm_select.shape

In [None]:
# perform hierarchical clustering on the tf-idf matrix of the selected suites
# (do_hierarch_clustering is defined elsewhere in the notebook; it returns four
# values here, the last of which is discarded)
distances_select, linkage_matrix_select, c_select, _ = do_hierarch_clustering(tfidfm_select)


In [None]:
# size of the distances returned by the clustering step (display only)
distances_select.shape


In [None]:
#Plot the dendrogram (cutting at threshold)
# cutting_th is used as the colour threshold of the plot; the same value is
# passed to fcluster in a later cell to obtain the flat clusters
cutting_th = 0.71
fig, ax = plt.subplots(figsize=(10, 80)) # set size
# NOTE(review): `ax` is rebound to the return value of dendrogram() (presumably
# scipy's, which returns a dict), so it no longer refers to the Axes afterwards
ax = dendrogram(linkage_matrix_select, 
                labels = [t.capitalize() for t in standard_labels_select], 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = cutting_th,
               truncate_mode = 'level', p =30)#,
               #above_threshold_color = 'k');

plt.tick_params(axis= 'y',
                labelsize = 13)
plt.title('Hierarchical Clustering Dendrogram of Selected Suites', fontsize = 20)
plt.xlabel('Distance', fontsize = 20)
plt.ylabel('NOS suites',fontsize = 20)
# `or True` forces the figure to be saved whatever the value of SAVEHC
if SAVEHC or True:
    plt.savefig(os.path.join(output_dir, ''.join(['suitesnos_clusters_nv_final_no_dropped/',
                    'all_supersuites_nos_cut_dendrogram_{}_{}_{}_{}.png'.format(qualifier,
        params['bywhich'],params['tfidf_max'],params['ngrams'])])), bbox_inches = "tight")   
    

In [None]:
#Re-plot the dendrogram (no cutting)
# all links are drawn in black (color_threshold = 0, above_threshold_color = 'k');
# the tick labels are then coloured by the super-suite of each suite
fig, ax = plt.subplots(figsize=(10, 80)) # set size
# NOTE(review): as in the previous plot, `ax` is rebound to dendrogram()'s return value
ax = dendrogram(linkage_matrix_select, 
                labels = [t.capitalize() for t in standard_labels_select], 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = 0,
               truncate_mode = 'level', p =43,
               above_threshold_color = 'k');

plt.tick_params(axis= 'y',
                labelsize = 12)
plt.title('Hierarchical Clustering Dendrogram of Selected Suites', fontsize = 20)
plt.xlabel('Distance', fontsize = 20)
plt.ylabel('NOS suites',fontsize = 20)
# plt.yticks() returns (locations, labels); T[1] holds the label text objects
T = plt.yticks()
super_suites_colours = {'Management': nesta_colours[1], 'Engineering': nesta_colours[3],
                        'FinancialServices':nesta_colours[6], 'Construction': nesta_colours[4]}
for t in T[1]:
    for which_super_suite in super_suites_names:
        # labels were capitalised for plotting: lower-case them again before the lookup
        if t.get_text().lower() in all_match_names[which_super_suite]:
            #print(t,',',which_super_suite)
            #plt.text(t,'r')
            t.set_color(super_suites_colours[which_super_suite])
            # use the colour of the first super-suite that contains the suite
            break

# `or True` forces the figure to be saved whatever the value of SAVEHC
if SAVEHC or True:
    plt.savefig(os.path.join(output_dir, ''.join(['suitesnos_clusters_nv_final_no_dropped/',
                             'all_supersuites_nos_dendrogram_{}_{}_{}_{}.pdf'.format(
        qualifier,params['bywhich'], params['tfidf_max'],params['ngrams'])])), 
                bbox_inches = "tight")   
    

In [None]:
# for every suite in the Engineering super-suite, print its name followed by the
# features whose tf-idf weight is above 0.08 (the `if True` guard is an on/off switch)
if True:
    for t in all_matches_select['Engineering']:
        # column vector with the tf-idf weights of suite t
        TF= tfidfm_select[t,:].T.todense()
        print(standard_labels_select[t])
        for ix,fn in enumerate(feature_names_select):
            if TF[ix]>0.08:
                print(fn)
        print('\n')

In [None]:

# cut the dendrogram of the selected suites at the threshold used for the plot above
labels_h = fcluster(linkage_matrix_select, cutting_th, criterion='distance')

if params['bywhich']== 'suites':
    short_df = pd.DataFrame(standard_labels_select)
    short_df.columns = ['Suite_names']
else:
    # take an independent copy of the columns: the bare `.iloc` attribute is an
    # indexer object, not a DataFrame, so a new column could not be added to it
    short_df = df_nos_select[['NOS Title', 'One_suite', 'supersuite']].copy()

# one cluster label per row (suite or NOS, depending on params['bywhich'])
short_df['hierarchical'] = labels_h

# number of members of each flat cluster
cluster_sizes = collections.Counter(labels_h)
n_clusters = len(cluster_sizes)
print(n_clusters, len(standard_labels_select), cluster_sizes)

print(output_dir)


In [None]:
# Summarise the result of the cut dendrogram. For every flat cluster collect:
#  - hierarchical_dict: the names of the suites in the cluster
#  - L: the number of suites in the cluster
#  - D: the average pairwise distance between its suites (set to 1 when the
#       distances inside the cluster sum to zero, e.g. for single-suite clusters)
hierarchical_dict= {}
L = {}
D = {}
# the square distance matrix is the same for every cluster: build it only once
square_distances_select = distance.squareform(distances_select)
for ic in range(1,n_clusters+1):
    ic_key = '{}'.format(ic)
    # boolean mask of the rows that belong to cluster ic
    in_cluster = (short_df['hierarchical']==ic).values
    hierarchical_dict[ic_key] = list(short_df['Suite_names'][in_cluster].values)
    # distances between the members of this cluster only
    A = square_distances_select[in_cluster,:][:,in_cluster]
    if A.sum()>0:
        # keep each pair once (upper triangle) and drop the zero entries
        A = np.triu(A)
        A = A[A[:]>0]
    else:
        A = np.ones(1)
    D[ic_key] = np.around(np.mean(A),3)
    L[ic_key] = in_cluster.sum()
# NOTE(review): 'lenght' is a typo for 'length'; it is left unchanged because it ends
# up as a column name in the saved CSV, which other code or people may rely on
L = pd.DataFrame.from_dict(L, orient = 'index', columns = ['lenght'])
D = pd.DataFrame.from_dict(D, orient = 'index', columns = ['avg dist'])
L = L.join(D)
# `or True` forces the results to be saved whatever the value of SAVEHC
if SAVEHC or True:
    # cluster label of every suite
    short_df.to_csv(output_dir + '/suitesnos_clusters_nv_final_no_dropped' + 
                            '/all_supersuites_hierarchical_results_{}_{}_{}_{}.csv'.format(
                            qualifier,params['bywhich'],params['tfidf_max'],params['ngrams']))
    # one row per cluster (size, average distance, members), tightest clusters first
    L.join(pd.DataFrame.from_dict(hierarchical_dict, orient = 'index')).sort_values(
        by='avg dist', ascending=True).to_csv(output_dir +
                            '/suitesnos_clusters_nv_final_no_dropped'+ 
                            '/all_supersuites_clusters_{}_{}_{}_{}.csv'.format(
                            qualifier,params['bywhich'],params['tfidf_max'],params['ngrams']))
    

In [None]:
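# Check the best (highest tf-idf) features of every suite in each multi-suite cluster.
# The string literal just below is a disabled, hand-written list of suites that is kept for reference only.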
'''
suites2check = ['pension scheme trusteeship','pension trustee board secretaryship',
    'secretary to the trustees of pension funds', 
    'trustee and management committee', 'stop1','investment operations',
    'investment strategy and management','paraplanning','providing financial advice and financial planning',
    'generic financial advice','providing advice on savings for retirement','stop2',
    'electricity network control engineer','gas network construction',
    'gas networks engineering management','leakage detection and control',
    'multi utility network construction','network construction operations',
    'utilities network planning and management','utility infrastructure management',
    'stop3', 'glazing','fenestration, installation and surveying',
                'solar thermal photovoltaic panel installation and surveying']
suites2check = [['pension scheme trusteeship', 'pension trustee board secretaryship', 'secretary to the trustees of pension funds', 'trustee and management committee'],
['providing advice on securities or derivatives in the non retail market', 'providing advice on securities or derivatives in the retail market'],
['credit management', 'financing and credit'],
['gas network construction', 'multi utility network construction', 'network construction operations'],
['harbour masters', 'supervision of port operations'],
['directors', 'live events and promotions management', 'production accounting', 'senior producers'],
['expedition leadership and management', 'management and leadership', 'managing in road passenger transport', 'outdoor sector senior roles', 'professional skills for government psg'],
['constructing capital plant steel structures erecting', 'design and draughting', 'fabricating steel structures plating', 'installation, testing and commissioning of electrical systems and equipment plant', 'installing plant and systems mechanical', 'installing plant and systems pipefitting', 'maintaining plant and systems electrical', 'maintaining plant and systems instrument and controls', 'maintaining plant and systems mechanical', 'monitoring engineering construction activities', 'supporting activities in engineering construction', 'welding plate and pipework', 'welding supervision'],
['rail engineering overhead line equipment construction suite 2', 'rail engineering overhead line equipment construction suite 3'],
['rail engineering', 'rail engineering signalling suite 2', 'rail engineering signalling suite 3', 'rail engineering telecoms suite 2', 'rail engineering telecoms suite 3'],
['performing engineering operations suite 1 2006', 'performing engineering operations suite 2 2006'],
['composite engineering suite 2', 'composite engineering suite 3'],
['engineering and manufacture suite 4', 'engineering leadership and management suite 3', 'engineering leadership and management suite 4', 'engineering leadership and management suite 5', 'engineering leadership suite 3', 'engineering technical support suite 2 2007', 'engineering technical support suite 3 2009'],
['heating and ventilating', 'plumbing and domestic heating', 'refrigeration and air conditioning'],
['fenestration, installation and surveying', 'glazing']]
'''
def best_keywords_table(clusters, avg_dist=None, max_avg_dist=None, max_cluster_size=None,
                        min_weight=None, n_keywords=50):
    '''
    Build a table with one column per suite, listing the features with the highest
    tf-idf weight for that suite (best first). The suites of a cluster are placed
    next to each other and each cluster is followed by a separator column
    ('stop0', 'stop1', ...) filled with '-'.

    Relies on the notebook-level standard_labels_select, tfidfm_select and
    feature_names_select.

    clusters: dict mapping a cluster key to the list of suite names in the cluster.
              Clusters with a single suite are always skipped.
    avg_dist: DataFrame indexed by cluster key holding the average within-cluster
              distance; only needed when max_avg_dist is given.
    max_avg_dist: if given, skip the clusters whose average distance is above it.
    max_cluster_size: if given, skip the clusters with more suites than this.
    min_weight: if given, only keep the keywords whose weight is above it.
    n_keywords: maximum number of keywords per suite.

    Returns a DataFrame, or None when no cluster passes the filters.
    '''
    columns = []
    n_used = 0
    for key in clusters:
        if (max_avg_dist is not None) and (avg_dist.loc[key].values>max_avg_dist):
            continue
        suites2check = clusters[key]
        if len(suites2check)==1:
            continue
        if (max_cluster_size is not None) and (len(suites2check)>max_cluster_size):
            continue
        for suite0 in suites2check:
            print(suite0)
            if suite0[:4] == 'stop':
                # separator entries used by hand-written lists of suites
                tmp_list = ['-']*n_keywords
            else:
                s_idx = standard_labels_select.index(suite0)
                # column vector with the tf-idf weights of this suite
                TF = tfidfm_select[s_idx,:].T.todense()
                # feature indices sorted by decreasing weight, one index per row
                sort_ix = np.array(np.argsort(TF.T).ravel()).T
                sort_ix = sort_ix[::-1]
                tmp_list = [feature_names_select[ix[0]] for ix in sort_ix[:n_keywords]
                            if (min_weight is None) or (TF[ix[0],0]>min_weight)]
            columns.append(pd.Series(tmp_list, name = suite0, dtype = object))
            print('*'*70)
        # separator column between clusters
        columns.append(pd.Series(['-']*n_keywords, name = 'stop{}'.format(n_used)))
        n_used += 1
    if not columns:
        return None
    # Concatenating the columns aligns them on the row number and pads the shorter
    # ones with NaN. Chaining left joins would instead cut every column down to the
    # length of the first one whenever that one has fewer keywords.
    return pd.concat(columns, axis = 1)


# NOTE(review): absolute, machine-specific folder. It looks like the same folder as
# output_dir + '/suitesnos_clusters_nv_final_no_dropped' used in other cells -- confirm
# and switch to output_dir if so.
keywords_dir = ''.join(['/Users/stefgarasto/Google Drive/Documents/results/NOS/',
                        'nlp_analysis/suitesnos_clusters_nv_final_no_dropped/'])

# best keywords for all the clusters with more than one suite
tmp_df = best_keywords_table(hierarchical_dict)
if tmp_df is None:
    print('No cluster with more than one suite: nothing to save')
else:
    tmp_df.to_csv(keywords_dir + 'tmp_best_keywords_{}_{}.csv'.format(
                        params['tfidf_max'], params['ngrams']))


# do it again just for a relevant subset: tight clusters (average distance up to 0.65)
# with at most 15 suites, only keeping keywords with a weight above 0.0001
tmp_df = best_keywords_table(hierarchical_dict, avg_dist = D, max_avg_dist = 0.65,
                             max_cluster_size = 15, min_weight = .0001)
if tmp_df is None:
    print('No cluster in the relevant subset: nothing to save')
else:
    tmp_df.to_csv(keywords_dir + 'tmp_best_keywords_{}_{}_subset.csv'.format(
                    params['tfidf_max'], params['ngrams']))

In [None]:
# For every pair of super-suites, compute the cosine distances (a dissimilarity, not
# a similarity) between the tf-idf rows of their suites and store
# (mean, median, standard deviation), each rounded to two decimals.
avg_super_dissimilarity = {}
tfidf_super_select = {}
# dense version of each sub-matrix, computed once instead of once per pair
dense_super_select = {}
for which_super_suite in super_suites_names:
    tfidf_super_select[which_super_suite] = tfidfm_select[all_matches_select[which_super_suite],:]
    dense_super_select[which_super_suite] = np.asarray(
                                    tfidf_super_select[which_super_suite].todense())

for which_super_suite in super_suites_names:
    avg_super_dissimilarity[which_super_suite] = {}
    for which_super_suite2 in super_suites_names:
        A = distance.cdist(dense_super_select[which_super_suite],
                           dense_super_select[which_super_suite2], metric = 'cosine')
        N = A.shape
        if which_super_suite == which_super_suite2:
            # within a super-suite the matrix is symmetric with a zero diagonal:
            # only use each pair of distinct suites once
            B = A[np.triu_indices(N[0],1)]
        else:
            # across super-suites every entry is a different pair
            B = A
        avg_super_dissimilarity[which_super_suite][which_super_suite2] = (np.around(np.mean(B),2),
                                                                           np.around(np.median(B),2),
                                                                           np.around(np.std(B),2))
print('Done')


In [None]:
# show the (mean, median, std) cosine distances between super-suites, first as a raw
# dict and then as a table (the outer keys become the columns)
print(avg_super_dissimilarity)
pd.DataFrame.from_dict(avg_super_dissimilarity)


In [None]:
# Check that the cosine distances I computed are correct: recompute the cosine
# similarity by hand for every pair of suites in one super-suite, and print details
# for the pairs that are very similar.
s1='Construction'
supsuite_tfidf = tfidfm_select[all_matches_select[s1],:].todense()
print(supsuite_tfidf.shape)
n_suites = supsuite_tfidf.shape[0]
# only the upper triangular part is filled in, the rest stays NaN
supsuite_simil = np.full((n_suites, n_suites), np.nan)
supsuite_simil_l = []
for ia in range(n_suites):
    # tf-idf weights of suite 1 as a column array
    vector1 = np.asarray(supsuite_tfidf[ia].T)
    # norm of vector 1 (it does not depend on the second suite)
    den1 = np.sqrt((vector1*vector1).sum())
    # only compute values for upper triangular part of the matrix
    for ia2 in range(ia+1,n_suites):
        vector2 = np.asarray(supsuite_tfidf[ia2].T)
        # norm of vector 2
        den2 = np.sqrt((vector2*vector2).sum())
        # scalar product
        num = (vector1*vector2).sum()
        similarity = num/den1/den2
        supsuite_simil[ia,ia2] = similarity
        supsuite_simil_l.append(similarity)
        # Threshold on the normalised similarity (the value that is reported), not on
        # the raw scalar product: the two only coincide for unit-norm vectors.
        if similarity>.88:
            print('Correlation between vectors: ',np.corrcoef(vector1.T,vector2.T)[0,1])
            plt.plot(vector1,vector2,'o')
            print('Similarity between vectors',supsuite_simil_l[-1])
            print('Suite2: ',standard_labels_select[all_matches_select[s1][ia2]])
            print('Suite 2 keywords: ',[t for ix,t in enumerate(feature_names_select) 
                                        if vector2[ix]>.08])
            print('Suite 1: ',standard_labels_select[all_matches_select[s1][ia]])
            print('Suite 1 keywords: ',[t for ix,t in enumerate(feature_names_select) 
                                        if vector1[ix]>.08])
plt.matshow(supsuite_simil)
plt.title('similarities')
plt.colorbar()
print('Average dissimilarity for', s1, 1-np.mean(supsuite_simil_l))
    

### Hierarchical clustering of NOS within clusters of suites
For selected clusters of suites, perform hierarchical clustering of the NOS they contain.

In [None]:
# add the SOC code of each NOS, cleaned up as a string (SOC4str) and as a float (SOC4,
# NaN when missing or not numeric)
# assign() returns a new DataFrame instead of writing into df_nos_select, which was
# obtained by filtering df_nos: this avoids pandas' SettingWithCopyWarning
df_nos_select = df_nos_select.assign(
    SOC4str = df_nos_select['Clean SOC Code'].map(adjustsoccode))
df_nos_select = df_nos_select.assign(
    SOC4 = df_nos_select['SOC4str'].map(extract4digits))


In [None]:
def select_subdf(SELECT_MODE, clusters2use, suites_clusters,df_nos_select):
    '''
    Select the NOS that belong to one super-suite or to one cluster of suites.

    SELECT_MODE: either a string with the lower-case name of a super-suite
        ('engineering', 'management', 'financialservices', 'construction'), or
        an integer position in clusters2use.
    clusters2use: list of tuples; element 0 of a tuple is the list of values of
        the 'hierarchical' column to select, element 1 is the name of the cluster.
        Only used when SELECT_MODE is an integer.
    suites_clusters: DataFrame with columns 'hierarchical' and 'Suite_names'.
        Only used when SELECT_MODE is an integer.
    df_nos_select: DataFrame of NOS with columns 'supersuite', 'One_suite',
        'SOC4' and 'NOS Title'.

    Returns (final_nos, final_groups, larger_suites, cluster_name,
    cluster_name_save, cluster_name_figs):
        final_nos: the selected NOS; those without a SOC code are dropped unless
            they are 90% or more of the selection
        final_groups: final_nos grouped by 'One_suite'
        larger_suites: all the suites when selecting a cluster; when selecting a
            super-suite, the suites with strictly more NOS than the 16th largest
            suite (all the suites if there are fewer than 16)
        cluster_name, cluster_name_save, cluster_name_figs: name of the selection
            as is, for file names, and for figure titles

    Raises TypeError if SELECT_MODE is neither a string nor an integer, and
    KeyError if it is a string that is not one of the known super-suites.
    '''
    if isinstance(SELECT_MODE, str):
        tmp_dict = {'engineering': 'Engineering', 'management': 'Management',
                    'financialservices': 'Financial services', 
                    'construction': 'Construction'}
        # select NOS from super suite
        cluster_name = SELECT_MODE
        cluster_name_save = cluster_name
        cluster_name_figs = tmp_dict[SELECT_MODE]
        subset_nos = df_nos_select[df_nos_select['supersuite']== SELECT_MODE]
    elif isinstance(SELECT_MODE, int):
        # select NOS from the suites in the chosen cluster(s)
        cluster_name = clusters2use[SELECT_MODE][1]
        cluster_name_save = cluster_name.replace(' ','_')
        cluster_name_figs = cluster_name.capitalize()
        in_clusters = suites_clusters['hierarchical'].isin(clusters2use[SELECT_MODE][0])
        suites2use = list(suites_clusters[in_clusters]['Suite_names'].values)
        subset_nos = df_nos_select[df_nos_select['One_suite'].isin(suites2use)]
    else:
        raise TypeError('SELECT_MODE must be a super-suite name (str) or a position '
                        'in clusters2use (int), got {}'.format(type(SELECT_MODE)))
    print('Number of NOS selected: ', len(subset_nos))

    # only keep the NOS with SOC codes, unless (almost) none of them has one
    nosoc = subset_nos['SOC4'].isnull()
    nosoc_fraction = nosoc.sum()/len(nosoc)
    print('percentage of nos without SOC codes: ', nosoc_fraction)
    if nosoc_fraction<0.9:
        final_nos = subset_nos[~nosoc]
    else:
        final_nos = subset_nos
    final_groups = final_nos.groupby(by = 'One_suite')
    # number of NOS in each suite, indexed by suite name (in sorted order)
    suite_sizes = final_groups.size()
    all_lengths = np.sort(suite_sizes.values)[::-1]
    print('Number of NOS in suites belonging to this cluster: ',all_lengths)
    if isinstance(SELECT_MODE, int) or len(all_lengths)<=15:
        # clusters keep all their suites; so do super-suites with fewer than 16
        # suites, for which there is no 16th largest suite to compare against
        larger_suites = list(suite_sizes.index)
    else:
        larger_suites = list(suite_sizes.index[suite_sizes.values > all_lengths[15]])

    return final_nos, final_groups, larger_suites, cluster_name,  \
                    cluster_name_save, cluster_name_figs


In [None]:
# parameters for the tf-idf computed on single NOS (change them here if needed)
paramsn = {
    'ngrams': 'uni',
    'pofs': 'nv',
    'tfidf_min': 3,
    'tfidf_max': 0.4,
    'bywhich': 'docs',   # alternative: 'suites'
    'mode': 'tfidf',     # alternatives: 'meantfidf', 'combinedtfidf'
}

# build the tf-idf transform
tfidf_n = define_tfidf(paramsn, stopwords0)

# fit the transform on the entire NOS corpus (from supersuites only); only the feature
# names and the fitted transform are kept, the matrix itself is recomputed later on
# the NOS of each cluster
_, feature_names_n, tfidf_n, _ = get_tfidf_matrix(
    paramsn, df_nos_select, tfidf_n, col= 'pruned_lemmas')

In [None]:
# load a previously saved assignment of suites to clusters; select_subdf reads its
# 'hierarchical' and 'Suite_names' columns
# NOTE(review): absolute, machine-specific path
suites_clusters = pd.read_csv(''.join(['/Users/stefgarasto/Google Drive/Documents/results',
            '/NOS/nlp_analysis/all_supersuites_hierarchical_results_postjoining_final',
                                       '_no_dropped_suites_combinedtfidf_uni.csv']))

# clusters of suites to analyse; each tuple is
# (values of the 'hierarchical' column to select, name of the cluster,
#  number of NOS clusters to aim for when cutting the dendrogram)
clusters2use = [([17], 'port operations',3),
                ([4], 'pension schemes',3),
                ([54],'rail engineering',18),
                ([51], 'temperature control engineering',5),
                ([18], 'food and hospitality',39), # note that 39 is a local max in the knee
                ([21], 'entertainment industry',29),
                ([31], 'social care',22),
                ([46], 'network engineering (utilities)',29)]

# note: these clusters are fairly robust under changes of parameters for the TFIDF - that is,
# they are roughly the same when using bi-grams, uni-grams with max=0.4 and uni-grams with 
# max = 0.6

# figures and labels produced by the loop below are only saved when this is True
SAVEHC = False
# so far, groups 0 and 3 don't work very well, 4 so and so, 5 needs 0.8, 
# 1,2 are decent with 0.4 (0.5?), 7 can also do with 0.8,
# note that 3 and 6 have most NOS without SOC codes
# For each selected cluster of suites: select its NOS, compute their tf-idf with the
# transform fitted on the whole corpus, cluster them hierarchically, and cut the
# dendrogram so that it gives (at most) the number of clusters set in clusters2use.
for SELECT_MODE in range(2,3):#8):
    df_nos_n, final_groups, larger_suites, cluster_name, cluster_name_save, \
                cluster_name_figs = select_subdf(SELECT_MODE, clusters2use, 
                                                 suites_clusters,df_nos_select)
    print('Computing clusters for {}'.format(cluster_name_figs))

    # remove legacy nos
    print('nb with legacy nos: ',len(df_nos_n))
    df_nos_n = df_nos_n[df_nos_n['NOS Title'].map(lambda x: 'legacy' not in x)]
    print('nb without legacy nos 1: ',len(df_nos_n))
    df_nos_n = df_nos_n[df_nos_n.index.map(lambda x: not x[-5:]=='l.pdf')]
    print('nb without legacy nos 2: ',len(df_nos_n))
    # NOS titles in each suite of this cluster (used to colour the dendrogram labels)
    suites_in_clus = {}
    groups_clus = df_nos_n.groupby('One_suite')
    for name, group in groups_clus:
        suites_in_clus[name] = list(group['NOS Title'].values)

    # this is to get the restricted corpus (to transform, not for fitting)
    textfortoken = df_nos_n['pruned_lemmas']
    tfidfm_n = tfidf_n.transform(textfortoken)

    # get labels
    if paramsn['bywhich'] == 'suites':
        standard_labels_n = list(df_nos_n.groupby('One_suite').groups.keys())
    else:
        standard_labels_n = list(df_nos_n['NOS Title'].values)

    for ix,t in enumerate(standard_labels_n):
        if len(t)>500:
            # manual correction because of pdf extraction
            standard_labels_n[ix] = standard_labels_n[ix][:50]

    # check best features in one NOS. The title only exists in the rail engineering
    # cluster, so the check is skipped (instead of raising) for the other clusters
    title2check = 'lift and move permanent way materials, components and equipment'
    if title2check in standard_labels_n:
        s_idx = standard_labels_n.index(title2check)
        TF= tfidfm_n[s_idx,:].T.todense()
        print(standard_labels_n[s_idx])
        ix = np.argsort(TF, axis = 0)
        # the 20 features with the largest weight, best first
        for i in ix[-20:][::-1]:
            i = np.array(i)
            print(feature_names_n[i[0][0]],np.around(TF[i[0][0]][0,0],3))
        print()

    # perform hierarchical clustering
    distances_n, linkage_matrix_n, c_n, _ = do_hierarch_clustering(tfidfm_n, DOPLOTS= False)


    # Plotting the distance between successive clusters: is there a knee?
    z = linkage_matrix_n[::-1,2]
    fig = plt.figure(figsize = (6,6))
    plt.plot(range(1, len(z)+1), z)
    # second difference of the merge distances: its peaks are candidate knees
    knee = np.diff(z, 2)
    plt.plot(range(2, len(linkage_matrix_n)), knee)
    plt.xlabel('partition')
    plt.ylabel('cluster distance')
    plt.title(cluster_name_figs)
    # for every split point, fit one line before and one after it and add up the two
    # R^2: the best split point is where two lines describe the curve best
    goodness = []
    for i in range(3,len(z)-2):
        a1 = scipy.stats.linregress(range(1,i+1), z[:i])
        a2 = scipy.stats.linregress(range(i, len(z)), z[i:])
        goodness.append(np.around(a1[2]**2 + a2[2]**2,4))
    plt.figure(figsize = (6,6))
    plt.plot(np.arange(3,len(z)-2),goodness)
    plt.title(cluster_name_figs)
    if goodness:
        ixg = np.array(goodness).argmax()+3
        print('best t-point: ',ixg)
    else:
        # too few merges to fit two lines
        print('best t-point: not enough NOS to compute it')

    num_ideal = np.ceil(len(df_nos_n)/10)
    print('The ideal number of clusters would be: ',num_ideal)
    # the two largest peaks of the knee curve
    num_clust1 = knee.argmax() + 2
    knee[knee.argmax()] = 0
    num_clust2 = knee.argmax() + 2

    # the number of clusters is set by hand in clusters2use, the peaks of the knee
    # curve and num_ideal are only printed for reference
    num_clust = clusters2use[SELECT_MODE][2]
    print('The two peaks are, in order: ',num_clust1, num_clust2)
    print('The selected num clust is ',num_clust)

    # smallest threshold (in steps of 0.05) that gives at most num_clust clusters
    cutting_th_n = None
    for t in np.arange(0,1,0.05):
        labels_n = fcluster(linkage_matrix_n, t, criterion='distance')
        n_clust = len(set(labels_n))
        if n_clust <= num_clust:
            cutting_th_n = t
            break
    if cutting_th_n is None:
        # no threshold below 1 is large enough: use the largest one tried, rather
        # than silently reusing the threshold of a previous cluster (or failing)
        cutting_th_n = t
        print('WARNING: no threshold below 1 gives {} clusters or fewer ({} found)'.format(
            num_clust, n_clust))
    print('cutting threshold: {}'.format(cutting_th_n))       
    
    #Plot the dendrogram (cutting at threshold)
    h = .5*len(df_nos_n)
    fig, ax = plt.subplots(figsize=(28, h)) # set size
    ax = dendrogram(linkage_matrix_n, 
                    labels = [t.capitalize() for t in standard_labels_n], 
                    orientation = 'right', 
                    leaf_font_size=6,
                   color_threshold = cutting_th_n,
                   truncate_mode = 'level', p =30)

    plt.tick_params(axis= 'y',
                    labelsize = 24)
    plt.title('Hierarchical clustering for {}'.format(cluster_name_figs), fontsize = 30)
    plt.xlabel('Distance', fontsize = 30)
    plt.ylabel('NOS title',fontsize = 30)
    
    # one colour per suite, both in the legend and for the tick labels; colours are
    # reused cyclically if there are more suites than colours
    n_colours = len(nesta_colours)
    s_patch = []
    for ix, which_suite in enumerate(suites_in_clus):
        s_patch.append(mpatches.Patch(color= nesta_colours[ix % n_colours],label= 
                                      which_suite.capitalize()))
    plt.legend(handles = s_patch, bbox_to_anchor=(1.01, .81), loc=2,
           ncol=1, borderaxespad=0., fontsize = 24)
    T = plt.yticks()
    for t in T[1]:
        for ix, which_suite in enumerate(suites_in_clus):
            # labels were capitalised for plotting: lower-case them again for the lookup
            if t.get_text().lower() in suites_in_clus[which_suite]:
                t.set_color(nesta_colours[ix % n_colours])
                break
    plt.tight_layout()
    # file names are tagged with the parameters used in this cell (paramsn)
    if SAVEHC:
        plt.savefig(os.path.join(output_dir, 
            'suitesnos_clusters_nv_final_no_dropped/all_nos_cut_dendrogram_in_{}_{}_{}.png'.format(
            cluster_name_save,qualifier,paramsn['ngrams'])), bbox_inches = "tight")   
        
    # now get and save the clusters
    labels_n = fcluster(linkage_matrix_n, cutting_th_n, criterion='distance')
    # copy, so that adding a column does not act on a slice of df_nos_n
    short_df_n = df_nos_n.reset_index()[['index','NOS Title', 'One_suite']].copy()

    short_df_n['hierarchical'] = labels_n
    short_df_n = short_df_n.set_index('index')
    if SAVEHC:
        short_df_n.to_csv(os.path.join(output_dir, 
            'suitesnos_clusters_nv_final_no_dropped/all_nos_cut_labels_in_{}_{}_{}.csv'.format(
            cluster_name_save,qualifier,paramsn['ngrams'])))
    print()


In [None]:
# how many times each NOS title occurs in the last cluster processed (values above 1
# mean that some titles are duplicated)
df_nos_n['NOS Title'].value_counts().values

In [None]:
# alternative cut: ask for a maximum number of clusters (one for every 5 NOS) instead
# of a distance threshold
Mclust = np.floor(len(standard_labels_n)/5)
print(Mclust)
labels_n = fcluster(linkage_matrix_n, Mclust, criterion='maxclust')
print(labels_n)
#labels_n = fcluster(linkage_matrix_n, cutting_th_n, criterion='distance')
#print(labels_n)
# copy, so that adding a column does not act on a slice of df_nos_n
short_df_n = df_nos_n[['NOS Title', 'One_suite']].copy()

short_df_n['hierarchical'] = labels_n
#print(short_df_n)
#print(short_df_n)



## Perform hierarchical clustering only on the suites in one super-suite (e.g. Management)

In [None]:

# super-suite to analyse on its own
which_super_suite = 'Construction'
# match_name: names of the suites in this super-suite (used below to look up groups)
# match: used below to index the rows of tfidfm and the entries of standard_labels
match_name = all_match_names[which_super_suite]
match = all_matches[which_super_suite]


In [None]:
# perform hierarchical clustering on the suites belonging to the super suite alone
distances_super, linkage_matrix_super, c_super, c_dist_super = do_hierarch_clustering(
                                                            tfidfm[match,:], DOPLOTS = False)

#Plot the dendrogram again for this super-suite
# labels of the selected rows, capitalised for display
standard_labels_super = [standard_labels[t].capitalize() for t in match]

fig, ax = plt.subplots(figsize=(10, 20)) # set size
# all links in black (no colour threshold)
# NOTE(review): `ax` is rebound to the return value of dendrogram()
ax = dendrogram(linkage_matrix_super, 
                labels = standard_labels_super, 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = 0,
               truncate_mode = 'level', p =40,
               above_threshold_color = 'k');

plt.tick_params(axis= 'y',
                labelsize = 14)
plt.title('Hierarchical Clustering Dendrogram of NOS (in super-suite {})'.format(
    which_super_suite), fontsize = 18)
plt.xlabel('Distance', fontsize = 18)
plt.ylabel('NOS suites',fontsize = 18)
# the figure is only saved when SAVEHC is True
if SAVEHC:
    plt.savefig(os.path.join(output_dir, 'nos_dendrogram_{}_{}_{}_{}.png'.format(
            which_super_suite, qualifier,params['bywhich'],params['mode'])), 
                bbox_inches = "tight")     
    

In [None]:
# cut the dendrogram of the super-suite at a fixed distance threshold
labels_h = fcluster(linkage_matrix_super, 0.9, criterion='distance')

if params['bywhich']== 'suites':
    short_df = pd.DataFrame(standard_labels_super)
    short_df.columns = ['Suite_names']
else:
    # take an independent copy of the two columns: the bare `.iloc` attribute is
    # an indexer object, not a DataFrame, so a new column could not be added to it
    # NOTE(review): this takes all the rows of df_nos, while labels_h only covers
    # the rows in `match` -- the lengths presumably differ in this branch, confirm
    short_df = df_nos[['NOS Title', 'One_suite']].copy()

short_df['hierarchical'] = labels_h

n_clusters = len(collections.Counter(labels_h))
print(n_clusters,len(labels_h))


# Collect and save data for graph visualisation

In [None]:
def get_one_soc(x):
    '''
    Return the first element when x is a list (NaN for an empty list);
    any other value is returned unchanged.
    '''
    if isinstance(x,list):
        # an empty list is the only way taking the first element can fail
        return x[0] if len(x) else np.nan
    return x
    
def get_one_occupation(x):
    '''
    Split a string of occupations on ';' and return the list of pieces.
    A list is first concatenated into one string (without separator);
    values that are neither a list nor a string are returned unchanged.
    '''
    joined = ''.join(x) if isinstance(x,list) else x
    if not isinstance(joined,str):
        return joined
    return joined.split(';')
    

In [None]:
# collect metadata for all suites in the super-suite
def _most_common(values):
    '''Return the most frequent non-null value of a Series, or NaN if there is none.'''
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan

metadata = {}
nb_of_nos = []
datadriven_keywords = []
expert_keywords = []
top_developed = []
top_originating = []
top_approved_year = []
top_SOC = []
top_occupation = []
# group by suite
groups = df_nos.groupby('One_suite')
# cycle through suites
for suite in match_name:
    group = groups.get_group(suite)
    # how many nos in this suite
    nb_of_nos.append(len(group))
    # suite keywords (identified before); they are looked up with the suite of this
    # iteration, not with a `name` left over from a loop in another cell
    datadriven_keywords.append(top_terms_dict[suite + ' (top features)'])
    # list of actual keywords (ordered by popularity)
    expert_keywords.append(top_terms_dict[suite + ' (keywords)'])
    # most common developing and originating organisation
    top_originating.append(_most_common(group['Originating_organisation']))
    top_developed.append(_most_common(group['Developed By']))
    # most common year approved
    top_approved_year.append(_most_common(group['Date_approved_year']))
    # top SOC code (in order)
    try:
        top_SOC.append(group['Clean SOC Code'].map(get_one_soc).value_counts().index[0])
    except (IndexError, TypeError):
        # no SOC code at all in this suite, or codes that cannot be counted
        top_SOC.append(np.nan)
    # most common occupations
    tmp = []
    for t in group['Occupations'].map(get_one_occupation):
        # missing values are not lists and cannot be concatenated: skip them
        if isinstance(t, list):
            tmp += t
    if tmp:
        most_common_occupation = pd.DataFrame(tmp)[0].value_counts().index[0].strip()
    else:
        most_common_occupation = ''
    # an empty occupation counts as missing
    top_occupation.append(most_common_occupation if most_common_occupation else np.nan)

# now make dictionary
metadata['nb_of_nos'] = nb_of_nos
metadata['datadriven_keywords'] = datadriven_keywords
metadata['expert_keywords'] = expert_keywords
metadata['top_originating_organisation'] = top_originating
metadata['top_developed_by'] = top_developed
metadata['top_approved_year'] = top_approved_year
metadata['top_soc'] = top_SOC
metadata['top_occupation'] = top_occupation
#metadata['suite_name'] = match_name
# change to dataframe: one row per suite, in the order of match_name
df_metadata = pd.DataFrame.from_dict(metadata)
#df_metadata = df_metadata.set_index('suite_name')


In [None]:
# Attach the per-suite metadata to short_df and index the result by suite name.
# NOTE(review): join() aligns on the index and df_metadata carries a default integer
# index in match_name order — confirm short_df uses the same index and row order.
df_management_cluster = df_metadata.join(short_df).set_index('Suite_names')#.drop('Suite_names')
# quick look at the first rows
print(df_management_cluster.head(n=3))
# number of missing values per column (shown as the cell output)
df_management_cluster.isnull().sum()


In [None]:
# Switch: set to True to pickle the management-cluster table together with the
# outputs of the super-suite hierarchical clustering.
SAVEDATAHC = False
if SAVEDATAHC:
    # NOTE(review): open() does not create folders — the 'for_graph_visualisation'
    # sub-folder of output_dir must already exist.
    with open(os.path.join(output_dir, 'for_graph_visualisation/data_from_management_clustering.pickle'), 'wb') as f:
        pickle.dump((df_management_cluster, distances_super, linkage_matrix_super, c_super, c_dist_super), f)
        

### For each suite in the management super-suite, collect the same information as for the whole management super-suite and save it

In [None]:
# Parameters of the document-level tf-idf representation used for the
# per-suite analysis below.
params2 = {}
params2['ngrams'] = 'uni'
params2['pofs'] = 'nv'
params2['tfidf_min'] = 3
params2['tfidf_max'] = 0.4

params2['bywhich'] = 'docs' #'docs' #'suites'
params2['mode'] = 'tfidf' #'tfidf' #'meantfidf' #'combinedtfidf' #'meantfidf'

# get the transform tfidf
tfidf2 = define_tfidf(params2, stopwords0)

# get the matrix again (even though if the parameters stay the same, this one is the same still)
# get the features
tfidfm2, feature_names2 = get_tfidf_matrix(params2, df_nos, tfidf2, col = 'pruned_lemmas')

# get labels.
# The branch must follow params2 (the settings tfidfm2 was built with): testing
# the unrelated `params` dictionary could yield suite labels for a matrix that
# has one row per document.
if params2['bywhich'] == 'suites':
    standard_labels2 = list(df_nos.groupby('One_suite').groups.keys())
else:
    standard_labels2 = list(df_nos['NOS Title'].values)

In [None]:
# sanity check: dimensions of the tf-idf matrix built with params2
tfidfm2.shape

In [None]:
# For every sufficiently large suite of the management cluster: build a metadata
# table for its NOS, cluster its tf-idf vectors and (optionally) pickle the result.
groups = df_nos.groupby('One_suite')
# columns copied as they are into the per-suite metadata table
metadata_cols = ['NOS Title', 'URN',
       'Originating_organisation','Date_approved_year',
       'Clean Ind Review Year', 'Version_number', 'Developed By', 'Validity',
       'Keywords', 'Clean SOC Code', 'NOS Document Status', 'NOSCategory', 'Occupations',
       'One_suite']
# text fields whose presence is recorded for each NOS
fields_to_check = ['Overview',
       'Knowledge_and_understanding', 'Performance_criteria', 'Scope_range',
       'Glossary', 'Behaviours', 'Skills', 'Values', 'External_Links' ,
       'Links_to_other_NOS']
SAVEDATAHC = False
COLLECTDATAHC = False


def _field_is_missing(value):
    '''
    Return True when a cell holds a missing value (None or a floating point NaN).
    NaN never compares equal to anything, itself included, so `value == np.nan`
    cannot be used for this check.
    '''
    return value is None or (isinstance(value, (float, np.floating)) and np.isnan(value))


t0 = time.time()
if COLLECTDATAHC:
    for suite in df_management_cluster.index[::-1]:
        print(suite)
        idx = (df_nos['One_suite'] == suite).values
        # only suites with at least 20 NOS are processed
        if idx.sum()>19:
            tfidfm_red = tfidfm2[idx]
            group = groups.get_group(suite)
            # work on a copy so that the new column is not written into a slice of df_nos
            df_metadata_nos = group[metadata_cols].copy()
            # one list per NOS with the names of the fields that are filled in
            fields_present = [
                [field for field in fields_to_check if not _field_is_missing(row[field])]
                for _, row in group[fields_to_check].iterrows()
            ]
            df_metadata_nos['fields present'] = pd.Series(fields_present,
                                                          index = df_metadata_nos.index)
            # now get the rest, that is the cosine distances
            distances_suite, linkage_matrix_suite, c_suite, c_dist_suite = do_hierarch_clustering(
                                                                    tfidfm_red, DOPLOTS = False)
            if SAVEDATAHC:
                with open(os.path.join(output_dir,
                        'for_graph_visualisation/data_from_{}_suite_clustering.pickle'.format(suite)),
                          'wb') as f:
                    pickle.dump((df_metadata_nos, distances_suite, linkage_matrix_suite, c_suite, c_dist_suite), f)
print(time.time() - t0)

In [None]:
# do the clustering for the supply chain management suite
suite = 'supply chain management'
groups = df_nos.groupby('One_suite')
# boolean mask over the rows of df_nos that belong to this suite
idx = (df_nos['One_suite'] == suite).values
# keep the matching rows of the document-level tf-idf matrix
# (assumes the rows of tfidfm2 follow the row order of df_nos — TODO confirm)
tfidfm_red = tfidfm2[idx]
group = groups.get_group(suite)
# NOS titles, capitalised, used as leaf labels of the dendrogram
nos_names = group['NOS Title'].values
nos_names = [t.capitalize() for t in nos_names]
# now get the rest, that is the cosine distances
distances_scm, linkage_matrix_scm, c_scm, c_dist_scm = do_hierarch_clustering(tfidfm_red, 
                                                                              DOPLOTS = False)

#Plot the dendrogram again for this suite
fig, ax = plt.subplots(figsize=(10, 50)) # set size
# NOTE(review): this rebinds `ax` from the Axes created above to the value returned by dendrogram()
ax = dendrogram(linkage_matrix_scm, 
                labels = nos_names, 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = 0,
               truncate_mode = 'level', p =40,
               above_threshold_color = 'k');

plt.tick_params(axis= 'y',
                labelsize = 14)
plt.title('Hierarchical Clustering Dendrogram of NOS (in the {} suite)'.format(suite), fontsize = 18)
plt.xlabel('Distance', fontsize = 18)
plt.ylabel('NOS titles',fontsize = 18)
# NOTE(review): `or True` makes this branch run whatever the value of SAVEHC.
# NOTE(review): the file name is built from `params`, although tfidfm2 was computed
# with `params2` — confirm which settings should appear in the name.
if SAVEHC or True:
    plt.savefig(os.path.join(output_dir, 'nos_dendrogram_{}_{}_{}_{}_new.pdf'.format(suite.replace(' ','_'),
                                            qualifier,params['bywhich'],params['mode'])), bbox_inches = "tight")    

### Ad hoc requests

In [None]:
# shape of the squareform conversion of the supply-chain-management distances
print(distance.squareform(distances_scm).shape)
# per-suite grouping of df_nos, used by the look-ups in the cells below
groups = df_nos.groupby('One_suite')

In [None]:
# inspect the NOS of the 'welding supervision' suite
group = groups.get_group('welding supervision')
# first 20 tagged tokens of the row labelled 'eciws01.pdf'
print(group['tagged_tokens'].loc['eciws01.pdf'][:20])
# display the whole group as the cell output
group

In [None]:
# inspect the NOS of the 'engineering leadership and management suite 5' suite
group = groups.get_group('engineering leadership and management suite 5')
group

In [None]:
# inspect the NOS of the 'engineering leadership and management suite 3' suite
group = groups.get_group('engineering leadership and management suite 3')
group

## K-means

Hierarchical clustering is very informative, but it does not scale well, since it requires calculating all pairwise distances. So if we had many more standards than the 400+ apprenticeship standards, we would have to explore other options.
One solution would be to pre-cluster the standards into a large number of smaller clusters using a faster method, such as k-means, and then do the hierarchical clustering.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

In [None]:
# Switch: set to True to write the k-means results (csv) and the dendrogram of the
# cluster representatives (svg) to output_dir.
SAVEKM = False

In [None]:
# Parameters of the tf-idf representation used in the k-means section.
# Note that this rebinds `params`, `tfidf`, `tfidfm`, `feature_names` and `standard_labels`.
params = {}
params['ngrams'] = 'uni'
params['pofs'] = 'nv'
params['tfidf_min'] = 3
params['tfidf_max'] = 0.4

params['bywhich'] = 'suites' #'docs' #'suites'
# NOTE(review): 'combinetfidf' is not one of the options listed in the trailing
# comment ('combinedtfidf' is spelt with a "d") — confirm which spelling
# get_tfidf_matrix expects.
params['mode'] = 'combinetfidf' #'tfidf' #'meantfidf' #'combinedtfidf' #'meantfidf'

# get the transform tfidf
tfidf = define_tfidf(params, stopwords0)

# get the matrix again (even though if the parameters stay the same, this one is the same still)
# get the features
tfidfm, feature_names = get_tfidf_matrix(params, df_nos, tfidf, col = 'pruned_lemmas')

# get labels
if params['bywhich'] == 'suites':
    # one label per suite
    standard_labels = list(df_nos.groupby('One_suite').groups.keys())
else:
    # one label per NOS
    standard_labels = list(df_nos['NOS Title'].values)

In [None]:
def get_distance_k(df_row):
    '''
    Distance between one row of features and the centroid of its k-means cluster.

    df_row is expected to hold the feature values followed, in the last
    position, by the cluster label stored under 'k_cluster'. The centroid is
    looked up in the global `centroids` array and the distance is computed
    with pairwise_distances using its default metric.
    '''
    # every value except the last one (the cluster label) is a feature
    features = df_row.values[:-1].reshape(1, -1)
    cluster_centre = centroids[int(df_row['k_cluster'])].reshape(1, -1)
    # pairwise_distances returns a 1x1 matrix here: unwrap the single value
    return pairwise_distances(features, cluster_centre)[0][0]

In [None]:
# takes 5 seconds for N = 400, 20 seconds for N = 1000, 50 seconds for N = 2000, 110 seconds for 4000
# NOTE(review): N is assigned here but not used anywhere in this cell.
N = 400
t0 = time.time()
# use approx the number of super suites SDS gave you
k = 40 
#use the number of clusters detected by the hierarchical algorithm above
#k = n_clusters 
# seeded so that repeated runs start from the same random state
km = KMeans(n_clusters = k, random_state = 111)
# fit on the dense version of the tf-idf matrix
km.fit(tfidfm.toarray())
# cluster label of every row of tfidfm, as a plain Python list
k_clusters = km.labels_.tolist()
print_elapsed(t0, task = 'kmean clustering')

In [None]:
# cluster centres of the fitted k-means model; read as a global by get_distance_k
centroids = km.cluster_centers_

In [None]:
# store the k-means label of each row
# (assumes short_df has one row per row of tfidfm, in the same order — TODO confirm)
short_df['k_means'] = k_clusters

In [None]:
# dense copy of the tf-idf matrix as a DataFrame (default integer index and column names)
tfidfm_df = pd.DataFrame(tfidfm.toarray())

In [None]:
# append the cluster label as the LAST column: get_distance_k relies on this position
tfidfm_df['k_cluster'] = k_clusters

In [None]:
# distance of every row to the centroid of its own cluster.
# NOTE(review): the assignment aligns on the index and tfidfm_df has a default
# integer index — confirm short_df is indexed the same way.
short_df['k_distance'] = tfidfm_df.apply(get_distance_k, axis =1)

In [None]:
# collect the centroids (that is, the suite closest to the centroid) and print the result of the clustering
kmeans_dict = {}    # cluster label (as a string) -> names of the suites in that cluster
most_central = []   # one single-row DataFrame per cluster: the member closest to the centroid
for igroup, (name, group) in enumerate(short_df.groupby('k_means')):
    kmeans_dict['{}'.format(name)] = group['Suite_names'].values
    # sort once and reuse the result for both the preview and the selection
    closest_first = group.sort_values(by = 'k_distance')
    # only preview the first 10 clusters to keep the output short
    if igroup < 10:
        print(name, closest_first.head(3))
    most_central.append(closest_first.head(1))
if SAVEKM:
    # The file name is built from the `params` dictionary used throughout this
    # section; the bare names `bywhich` and `mode` are not defined by these cells.
    pd.DataFrame.from_dict(kmeans_dict, orient = 'index').to_csv(
        os.path.join(output_dir, 'Kmeans_results_{}_{}_{}.csv'.format(qualifier,
                                                                     params['bywhich'],
                                                                     params['mode'])))

#### HIERARCHICAL 2

We can now select representatives from each small cluster and perform hierarchical clustering again

In [None]:
# stack the single-row frames (one representative per k-means cluster) into one DataFrame
select_df = pd.concat(most_central)

In [None]:
# number of distinct values of the 'hierarchical' column among the representatives
# (presumably the labels from the earlier hierarchical clustering — verify)
print(len(set(select_df['hierarchical'])))
# print the most representative suite for each cluster
select_df

In [None]:
# find the indices of the central suites/docs
# (index labels of select_df are converted into positions within short_df, so that
# they can be used to pick rows of the tf-idf matrix)
select_ix = np.array([short_df.index.get_loc(ix) for ix in select_df.index])

In [None]:
# dense tf-idf vectors of the selected representatives only
select_standards = tfidfm.toarray()[select_ix, :]

In [None]:
# sanity check: dimensions of the matrix of selected representatives
select_standards.shape

In [None]:
#We calculate cosine distance between tf-idf vectors of the documents/suites
# do_hierarch_clustering is unpacked into four values at its other call sites in
# this notebook (distances, linkage matrix, c, c_dist); unpacking only three
# raises a ValueError, so the fourth value is captured here as well.
distances_s, linkage_matrix_s, c_s, c_dist_s = do_hierarch_clustering(select_standards,
                                                                      DOPLOTS = False)

# Previous inline implementation, kept for reference only:
#
# distances_s = distance.pdist(select_standards, metric = 'cosine')
#
# #We then build linkage matrix using the distances and specifying the method. For euclidean distances typically 'Ward'
# #produces best results. For cosine we can only use 'average' and 'single'.
# linkage_matrix_s = scipy.cluster.hierarchy.linkage(distances_s,
#                                                  method = 'average',
#                                                  metric = 'cosine')
# #We can test how well the groupings reflect actual distances. If c > 0.75 this is considered to be sufficiently
# #good representation
# c, coph_dists = cophenet(linkage_matrix_s,
#                          distance.pdist(select_standards, metric = 'cosine'))

In [None]:
# Dendrogram of the representatives selected from the k-means clusters.
# Leaf labels: suite names or NOS titles, depending on what was clustered.
if params['bywhich'] == 'suites':
    standard_labels = list(select_df['Suite_names'].values)
else:
    standard_labels = list(select_df['NOS Title'].values)

fig, ax = plt.subplots(figsize=(10, 15)) # set size
# note: `ax` is rebound to the value returned by dendrogram()
ax = dendrogram(linkage_matrix_s, 
                labels = standard_labels, 
                orientation = 'right', 
                leaf_font_size=6,
               color_threshold = 0.8);

plt.tick_params(axis= 'y',
                labelsize = 12)
plt.title('Hierarchical Clustering Dendrogram of Pre-clustered Standards', fontsize = 12)
plt.xlabel('Distance', fontsize = 12)
if params['bywhich'] == 'suites':
    plt.ylabel('NOS suites',fontsize = 12)
else:
    plt.ylabel('NOS titles',fontsize = 12)
if SAVEKM:
    # The file name is built from the `params` dictionary used throughout this
    # section; the bare names `bywhich` and `mode` are not defined by these cells.
    plt.savefig(os.path.join(output_dir, 'nos_dendrogram_centroids_{}_{}_{}.svg'.format(qualifier,
                                                                                      params['bywhich'],
                                                                                      params['mode'])),
            bbox_inches = "tight") 

Everything below is just backup: it contains code that is now being replaced, but the replacement has not been fully tested yet.

In [None]:
# Backup code. NOTE(review): running these statements overwrites the `tfidfm` and
# `feature_names` computed earlier with params.
'''
The next three lines compute the tfidf matrix for the keywords extraction
'''

# fit the vectoriser on the 'pruned' column and transform it in one step
textfortoken= df_nos['pruned']
tfidfm = tfidf.fit_transform(textfortoken)
# NOTE(review): if `tfidf` is a scikit-learn vectoriser, recent releases replaced
# get_feature_names() with get_feature_names_out() — confirm against the installed version.
feature_names = tfidf.get_feature_names()

'''
The code below was used to compute the tfidf matrix to feed to the clustering algorithm. However, I want to uniform 
the way the matrix is computed for both the keywords extraction and the clustering
'''


def get_tfidfm(bywhich, mode, df_nos)
    t0 = time.time()
    if bywhich == 'suites':
        if mode == 'meantfidf':
            # this is the case where I group by suite and use the average tfidf vectore as the features to cluster
            row_names = df_nos['One_suite'].value_counts().index.values
            tfidfm = scipy.sparse.lil_matrix(np.zeros((len(row_names),len(feature_names)), dtype = np.float32))
            igroup = 0
            for name, group in df_nos.groupby('One_suite'):
                tmp = get_mean_tfidf(group['pruned'], tfidf)
                tfidfm[igroup] = tmp
                igroup += 1
        elif mode == 'combinedtfidf':
            # this is the case where I group by suite, concatenate all tokens and compute the tfidf vectors
            row_names = df_nos['One_suite'].value_counts().index.values
            tfidfm = scipy.sparse.lil_matrix(np.zeros((len(row_names),len(feature_names)), dtype = np.float32))
            igroup = 0
            for name, group in df_nos.groupby('One_suite'):
                joint_tokens = []
                for idoc in group['pruned'].index:
                    joint_tokens += group['pruned'].loc[idoc]
                tmp = tfidf.transform([joint_tokens])
                tfidfm[igroup] = tmp
                igroup += 1
        # TODO: top word embedding modes + t
    elif bywhich == 'docs':
        if mode == 'tfidf':
            # this is where I keep everything the same
            tfidfm = tfidf.fit_transform(df_nos['pruned'])
    print_elapsed(t0, 'computing the feature vector')
    tfidfm