In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


import os
import itertools
import json
import numpy as np
import pandas as pd
import pickle
import requests
import seaborn as sns
import collections
from collections import Counter
import scipy
import time

import matplotlib as mpl
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.pca import PCA


import nltk
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
import gensim
import re
from fuzzywuzzy import process


In [None]:
from utils_nos import nesta_colours, nesta_colours_combos
print(nesta_colours, nesta_colours_combos)

In [None]:
# set up plot style
# Print the style names known to the installed matplotlib, then activate three of them.
# NOTE(review): recent matplotlib releases renamed the bundled 'seaborn-*' styles
# (e.g. to 'seaborn-v0_8-darkgrid') — confirm these names exist in the installed version.
print(plt.style.available)
plt.style.use(['seaborn-darkgrid','seaborn-poster','ggplot'])

#### TODOs for data cleaning:

1. remove square brackets
2. make everything lower case


## Overview

This notebook contains a few functions and snippets of code that are useful for analysing text. Most of the techniques used are unsupervised. Functions are defined up front and then used in sections below.

This notebook applies:
- Tokenizers (based on n-grams and 'as_is')
- Locality-sensitive hashing (LSH)

This specific instance of the notebook is applied to the analysis of NOS.


In [None]:
# flatten lists of lists
def flatten_lol(t):
    """Flatten one level of nesting: an iterable of iterables becomes one flat list.

    >>> flatten_lol([[1, 2], [3], [4, 5, 6]])
    [1, 2, 3, 4, 5, 6]
    """
    return [item for sublist in t for item in sublist]
flatten_lol([[1,2],[3],[4,5,6]])


In [None]:
#These two functions are useful for analysing bi and tri-grams with w2v models in gensim

def convert_to_undersc(skill):
    '''
    Convert the spaces in a skill phrase into underscores, so that multi-word
    phrases can be looked up in a trained w2v model.

    This is the inverse of convert_from_undersc. A single-word skill is
    returned unchanged.

    >>> convert_to_undersc('machine learning')
    'machine_learning'
    '''
    # The separator must be '_' (not '-'): convert_from_undersc splits on '_',
    # so any other character would break the round trip.
    return '_'.join(skill.split(' '))

def convert_from_undersc(skill):
    '''
    Turn the underscores that separate the terms of a skill phrase back
    into spaces. A skill without underscores is returned unchanged.
    '''
    return skill.replace('_', ' ')


In [None]:
#A few functions for tidying up text
def tag_for_lemmatise(s):
    """
    Takes a single word (string) as input.
    Tags it with nltk.pos_tag and maps the resulting tag to the one-letter
    part-of-speech code used for lemmatisation ('a', 'r', 'n' or 'v').
    Returns 'n' (noun) when the tag is not in the mapping or tagging fails.
    """
    pos_to_wordnet_dict = {
        # adjectives
        'JJ': 'a',
        'JJR': 'a',
        'JJS': 'a',
        # adverbs
        'RB': 'r',
        'RBR': 'r',
        'RBS': 'r',
        # nouns
        'NN': 'n',
        'NNP': 'n',
        'NNS': 'n',
        'NNPS': 'n',
        # verbs
        'VB': 'v',
        'VBG': 'v',
        'VBD': 'v',
        'VBN': 'v',
        'VBP': 'v',
        'VBZ': 'v',
    }
    try:
        return pos_to_wordnet_dict[nltk.pos_tag([s])[0][1]]
    except Exception:
        # Best-effort fallback (unknown tag, tagger not available, ...).
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return 'n'
    
def lemmatise(title_terms):
    """
    Lemmatise every term of a list with nltk's WordNetLemmatizer.

    Input: a list of terms. Output: a new list of the same length.
    The main purpose is to turn plural forms into singular ones. Terms that
    appear in the manually curated `keep_asis` list are passed through
    untouched so that they stay in their plural form.
    >>> lemmatise(['teachers'])
    ['teacher']
    >>> lemmatise(['analytics'])
    ['analytics']
    """
    keep_asis = ['sales', 'years', 'goods', 'operations', 'systems',
                 'communications', 'events', 'loans', 'grounds',
                 'lettings', 'claims', 'accounts', 'relations',
                 'complaints', 'services']
    wnl = nltk.WordNetLemmatizer()
    processed_terms = []
    for term in title_terms:
        if term in keep_asis:
            processed_terms.append(term)
        else:
            # default part of speech; a POS-aware variant would pass
            # pos=tag_for_lemmatise(term) here instead
            processed_terms.append(wnl.lemmatize(term))
    return processed_terms

def remove_digits(s):
    """
    Drop every digit character from a string.

    Input: a string. Output: the same string without the characters for
    which str.isdigit() is true; all other characters (spaces included)
    are kept as they are.
    >>> remove_digits('2 recruitment consultants')
    ' recruitment consultants'
    """
    non_digits = filter(lambda char: not char.isdigit(), s)
    return ''.join(non_digits)

def remove_list_enumeration(s):
    '''
    This is a specific requirement of the NOS that comes from
    the presence of lists enumerated by strings like K+number
    or P+number. Therefore, after "lowerising" and removing 
    digits, I look for and remove strings like "k " and "p "
    '''
    result = re.sub('( k )+',' ',s)
    result = re.sub('( p )+', ' ', result)
    # it might not be necessary if I add 'k' and 'p' to stopwords
    return result

# ASCII punctuation to be replaced; the apostrophe (') is deliberately left out so it is kept.
# (The backslash is written once, as '\\'; the former '\(' was an invalid escape sequence.)
select_punct = set('!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')
# additional non-ASCII symbols that are replaced as well
extra_chars = set('–-•’”“µ¾âãéˆﬁ[€™¢±ï…˜')
all_select_chars = select_punct.union(extra_chars)
def replace_punctuation(s):
    """
    Takes string as input.
    Replaces every character that is in all_select_chars with a single space
    (one space per replaced character, so the length of the string is kept).
    Returns a string.
    >>> replace_punctuation('sales executives/ - london')
    'sales executives    london'
    """
    # The table is built on each call so that later changes to all_select_chars are honoured.
    return s.translate(str.maketrans(dict.fromkeys(all_select_chars, ' ')))

def tidy_desc(desc):
    """
    Full cleaning pipeline for one description (string in, string out):
    strip '\\r\\n' and non-breaking spaces, lowercase, remove digits,
    replace punctuation with spaces, split on whitespace, lemmatise each
    token and join the tokens back with single spaces.
    """
    text = desc.replace('\r\n', '').replace('\xa0', '').lower()
    text = replace_punctuation(remove_digits(text))
    # remove_list_enumeration is intentionally not applied at this stage
    return ' '.join(lemmatise(text.split()))

def tokenize(text):
    """
    Split a string into a list of tokens with nltk.word_tokenize.
    Meant to be passed to TfidfVectorizer as its tokenizer.
    >>> tokenize('some job title')
    ['some', 'job', 'title']
    """
    return nltk.word_tokenize(text)

def tokenize_asis(some_list):
    """
    "Tokenizer" for inputs that are already lists of terms: every element is
    kept whole (multi-word phrases are not split) and only lowercased.
    Meant to be passed to TfidfVectorizer as its tokenizer.

    >>> tokenize_asis(['Accounting', 'Microsoft Excel'])
    ['accounting', 'microsoft excel']
    """
    tokens = []
    for elem in some_list:
        tokens.append(elem.lower())
    return tokens

In [None]:
#This set of functions is useful for identifying terms with highest tf-idf weights 
#in a single document or set of documents

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding
        feature names.

        row: 1-d array of tf-idf weights, indexable by integer position.
        features: sequence of feature names aligned with `row`.
        top_n: number of highest-weighted features to keep.

        Returns a DataFrame with columns ['feature', 'tfidf'], sorted by
        decreasing weight. The columns are present even when nothing is
        selected (e.g. top_n=0 or an empty row).'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (rather than assigning them afterwards)
    # also works for an empty selection, where the frame would otherwise have no columns.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25, sparse_output = False):
    ''' Return the top n features that on average are most important
        amongst documents in rows
        identified by indices in grp_ids.

        Xtr: sparse document-term matrix (must support row indexing and .toarray()).
        features: sequence of feature names aligned with the columns of Xtr.
        grp_ids: row indices to restrict to (list, numpy array or single index);
            None or an empty selection means "use all rows".
        min_tfidf: weights below this value are set to 0 before averaging.
        top_n: number of features to return.
        sparse_output: if True the result is wrapped in scipy.sparse.csr_matrix.
            NOTE(review): the result contains a string column ('feature'), so this
            conversion probably does not work as intended — confirm before using.
    '''
    # `is not None` + size check instead of `if grp_ids:` — the truth value of a
    # numpy index array is ambiguous (raises) and a scalar index 0 is falsy.
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    top_feats = top_tfidf_feats(tfidf_means, features, top_n)
    if sparse_output:
        return scipy.sparse.csr_matrix(top_feats)
    return top_feats

def all_mean_feats(Xtr, grp_ids=None, min_tfidf=0.1):
    ''' Return the average tf-idf weight of every feature
        amongst documents in rows
        identified by indices in grp_ids.

        Xtr: sparse document-term matrix (must support row indexing and .toarray()).
        grp_ids: row indices to restrict to (list, numpy array or single index);
            None or an empty selection means "use all rows".
        min_tfidf: weights below this value are set to 0 before averaging.

        Returns a 1-d numpy array with one mean value per column of Xtr.'''
    # `is not None` + size check instead of `if grp_ids:` — the truth value of a
    # numpy index array is ambiguous (raises) and a scalar index 0 is falsy.
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return tfidf_means

def get_top_words_weights(desc, feature_names, vect, n = 25):
    """
    Transform the documents in `desc` with the fitted vectoriser `vect` and
    return whatever top_mean_feats produces for the `n` features with the
    highest mean weight over all the documents.
    """
    tfidf_matrix = vect.transform(desc)
    return top_mean_feats(tfidf_matrix, feature_names, grp_ids = None, top_n = n)

def get_mean_tfidf(desc, vect):
    """
    Transform the documents in `desc` with the fitted vectoriser `vect` and
    return the mean weight of every feature over all the documents,
    as computed by all_mean_feats.
    """
    tfidf_matrix = vect.transform(desc)
    return all_mean_feats(tfidf_matrix, grp_ids = None)

def get_top_words(desc, feature_names, vect, n = 25):
    """
    Same computation as get_top_words_weights, but only the values of the
    'feature' column of the result are returned (the weights are dropped).
    """
    tfidf_matrix = vect.transform(desc)
    top_feats = top_mean_feats(tfidf_matrix, feature_names, grp_ids = None, top_n = n)
    return top_feats['feature'].values

In [None]:
#Function to parse html

from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """
    HTML parser that collects the text found between tags.

    The collected pieces are stored in `lsData` (one string per handle_data
    call) and can be retrieved as a single string with get_data().
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance buffer. It used to be a class attribute, i.e. one list
        # shared by all parsers, so text from earlier parsers leaked into new ones.
        self.lsData = list()

    def handle_data(self, data):
        """Called by HTMLParser for every piece of text; store it."""
        self.lsData.append(data)

    def get_data(self):
        """Return all the text collected so far, concatenated without separator."""
        return ''.join(self.lsData)

           
def strip_tags(some_html):
    """
    Takes string as input.
    Removes html tags, keeping only the text between them.
    Returns a string.
    """
    s = MyHTMLParser()
    # start from an empty buffer, independently of how the class stores it
    s.lsData = list()
    s.feed(some_html)
    # close() forces the parser to process text it is still buffering (for example
    # a trailing 'AT&T', which feed() holds back because of the unterminated '&').
    # It replaces the former `s.reset`, which was missing its parentheses and did nothing.
    s.close()
    return s.get_data()


In [None]:
def print_elapsed(t0_local, task = 'current task'):
    """
    Print how many seconds have passed since `t0_local`.

    t0_local: start time, as returned by time.time().
    task: short description of what has just finished, used in the message.
    """
    # '{:.4f}' (4 decimals) instead of '{:4f}', which only set a minimum width of 4
    print('Done with {}. Elapsed time: {:.4f}'.format(task,time.time()-t0_local))
    

In [None]:
# Run configuration: these strings are substituted into the names of the files
# that are read and written further down.
qualifier = 'postjoining_final_no_dropped'
qualifier0 = 'postjoining_final_no_dropped'
# used in the names of the pruned dataset and of the stopwords file
# (presumably a part-of-speech selection, 'n' for nouns — TODO confirm)
pofs = 'n'


In [None]:
# Folder where the CSV files and figures produced by this notebook are written.
# NOTE(review): absolute, user-specific path — adapt before running on another machine.
output_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/nlp_analysis/'


In [None]:
# Folder from which the pickled NOS dataset and the stopwords are read.
# NOTE(review): absolute, user-specific path — adapt before running on another machine.
lookup_dir = '/Users/stefgarasto/Google Drive/Documents/results/NOS/extracted/'


In [None]:
#Loading a pre-trained glove model into gensim
from gensim.scripts.glove2word2vec import glove2word2vec

# Folder with the pre-trained GloVe vectors.
# NOTE(review): absolute, user-specific path — adapt before running on another machine.
glove_dir = '/Users/stefgarasto/Local-Data/wordvecs/glove.twitter.27B'

# One-off conversion (kept for reference, disabled) that makes the glove model
# files compatible with gensim's word2vec text format:
#for dim in ['25','50','100','200']:
##    glove_file = os.path.join(glove_dir,'glove.twitter.27B.{}d.txt'.format(dim))
#    tmp_file = os.path.join(glove_dir, 'word2vec.glove.twitter.27B.{}d.txt'.format(dim) )
#    _ = glove2word2vec(glove_file, tmp_file)

# Switch: the word vectors are only loaded when this is set to True.
# With False, `model` is not defined by this cell.
LOADGLOVE = False
if LOADGLOVE:
    # load the glove model (100-dimensional vectors, already converted to word2vec format)
    model = gensim.models.KeyedVectors.load_word2vec_format\
    (os.path.join(glove_dir, 'word2vec.glove.twitter.27B.100d.txt'))
    #model = api.load("glove-wiki-gigaword-100")  # load pre-trained word-vectors from gensim-data
    #model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
    #word_vectors = model.wv
print('Done')


In [None]:
# Load the NOS dataset from the pickled file in lookup_dir.
# (Disabled alternative: fetch the approved apprenticeship standards from the API.)
#r2 = requests.get("https://www.instituteforapprenticeships.org/api/fullstandards/")
#df_api= pd.DataFrame(r2.json())
# NOTE(review): read_pickle unpickles the file — only use it on trusted files.
df_nos = pd.read_pickle(lookup_dir + 'all_nos_input_for_nlp_{}.zip'.format(qualifier0))

# load the cleaned and tokenised dataset and join its columns on the index
# (the 'pruned' column used below presumably comes from this file — TODO confirm)
df_nos = df_nos.join(pd.read_pickle(lookup_dir + 'all_nos_input_for_nlp_{}_pruned_{}.zip'.format(qualifier,pofs)))
print('Done')


In [None]:
# manually remove "k"s and "p"s from the pruned columns
def remove_pk(x):
    """Return a copy of the token list `x` without the tokens 'k' and 'p'."""
    kept = []
    for token in x:
        if token in ['k', 'p']:
            continue
        kept.append(token)
    return kept
df_nos['pruned'] = df_nos['pruned'].map(remove_pk)

In [None]:
df_nos.sample(n=3)


In [None]:
# Load stopwords: the pickle holds three collections, unpacked in this order.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(lookup_dir + 'stopwords_for_nos_{}_{}.pickle'.format(qualifier,pofs),'rb') as f:
    stopwords0, no_idea_why_here_stopwords, more_stopwords = pickle.load(f)
# presumably the collections are tuples, since they are extended with tuples below — TODO confirm
stopwords = stopwords0 + no_idea_why_here_stopwords 
# add a few stray symbols to both the combined and the base stopwords
stopwords += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„'])
stopwords0 += tuple(['¤', '¨', 'μ', 'บ', 'ย', 'ᶟ', '‰', '©', 'ƒ', '°', '„'])


## LSH

In [None]:
from datasketch import MinHashLSHEnsemble, MinHash, MinHashLSH


In [None]:
def shingles(text, char_ngram=5):
    '''
    Split a string into the set of all its contiguous character n-grams
    ("shingles") of length `char_ngram` (5 by default).

    A non-empty text that is not longer than `char_ngram` yields a single
    shingle, the text itself, so that short texts do not all collapse into the
    same empty set. An empty text yields an empty set.

    >>> sorted(shingles('abcdef'))
    ['abcde', 'bcdef']
    '''
    if len(text) <= char_ngram:
        return {text} if text else set()
    # `+ 1` so that the last shingle (the one ending at the end of the text) is included
    return {text[head:head + char_ngram]
            for head in range(len(text) - char_ngram + 1)}


In [None]:
# Build one set of character shingles per NOS from its cleaned full text;
# shingled_desc is aligned with the rows of df_nos.
t0 = time.time()
shingled_desc = [shingles(desc) for desc in df_nos['clean_full_text']]
print_elapsed(t0, 'splitting the text into groups of characters')


In [None]:
# Create one (still empty) MinHash object per shingled description;
# the shingles are fed into them in a later cell.
t0 = time.time()
hash_objects = [MinHash(num_perm=200) for _ in range(len(shingled_desc))]
print_elapsed(t0, 'creating hash signatures')


In [None]:
SAVELSH = False


In [None]:
# Feed every shingle (utf8-encoded) of each description into the MinHash
# object at the same position, which produces the signature of that description.
t0 = time.time()
for ix, desc in enumerate(shingled_desc):
    for d in desc:
        hash_objects[ix].update(d.encode('utf8'))
print_elapsed(t0, 'encoding hash objects')


In [None]:
# Parallel lists of labels, all in the row order of df_nos
standard_labels = list(df_nos.index.values) #df_nos['URN'].values)
title_labels = list(df_nos['NOS Title'].values)
urn_labels = list(df_nos['URN'].values)
# content: one (index label, MinHash signature) tuple per NOS
content = [(standard_labels[pos], hash_objects[pos])
           for pos in range(len(shingled_desc))]
    

In [None]:
#Define LSH and Jaccard similarity threshold
# LSH_th is also written into the names of the output files below.
LSH_th = 0.8
# num_perm has to match the value used when the MinHash objects were created (200)
lsh = MinHashLSH(threshold=LSH_th, num_perm=200)


In [None]:
# Insert every signature into the LSH index, keyed by the df_nos index label.
# NOTE(review): re-running this cell without re-creating `lsh` presumably fails
# because the keys are already present — confirm.
for ix,elem in enumerate(content):
    #lsh.insert('{}'.format(ix), elem[1]) #elem[0], elem[1])
    lsh.insert(elem[0], elem[1])
    

In [None]:
#For each standard search all signatures and identify potential clashes (e.g. other standards 
# with Jaccard similarity of shingle sets greater or equal to the threshold). 
# Note: some of the candidates might be false positives.
#
# Outputs:
#   candidates  - dict: index label -> list of (URN, NOS title, index label) of its matches
#   candidates2 - list with the raw query results (lists of index labels)
#   singletons  - index labels whose query returned at most one result
candidates = {}
singletons = []
candidates2 = []
# index label -> URN, built once so that each lookup does not scan standard_labels
# (the labels are the keys of the LSH index, hence unique)
urn_by_label = dict(zip(standard_labels, urn_labels))
for ix, desc in enumerate(shingled_desc):
    result = lsh.query(hash_objects[ix])
    if len(result) >1:
        full_result = [(urn_by_label[res], df_nos['NOS Title'].loc[res], res)
                       for res in result]

        candidates[standard_labels[ix]] = full_result
        candidates2.append(result)
        # show the matches found for the first few NOS only
        if ix<40:
            print(urn_labels[ix], ': ', full_result)
            print('***************')
    else:
        singletons.append(standard_labels[ix])
        

In [None]:
# Amount of NOS that found at least a match = total NOS - number of singletons
# (title_labels has one entry per row of df_nos)
print('Nb. of NOS that were not matched with anything: {}'.format(len(singletons)))
print('Nb. of NOS that matched: {}'.format(len(title_labels) - len(singletons)))


In [None]:

'''# If we make the assumptions that groups are closed and don't form chains, then:
# nb of groups of N = nb of matches of length N / N
all_lengths = np.array([len(t) for t in candidates2])
unique_lengths = list(set(all_lengths))
len_counts = []
for ln in unique_lengths:
    len_counts.append((ln,np.sum(all_lengths == ln)))
print(np.sum([t[1] for t in len_counts]))
print(len(candidates2),len(candidates))
print(len_counts)
'''
print('Done')

In [None]:
# Build an adjacency matrix over the NOS that have at least one match:
# node i is the i-th key of `candidates`, and entry (i, j) is 1 when node j
# appears in the list of matches of node i.
t0 = time.time()
# every NOS is either a singleton or a key of `candidates`, so this is the number of nodes
Nmatched = len(title_labels) - len(singletons)
Adj_matrix = np.zeros((Nmatched,Nmatched))
# create dictionary of indices (index label -> matrix position, and the reverse)
indices = {}
indices_reverse = {}
for ix, candidate in enumerate(candidates):
    indices[candidate] = ix
    indices_reverse[ix] = candidate
# now cycle again through the matched NOS and populate the adjacency matrix
for ix, candidate in enumerate(candidates):
    idx1 = ix
    for k in candidates[candidate]:
        # k is a tuple (URN, NOS title, index label); the index label (k[2]) identifies the node
        idx2 = indices[k[2]]
        Adj_matrix[idx1,idx2] = 1

print_elapsed(t0,'creating the adjacency matrix')
#plt.figure(figsize = (5,5))
#plt.imshow(Adj_matrix[:200,:200])
print('The highest degree in the adjacency matrix is: ', np.max(np.sum(Adj_matrix,axis=1)))
# rows summing to 2 have exactly two entries set
# (presumably the node itself plus one other NOS — TODO confirm that queries return the node itself)
print('The number of matched couples are: ', np.sum(np.sum(Adj_matrix, axis = 1)==2))


In [None]:
# group the NOS that were matched as similar
# For every node not yet assigned to a group, start from its neighbours and keep
# adding the neighbours of the neighbours. A group is only stored when it is
# self-contained (all the nodes reached were also visited).
# Outputs: matched_groups (list of tuples of node positions) and
#          matched_indices (flat list of all the positions placed in a group).
t0 = time.time()
matched_groups = []
matched_indices = []
for ix in range(Adj_matrix.shape[0]):
    idx_used = []
    # find the adjacent nodes
    where_list = list(np.where(Adj_matrix[ix])[0])
    where_list_cumul = []
    # don't go into the rabbit hole of nodes with very high degree - also, don't use indices already matched
    if len(where_list)<60000 and (ix not in matched_indices):
        # where_list is extended in place inside the loop, so the iteration
        # also visits the nodes that get appended along the way
        for ix2 in where_list:
            # if the neighborhood has connections to indices that we haven't included yet, 
            # add them to the list to be analysed later
            where_list_cumul += list(np.where(Adj_matrix[ix2])[0])
            idx_used.append(ix2)
            # grow the neighbourhood by adding the new connections
            new_list = [t for t in where_list_cumul if t not in where_list]
            # don't go into the rabbit hole of nodes with very high degree
            if len(new_list)>60000:
                break # this one tells it to break the inner for cycle - it goes to the next if
            if len(new_list):
                # if the length is zero it means there are no new connected nodes
                where_list+=new_list
        # if it has never gone into a rabbit hole then add the group just found
        # if and only if the neighbourhood is self-contained, that is, if the nodes whose
        # neighbours have been collected (idx_used) are exactly the nodes reached (where_list)
        # NOTE(review): new_list holds the value from the last iteration of the inner loop;
        # it is only defined if that loop ran at least once (now or for an earlier ix).
        if (set(idx_used) == set(where_list)) and (len(new_list)<6):
            # NOTE(review): this branch cannot be reached, since len(new_list)<6 holds here
            if len(new_list)>6:
                print('got here after breaking', idx_used)
            matched_groups.append(tuple(idx_used))
            matched_indices += idx_used #[t for t in idx_used if t not in matched_indices]
print_elapsed(t0, 'grouping the similar NOS')


In [None]:
#for t in matched_couples:
#    if not (t in matched_groups):
#        print(t)
# show some of the groups
groups_length = [len(t) for t in matched_groups]
print('Number of groups of size up to 3: ', np.sum([t<4 for t in groups_length]))
# print up to three groups of size 5 or 6, each followed by the neighbours of its members
nbprint = 0
for t in matched_groups:
    if len(t)>4 and len(t)<7:
        print(t)
        for it in t:
            print(np.where(Adj_matrix[it])[0])
        nbprint+=1
    if nbprint>2:
        break


In [None]:
# check that all the groups are not overlapping: if it prints something is bad
# Every group is compared with every other group (quadratic in the number of groups);
# the groups that share no node with any other group are collected in matched_groups2.
print('If something gets printed next, then the groups indentified have overlaps')
matched_groups2 = []
for it,t in enumerate(matched_groups):
    # safety cap: only the first 40001 groups are checked
    if it>40000:
        break
    # flag counts the groups that share at least one node with t
    flag = 0
    for t2 in matched_groups:
        # NOTE(review): two groups stored as equal tuples are skipped by this test
        if t != t2:
            if set(t).intersection(set(t2)):
                print(t,t2)
                flag += 1
                #if (set(t)-set(t2)) and (set(t2) - set(t)):
                #    print(t,t2)
    if flag == 0:
        matched_groups2.append(t)
    else:
        print(flag)
        

In [None]:
import copy
# N: number of nodes in the adjacency matrix (NOS with at least one match)
N = Adj_matrix.shape[0]
# Nmatched is re-defined here: number of node positions that were placed in a group
Nmatched= len(matched_indices)
# NOTE(review): the two labels in this message look swapped with respect to the
# values printed next to them (N counts all the nodes, Nmatched the grouped ones) — confirm.
print('number of grouped NOS', N, 'number of matched NOS', Nmatched)
# finally, check that the grouped and the non-grouped indices have a separate adjacency matrix
Adj_matrix2 = copy.deepcopy(Adj_matrix)
leftout_indices = list(set(range(N)) - set(matched_indices))
# new order: grouped nodes first, then the nodes that were left out
neworder_indices = np.array(matched_indices + leftout_indices)
# rearrange rows and columns of the adjacency matrix
Adj_matrix2 = Adj_matrix2[:, neworder_indices][neworder_indices]
print('For the two parts of the Adjacency matrix to be separate, there has to be no edges between the two.')
print('The number of edges is: ')
# block of edges going from grouped nodes (rows) to left-out nodes (columns)
print(np.sum(Adj_matrix2[:Nmatched,Nmatched:]))
# check the matrix is symmetric (fraction of entries equal to their transposed entry)
print('if 1 the matrix is symmetric: ', np.mean(Adj_matrix2 == Adj_matrix2.T))


In [None]:
print(leftout_indices)

In [None]:
# now redo the dictionary to save, with placing the self-contained groups first and then the unpaired ones
from collections import OrderedDict
# candidates_paired:  'Pair k'  -> list of (URN, NOS title, index label), for groups of exactly 2 NOS
# candidates_grouped: 'Group k' -> list of (URN, NOS title, index label), for groups of more than 2 NOS
candidates_grouped = OrderedDict()
candidates_paired = OrderedDict()

#print(indices_reverse)
pair_counter = 0
group_counter = 0
# first pass: groups of two
for ix, group in enumerate(matched_groups):
    full_result = []
    if len(group)==2:
        for ig in group:
            # translate the matrix position back to the index label of df_nos
            res = indices_reverse[ig]
            full_result.append((urn_labels[standard_labels.index(res)
                                          ],df_nos['NOS Title'].loc[res],res))
        candidates_paired['Pair {}'.format(pair_counter)] = full_result
        pair_counter+=1
# second pass: groups of three or more
for ix, group in enumerate(matched_groups):
    full_result = []
    if len(group)>2:
        for ig in group:
            res = indices_reverse[ig]
            full_result.append((urn_labels[standard_labels.index(res)
                                          ],df_nos['NOS Title'].loc[res],res))
        candidates_grouped['Group {}'.format(group_counter)] = full_result
        group_counter+=1
    
#candidates_grouped['Finished with groups'] = '-'

# The string below is disabled code kept for reference.
# NOTE(review): it indexes indices_reverse with `ig` although its loop variable is `ix`.
'''# now add the ones that weren't matched
for ix in leftout_indices:
    res = indices_reverse[ig]
    candidates_grouped[res] = candidates[res]
'''
# try to save it, just to see what it looks like
# NOTE(review): `or True` makes both conditions below always true, so the files
# are written regardless of SAVELSH.
if SAVELSH or True:
    tmp = pd.DataFrame.from_dict(candidates_grouped, orient = 'index')
    print(tmp.columns)
    tmp.to_csv(output_dir +  '/LSH_results_grouped_no_pairs_{}_th{}.csv'.format(qualifier,LSH_th))

if SAVELSH or True:
    pd.DataFrame.from_dict(candidates_paired, orient = 'index', columns = [
                                                                    'NOS 1','NOS 2']).to_csv(output_dir + 
                                                '/LSH_results_paired_{}_th{}.csv'.format(qualifier,LSH_th))


In [None]:
# Save the raw candidates as CSV and as pickle.
# NOTE(review): `and False` makes the condition always false, so this block
# never runs, whatever the value of SAVELSH.
if SAVELSH and False:
    pd.DataFrame.from_dict(candidates, orient = 'index').to_csv(output_dir + '/LSH_results_{}_th{}.csv'.format(
        qualifier,LSH_th))
    with open(output_dir + '/Candidates_nos_{}_th{}.pickle'.format(qualifier,LSH_th),'wb') as f:
        pickle.dump(candidates,f)
        

In [None]:
# Plot how many duplicates per suite / per originating organisation.
# For each of the two columns, two horizontal bar charts are drawn and saved:
#   1. the number of NOS with at least one match, per category (top N categories)
#   2. the proportion of NOS with at least one match, for the N most frequent categories
for col in ['One_suite', 'Developed By']:
    # get the value counts: among the NOS that have matches, and among all the NOS
    duplicated_suites = df_nos.loc[list(candidates.keys())][col].value_counts()
    all_suites = df_nos[col].value_counts()
    if col == 'One_suite':
        N = 70
        fig =plt.figure(figsize = (11,18))
        plt.ylabel('Suite',fontsize = 18)
    else:
        N = 32
        fig = plt.figure(figsize = (7,12))
        plt.ylabel('Developing organisation', fontsize = 18)
    plt.xlabel('Counts', fontsize = 18)
    with sns.plotting_context('talk'):
        # `kind` has to be a keyword: pandas >= 1.0 rejects positional arguments in Series.plot
        duplicated_suites[:N][::-1].plot(kind = 'barh', color = nesta_colours[3])
    ax = plt.gca()
    # draw first, so that the tick labels are populated before they are read
    fig.canvas.draw()
    labels = [item.get_text().capitalize() for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    plt.tight_layout()
    # NOTE(review): `or True` means the figure is saved regardless of SAVELSH
    if SAVELSH or True:
        plt.savefig(os.path.join(output_dir,'{}_counts_for_duplicates.png'.format(col)))

    # now divide by the sum (so we get fractions)
    #duplicated_suites = duplicated_suites.map(lambda x: x/duplicated_suites.sum())
    #all_suites = all_suites.map(lambda x: x/all_suites.sum())
    # get the ratio of proportions with respect to the full distribution
    suites_ratio = {}
    for row in all_suites.index[:N]:
        # categories without any matched NOS are absent from duplicated_suites: ratio 0
        if row in duplicated_suites.index:
            suites_ratio[row] = duplicated_suites.loc[row]/all_suites.loc[row]
        else:
            suites_ratio[row] = 0
    suites_ratio = pd.DataFrame.from_dict(suites_ratio, orient = 'index', columns = ['ratio'])
    ## order by decreasing ratios 
    #suites_ratio= suites_ratio.sort_values(by='ratio', ascending = False)
    # plot the ratio
    if col == 'One_suite':
        fig=plt.figure(figsize = (12,18))
        plt.ylabel('Suite',fontsize = 18)
    else:
        fig =plt.figure(figsize = (7,12))
        plt.ylabel('Developing organisation', fontsize = 18)
    plt.xlabel('Proportion of NOS', fontsize = 18)
    plt.xlim([0,1])
    with sns.plotting_context('talk'):
        # a slice (rather than .iloc[range(N)]) also works when there are fewer than N categories
        suites_ratio['ratio'].iloc[:N][::-1].plot(kind = 'barh', color = nesta_colours[3])
    ax = plt.gca()
    fig.canvas.draw()
    labels = [item.get_text().capitalize() for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    plt.tight_layout()
    if SAVELSH or True:
        plt.savefig(os.path.join(output_dir,'{}_ratios_for_duplicates_{}.png'.format(col,LSH_th)))



### Compute level of overlap for each group identified via the LSH algorithm
- Use the average level of pairwise overlap between NOS in each group


In [None]:
def jaccard_similarity(set1, set2):
    """
    Exact Jaccard similarity of two sets: |intersection| / |union|.

    Returns a float between 0 and 1. Two empty sets are treated as identical
    and give 1.0 (instead of raising ZeroDivisionError).
    >>> jaccard_similarity({1, 2, 3}, {2, 3, 4})
    0.5
    """
    union = len(set1.union(set2))
    if union == 0:
        # both sets are empty
        return 1.0
    intersection = len(set1.intersection(set2))
    return float(intersection / union)

In [None]:
# Lookup table: index label of a NOS -> its MinHash signature
#print(candidates_grouped['Group 0'])
content_dict = {}
for ix,elem in enumerate(content):
    content_dict[elem[0]] = elem[1]
    
# Sanity check on the last element of `content` (`elem` is left over from the loop above):
# the signature of its first query result is compared with itself, then all its results are printed.
print(content_dict[lsh.query(elem[1])[0]].jaccard(content_dict[lsh.query(elem[1])[0]]))
print(lsh.query(elem[1]))

In [None]:
# compute the average level of overlap within groups and compare to the average number of common words
# For every pair this cell collects, in two parallel lists:
#   all_j_values - estimated Jaccard similarity of the two MinHash signatures
#   all_w_values - share of common words: 2*|common| / (|words 1| + |words 2|)
#candidates_grouped_new = copy.deepcopy(candidates_grouped)
#A = []
#B = []
all_j_values = []
all_w_values = []
t0 = time.time()
for ig,g in enumerate(candidates_paired.keys()):
    # guard against a marker entry that is not a real group
    if g != 'Finished with groups':
        G = candidates_paired[g]
        j_values = []
        g_suites = []
        g_cat = []
        w_values = []
        
        for t,ix1 in enumerate(G):
            # create bag of words that retain duplicated words
            # (a word that was already seen is stored as word+'1', so, once turned
            # into a set, at most two occurrences of the same word are distinguished)
            if len(G)<50:
                t1 = []
                for tt in df_nos.loc[ix1[2]]['clean_full_text'].split():
                    if tt not in t1:
                        t1.append(tt)
                    else:
                        t1.append(tt+'1')
            # compare with the elements that come after it, so each couple is counted once
            for ix2 in G[t+1:]:
                j_values.append(content_dict[ix1[2]].jaccard(content_dict[ix2[2]]))
                if len(G)<50:
                    # create bag of words that retain duplicates
                    t2 = []
                    for tt in df_nos.loc[ix2[2]]['clean_full_text'].split():
                        if tt not in t2:
                            t2.append(tt)
                        else:
                            t2.append(tt + '1')
                    w_values.append(len(set(t1).intersection(set(t2)))/ (len(set(t1))+len(set(t2))) *2)
            g_suites.append(df_nos.loc[ix1[2]]['One_suite'])
            g_cat.append(df_nos.loc[ix1[2]]['NOSCategory'])
        
        if len(G)<50:
            # keep jaccard similarities
            all_j_values += j_values
            # keep proportion of common words
            all_w_values += w_values
        # NOTE(review): g_suites, g_cat and tmp are computed but not used in this cell
        tmp = [t == 'generic' for t in g_cat]
    # progress message every 1000 pairs
    if ig%1000 == 999:
        print('Update. Time = {:.4f}s'.format(time.time()-t0))
print('Done in {:.4f}s'.format(time.time()-t0))
# print number of common words
#print(np.mean(all_w_values[all_j_values>.9]))

In [None]:
# compute the average level of overlap within groups bigger than two
# Result: a table with one row per group - the average pairwise similarity of the
# MinHash signatures first, then the NOS of the group - sorted by decreasing similarity.
# This cell deliberately does not touch all_j_values / all_w_values: they hold the
# per-pair values computed earlier and are read again further down the notebook.
candidates_grouped_new = copy.deepcopy(candidates_grouped)
t0 = time.time()
for ig,g in enumerate(candidates_grouped.keys()):
    # guard against a marker entry that is not a real group
    if g != 'Finished with groups':
        G = candidates_grouped[g]
        j_values = []
        # compare each element with the ones that come after it, so each couple is counted once
        for t,ix1 in enumerate(G):
            for ix2 in G[t+1:]:
                j_values.append(content_dict[ix1[2]].jaccard(content_dict[ix2[2]]))
        # put the average similarity in front of the list of NOS in the group
        candidates_grouped_new[g] = [np.around(np.mean(j_values),3)] + G
    # progress message every 1000 groups
    if ig%1000 == 999:
        print('Update. Time = {:.4f}s'.format(time.time()-t0))
print('Done in {:.4f}s'.format(time.time()-t0))

#candidates_grouped_new.pop('Finished with groups')
# create the dataframe
tmp = pd.DataFrame.from_dict(candidates_grouped_new, orient = 'index').rename(columns = {0:'Avg group similarity'})
# rename the columns (column 0 has just been renamed, so the others become 'NOS 1', 'NOS 2', ...)
rename_dict= {}
for ii in range(len(tmp.columns)):
    rename_dict[ii] = 'NOS {}'.format(ii)
tmp = tmp.rename(columns = rename_dict)
# sort by descending values of the average similarity
tmp = tmp.sort_values(by = 'Avg group similarity', ascending = False)
# rename the rows after the sorting
tmp = tmp.reset_index(drop=True)
rename_dict = {}
for ii in range(len(tmp)):
    rename_dict[ii] = 'Group {}'.format(ii)
tmp = tmp.rename(index = rename_dict)

# NOTE(review): `or True` means the file is written regardless of SAVELSH
if SAVELSH or True:
#   tmp = pd.DataFrame.from_dict(candidates_grouped_new, orient = 'index').rename(columns= {0:'Avg group similarity'})
    tmp.to_csv(output_dir + '/LSH_results_grouped_no_pairs_with_score_{}_th{}.csv'.format(qualifier,LSH_th))


In [None]:
# compute the average level of overlap within pairs
# Result: a table with one row per pair - the similarity of the two MinHash
# signatures first, then the two NOS - sorted by decreasing similarity.
candidates_paired_new = copy.deepcopy(candidates_paired)
t0 = time.time()
for ig,g in enumerate(candidates_paired.keys()):
    # guard against a marker entry that is not a real pair
    if g != 'Finished with groups':
        G = candidates_paired[g]
        j_values = []
        for t,ix1 in enumerate(G):
            for ix2 in G[t+1:]:
                # j_values is overwritten (not appended to); a pair has a single couple anyway
                j_values = [content_dict[ix1[2]].jaccard(content_dict[ix2[2]])]
                # NOTE(review): leftover debugging - the first condition is always true,
                # so this prints whenever the first NOS of a couple is 'mpqmg31.pdf'
                if ig%30>-1:
                    if ix1[2]=='mpqmg31.pdf':
                        print(j_values,content_dict[ix1[2]] )
                    #print(ix1[2],ix2[2])
        # put the similarity in front of the list of NOS in the pair
        candidates_paired_new[g] = [np.around(np.mean(j_values),3)] + G
    # progress message every 1000 pairs
    if ig%1000 == 999:
        print('Update. Time = {:.4f}s'.format(time.time()-t0))
print('Done in {:.4f}s'.format(time.time()-t0))

#candidates_grouped_new.pop('Finished with groups')
# create the dataframe
tmp = pd.DataFrame.from_dict(candidates_paired_new, orient = 'index').rename(columns = {0:'Avg group similarity'})
# rename the columns (column 0 has just been renamed, so the others become 'NOS 1', 'NOS 2')
rename_dict= {}
for ii in range(len(tmp.columns)):
    rename_dict[ii] = 'NOS {}'.format(ii)
tmp = tmp.rename(columns = rename_dict)
# sort by descending values of the average similarity
tmp = tmp.sort_values(by = 'Avg group similarity', ascending = False)
# rename the rows after the sorting
tmp = tmp.reset_index(drop=True)
rename_dict = {}
for ii in range(len(tmp)):
    rename_dict[ii] = 'Pair {}'.format(ii)
tmp = tmp.rename(index = rename_dict)
# NOTE(review): `or True` means the file is written regardless of SAVELSH
if SAVELSH or True:
#   tmp = pd.DataFrame.from_dict(candidates_grouped_new, orient = 'index').rename(columns= {0:'Avg group similarity'})
    tmp.to_csv(output_dir + '/LSH_results_pairs_with_score_{}_th{}.csv'.format(qualifier,LSH_th))


In [None]:
# Mean share of common words among the couples whose signature similarity is above 0.9,
# and among those strictly between 0.8 and 0.9 (a value of exactly 0.9 falls in neither).
# all_w_values and all_j_values must be parallel lists of the same length.
print('90%: ',np.array(all_w_values)[np.array(all_j_values)>0.9].mean())
print('80%: ',np.array(all_w_values)[(np.array(all_j_values)>0.8) & (np.array(all_j_values)<0.9)].mean())

In [None]:
print(df_nos['clean_full_text'].map(lambda x: len(x.split())).mean())

### Ad hoc requests

In [None]:
# Rows of df_nos for the NOS that have at least one match
duplicated_nos = df_nos.loc[list(candidates.keys())]
# subset developed by 'construction skills' (taken before duplicated_nos is narrowed down)
duplicated_cos = duplicated_nos[duplicated_nos['Developed By']=='construction skills']
# from here on duplicated_nos only holds the NOS developed by 'semta'
duplicated_nos = duplicated_nos[duplicated_nos['Developed By']=='semta']
tmp = list(duplicated_nos.index)
# index labels with an 'l' among their last 7 characters
print([t for t in tmp if 'l' in t[-7:]])
print(duplicated_nos.columns)
print(duplicated_nos['Version_number'].value_counts())


In [None]:
'''
# to be old and new updated nos, it would have to be the following conditions:
1. group of two NOS
2. two different versions (1 and higher)
3. same or very similar suite (that is, the same aside from numbers)
'''
semta_versions = []
mixed_version = []
semta_suites = []
semta_titles = []
for g in candidates_grouped_new:
    group = candidates_grouped_new[g]
    if group[1][0][:3] == 'sem':
        #print(len(group))
        semta_versions.append([])
        semta_suites.append([])
        semta_titles.append([])
        if len(group)==3:
            # only take groups of pairs
            for ix in range(1,len(group)):
                #print(duplicated_nos.loc[group[ix][2]]['Version_number'])
                try:
                    semta_versions[-1].append(duplicated_nos.loc[group[ix][2]]['Version_number'])
                    semta_suites[-1].append(duplicated_nos.loc[group[ix][2]]['One_suite'])
                    semta_titles[-1].append(group[ix][1])
                except:
                    1
            
            if (1.0 in semta_versions[-1]) & ((2.0 in semta_versions[-1]) or (3.0 in semta_versions[-1])):
                # check the suites
                suite1 = ''.join([t for t in semta_suites[-1][0] if not t.isdigit()])
                suite2 = ''.join([t for t in semta_suites[-1][1] if not t.isdigit()])
                title1 = semta_titles[-1][0]
                title2 = semta_titles[-1][1]
                out = process.extract(title1, [title2])
                out2 = process.extract(suite1, [suite2])
                # assume two titles are similar if the fuzzy matching is higher than 90
                if out[0][1]>89:
                    # if we also require the two suite names to have a match higher than 90
                    if out2[0][1]>89:
                        mixed_version.append(group)
                        print(semta_suites[-1][0],semta_suites[-1][1])
                        
#print(mixed_version)
len(mixed_version), len(semta_versions), len(duplicated_nos)


In [None]:
# construction skills should be responsible for the very big group
print(len(duplicated_cos))
for g in candidates_grouped_new:
    group = candidates_grouped_new[g]
    # group[0] is the average similarity; the NOS tuples (URN, title, index label) start at position 1
    if group[1][0][:3] == 'cos':
        # only look at the very large groups
        if len(group)>200:
            # count how many NOS of the group have a URN starting with 'cos'
            tmp = [group[ix][0][:3] for ix in range(1,len(group))]
            print(sum([t=='cos' for t in tmp]))
