# Spanish P.S. Cluster Analysis

In [None]:
import numpy as np
import pickle
import os.path
import scipy
import scipy.spatial
import sklearn
import sklearn.cluster
import pandas
import collections
import random
import json

## Preprocessing

- Reading in the corpus data
- Splitting into historical/original and modernised portion
- Creating distance matrix from the historical data

In [None]:
INPUT = '../../histnorm/datasets/historical/spanish/spanish-ps<n>.dev.txt'
ENCODING = 'utf-8'
CORPUS_NAME = 'spanish-ps'
# Rows whose original form starts with one of these characters are dropped below.
FILTER = ('"', "'", '#', '.', ',', '(', ')', ';', '—', '/')

# Every input line holds the original and the modernised token separated by a
# tab; the <n> placeholder in INPUT is filled with 16, 17, 18 and 19 in turn.
tokens_raw = []
for n in range(16, 20):
    inputfile = INPUT.replace('<n>', str(n))
    with open(inputfile, 'r', encoding=ENCODING) as infile:
        tokens_raw.extend(line.strip().split('\t') for line in infile)

# Keep rows with at least two columns whose first column does not start with
# any of the FILTER characters.
tokens = [
    token
    for token in tokens_raw
    if len(token) > 1 and not token[0].startswith(FILTER)
]

In [None]:
# Getting the original and modernised tokens and types
tokens_original = [token[0].lower() for token in tokens]
tokens_modernised = [token[1].lower() for token in tokens]

# The types are sorted so that their order is the same in every session.
# Iterating a set of strings yields a different order from one interpreter
# run to the next (string hashing is randomised), and the order of
# types_original determines the layout of the pairwise distance arrays that
# are pickled and reloaded further down.
# NOTE: cache files written with a different type order must be regenerated.
types_original = sorted(set(tokens_original))
types_modernised = sorted(set(tokens_modernised))

In [None]:
# Frequency of every token, and the number of tokens that occur exactly once
# (hapax legomena), for the original and the modernised side.
tokens_original_count = collections.Counter(tokens_original)
tokens_modernised_count = collections.Counter(tokens_modernised)

hapax_original_count = sum(1 for freq in tokens_original_count.values() if freq == 1)
hapax_modernised_count = sum(1 for freq in tokens_modernised_count.values() if freq == 1)

In [None]:
# Build the evaluation clustering: every modernised form is a cluster key and
# collects all (lower-cased) original forms that were mapped to it.
evaluation_cluster = dict()

for token in tokens:
    hist = token[0].lower()
    cont = token[1].lower()
    evaluation_cluster.setdefault(cont, []).append(hist)

# Same clustering with duplicate members removed from every cluster.
evaluation_cluster_types = {
    modern_form: list(set(members))
    for modern_form, members in evaluation_cluster.items()
}

In [None]:
# Levenshtein Distance
def levenshtein(string1, string2):
    """
    Return the Levenshtein (edit) distance between two strings.

    Insertions, deletions and substitutions each cost 1.
    """
    # Shortcuts: identical input, or one side empty.
    if string1 == string2:
        return 0
    if not string1 or not string2:
        return max(len(string1), len(string2))

    height = len(string1) + 1
    width = len(string2) + 1

    # grid[r][c] is the distance between string1[:r] and string2[:c].
    grid = [[0] * width for _ in range(height)]
    for r in range(1, height):
        grid[r][0] = r
    for c in range(1, width):
        grid[0][c] = c

    for c, char2 in enumerate(string2, start=1):
        for r, char1 in enumerate(string1, start=1):
            substitution = grid[r - 1][c - 1] + (0 if char1 == char2 else 1)
            deletion = grid[r - 1][c] + 1
            insertion = grid[r][c - 1] + 1
            grid[r][c] = min(deletion, insertion, substitution)

    # Bottom-right cell covers both complete strings.
    return grid[-1][-1]

assert levenshtein('', '') == 0
assert levenshtein('foobar', 'foobar') == 0
assert levenshtein('foobar', 'foubar') == 1
assert levenshtein('foobar', 'fuubar') == 2
assert levenshtein('foobar', 'fuuar') == 3
assert levenshtein('foobar', '') == 6

In [None]:
# Jaro Similarity
def jaro(string1, string2):
    """
    Return the Jaro similarity of two strings as a float in [0.0, 1.0].

    1.0 is returned for identical non-empty strings; 0.0 is returned when
    the first string is empty or when no characters match.
    """
    size1, size2 = len(string1), len(string2)

    # An empty first string scores 0.0, even against another empty string.
    if not size1:
        return 0.0
    if string1 == string2:
        return 1.0

    # Characters only count as matching within this many positions.
    window = max(size1, size2) // 2 - 1

    matched_1 = []
    matched_2 = []
    for pos1, char1 in enumerate(string1):
        start = max(0, pos1 - window)
        stop = min(pos1 + window, size2 - 1) + 1
        for pos2 in range(start, stop):
            # Skip positions already paired up and characters that differ.
            if pos2 in matched_2 or string2[pos2] != char1:
                continue
            matched_1.append(pos1)
            matched_2.append(pos2)
            break

    matches = len(matched_1)
    if matches == 0:
        return 0.0

    # Walk both match lists in string order and count characters that do
    # not line up; half of that count is the number of transpositions.
    matched_2.sort()
    transpositions = sum(
        1 for pos1, pos2 in zip(matched_1, matched_2)
        if string1[pos1] != string2[pos2]
    )

    return (1/3 * ( matches / size1 + matches / size2 + (matches - transpositions // 2) / matches))

assert jaro('', '') == 0.0
assert jaro('foobar', '') == 0.0
assert jaro('foobar', 'foobar') == 1.0
assert jaro('foobar', 'barfoo') == 0.4444444444444444
assert jaro('duane', 'dwayne') == 0.8222222222222222
assert jaro('hans', 'gruber') == 0.0

In [None]:
# IBM (LCS-Levenshtein Normalized)

# Contractor, D., Faruquie, T. A., & Subramaniam, L. V. (2010, August). 
# Unsupervised cleansing of noisy text. 
# In Proceedings of the 23rd International Conference on Computational Linguistics:
# Posters (pp. 189-196). Association for Computational Linguistics.

from itertools import groupby

# Longest Common Substring
def longest_common_string(string1, string2):
    """
    Return the length of the longest contiguous substring shared by both strings.

    Returns 0 when either string is empty and len(string1) when both are equal.
    """
    if string1 == string2:
        return len(string1)

    if not string1 or not string2:
        return 0

    rows = len(string1) + 1
    cols = len(string2) + 1
    # table[row][col] is the length of the common run that ends at
    # string1[row - 1] and string2[col - 1]; row 0 and column 0 stand for
    # the empty prefix and stay 0.
    table = [[0] * cols for _ in range(rows)]

    longest = 0
    # Both loops start at 1. Starting at 0 would make the index [row - 1] /
    # [col - 1] wrap around to the last character and the last table row,
    # which inflates the result (e.g. 'ab' vs 'ba' gave 2 instead of 1).
    for row in range(1, rows):
        for col in range(1, cols):
            if string1[row - 1] == string2[col - 1]:
                table[row][col] = table[row - 1][col - 1] + 1
                longest = max(longest, table[row][col])

    return longest

assert longest_common_string('', '') == 0
assert longest_common_string('foobar', '') == 0
assert longest_common_string('foobar', 'foobar') == 6
assert longest_common_string('foobar', 'foo') == 3
assert longest_common_string('foobar', 'f') == 1
assert longest_common_string('ab', 'ba') == 1


def lcs_ratio(string1, string2):
    """
    Return the longest-common-substring length divided by len(string1).

    The result is 0.0 when either string is empty.
    """
    if string1 and string2:
        return longest_common_string(string1, string2) / len(string1)
    return 0.0

assert lcs_ratio('', '') == 0.0
assert lcs_ratio('foo', '') == 0.0
assert lcs_ratio('foobar', 'foobar') == 1.0
assert lcs_ratio('foo', 'bar') == 0.0
assert lcs_ratio('word', 'deoxyribonucleic') == 0.25


def consonant_skeleton(string, vowels='aeiouy'):
    """
    Return `string` without the characters in `vowels`, with every run of
    identical neighbouring characters in the remainder collapsed to one.
    """
    consonants = (char for char in string if char not in vowels)
    return ''.join(first_of_run for first_of_run, _run in groupby(consonants))

assert consonant_skeleton('') == ''
assert consonant_skeleton('aeio') == ''
assert consonant_skeleton('foobar') == 'fbr'
assert consonant_skeleton('ffoobbar') == 'fbr'
assert consonant_skeleton('barfoobar') == 'brfbr'


def ibm_similarity(string1, string2, vowels='aeiouy'):
    """
    Return lcs_ratio(string1, string2) divided by one plus the Levenshtein
    distance between the consonant skeletons of both strings.
    """
    skeleton1 = consonant_skeleton(string1, vowels)
    skeleton2 = consonant_skeleton(string2, vowels)
    skeleton_distance = levenshtein(skeleton1, skeleton2)
    # The +1 keeps the denominator non-zero when the skeletons are equal.
    return lcs_ratio(string1, string2) / (skeleton_distance + 1)

assert ibm_similarity('', '') == 0.0
assert ibm_similarity('foobar', '') == 0.0
assert ibm_similarity('foobar', 'foobar') == 1.0
assert ibm_similarity('foo', 'bar') == 0.0
assert ibm_similarity('word', 'deoxyribonucleic') == 0.03125
assert ibm_similarity('foobar', 'aeiou') == 0.041666666666666664

In [None]:
# Load precalculated data from cache
# NOTE(review): unpickling can execute arbitrary code; only load cache files
# that were written by this notebook.
_cache_levenshtein = 'spanish_types_original_pairwise_distance_levenshtein.pickle'
_cache_jaro = 'spanish_types_original_pairwise_distance_jaro.pickle'
_cache_ibm = 'spanish_types_original_pairwise_distance_ibm.pickle'

# All three files are read below, so all three must exist (previously only
# the Levenshtein file was checked and a missing sibling raised an error).
if all(os.path.exists(path) for path in (_cache_levenshtein, _cache_jaro, _cache_ibm)):
    types_original_reshaped = np.array(types_original).reshape(-1,1)
    # `with` closes every file handle as soon as its content is loaded.
    with open(_cache_levenshtein, 'rb') as cache_file:
        types_original_pairwise_distance_levenshtein = pickle.load(cache_file)
    with open(_cache_jaro, 'rb') as cache_file:
        types_original_pairwise_distance_jaro = pickle.load(cache_file)
    with open(_cache_ibm, 'rb') as cache_file:
        types_original_pairwise_distance_ibm = pickle.load(cache_file)

In [None]:
%%time

# Evaluate each string measure on every pair of original types.
# This step can be skipped when cached data was loaded; it might take a while.
# NOTE(review): jaro and ibm_similarity return similarities, yet their values
# are stored under "distance" names like the Levenshtein ones.

ibm_vowels = 'aeoiuyjóáéàòâÿíôêè'

# pdist needs a 2-D array, so every type becomes a one-element row.
types_original_reshaped = np.array(types_original).reshape(-1,1)

types_original_pairwise_distance_levenshtein = scipy.spatial.distance.pdist(
    types_original_reshaped,
    lambda x, y: levenshtein(str(x[0]), str(y[0])),
)
types_original_pairwise_distance_jaro = scipy.spatial.distance.pdist(
    types_original_reshaped,
    lambda x, y: jaro(str(x[0]), str(y[0])),
)
# The IBM similarity is squared before it is stored.
types_original_pairwise_distance_ibm = scipy.spatial.distance.pdist(
    types_original_reshaped,
    lambda x, y: ibm_similarity(str(x[0]), str(y[0]), ibm_vowels)**2,
)

In [None]:
# Store precalculated data in cache
# `with` guarantees each file is flushed and closed once it has been written.
with open('spanish_types_original_pairwise_distance_levenshtein.pickle', 'wb') as cache_file:
    pickle.dump(types_original_pairwise_distance_levenshtein, cache_file)
with open('spanish_types_original_pairwise_distance_jaro.pickle', 'wb') as cache_file:
    pickle.dump(types_original_pairwise_distance_jaro, cache_file)
with open('spanish_types_original_pairwise_distance_ibm.pickle', 'wb') as cache_file:
    pickle.dump(types_original_pairwise_distance_ibm, cache_file)

In [None]:
%%time

# Expand each condensed pairwise array into a square matrix whose rows and
# columns are labelled with the original types.

original_distance_matrix_levenshtein = pandas.DataFrame(
    scipy.spatial.distance.squareform(types_original_pairwise_distance_levenshtein),
    columns=types_original,
    index=types_original,
)
original_distance_matrix_jaro = pandas.DataFrame(
    scipy.spatial.distance.squareform(types_original_pairwise_distance_jaro),
    columns=types_original,
    index=types_original,
)
original_distance_matrix_ibm = pandas.DataFrame(
    scipy.spatial.distance.squareform(types_original_pairwise_distance_ibm),
    columns=types_original,
    index=types_original,
)

## Evaluation Functions

In [None]:
def eval_expected_n_clusters(model):
    """
    Return a one-cell DataFrame (row 'NoCl', column 'baseline') holding the
    number of clusters in the mapping `model`.
    """
    cluster_count = len(model.items())
    return pandas.DataFrame(
        data=[cluster_count],
        index=['NoCl'],
        columns=['baseline'],
    )
    
def eval_expected_avg_cluster_size(model):
    """
    Return a one-cell DataFrame (row 'AvgClSize', column 'baseline') holding
    the mean number of members per cluster in the mapping `model`.
    """
    sizes = [len(members) for members in model.values()]
    avg_cluster_size = sum(sizes) / len(sizes)

    return pandas.DataFrame(
        data=[avg_cluster_size],
        index=['AvgClSize'],
        columns=['baseline'],
    )

def eval_expected_cluster_similarity_stats(model):
    """
    Summarise the pairwise Jaro similarity inside the evaluation clusters.

    The mean pairwise Jaro value is taken per cluster; the returned
    DataFrame (column 'baseline') holds the NaN-ignoring mean, median,
    standard deviation and variance of those per-cluster means.
    """
    def pair_similarity(x, y):
        return jaro(str(x[0]), str(y[0]))

    cluster_means = []
    for members in model.values():
        column = np.array(members).reshape(-1,1)
        pairwise = scipy.spatial.distance.pdist(column, pair_similarity)
        # The NaN-ignoring statistics below skip clusters whose mean is NaN.
        cluster_means.append(np.mean(pairwise))

    statistics = {
        'ObjSimMean': np.nanmean,
        'ObjSimMedian': np.nanmedian,
        'ObjSimSTD': np.nanstd,
        'ObjSimVAR': np.nanvar,
    }
    return pandas.DataFrame(
        data=[statistic(cluster_means) for statistic in statistics.values()],
        index=list(statistics),
        columns=['baseline'],
    )

def eval_expected_largest_clusters(model, n=10):
    """
    Return the `n` largest clusters of the mapping `model` as a DataFrame
    with the columns 'Length' and 'Tokens', smallest of them first.
    """
    top = sorted(model.values(), key=len)[-n:]

    return pandas.DataFrame(
        data={
            'Length': [len(members) for members in top],
            'Tokens': list(top),
        }
    )

In [None]:
def eval_n_clusters(model):
    """
    Return a one-cell DataFrame (row 'NoCl', column 'actual') holding the
    number of clusters of a fitted clustering model.
    """
    try:
        # Agglomerative Hierarchical Clustering exposes the count directly.
        count = model.n_clusters_
    except AttributeError:
        # Affinity Propagation: one cluster per centre index.
        count = len(model.cluster_centers_indices_)

    return pandas.DataFrame(
        data=[count],
        index=['NoCl'],
        columns=['actual'],
    )

def eval_random_clusters(model, n=10):
    """
    Return `n` randomly chosen clusters of a fitted model as a pandas Series.

    Cluster labels are drawn with replacement, so the same cluster can appear
    more than once. Each entry is the list of original types in that cluster.

    model -- fitted clustering model with a `labels_` attribute
    n     -- number of clusters to draw (default 10)
    """
    rand_clusters = []
    # k=n: the sample size used to be hard-coded to 10, ignoring `n`.
    for cluster_id in random.choices(np.unique(model.labels_), k=n):
        cluster = types_original_reshaped[np.nonzero(model.labels_ == cluster_id)]
        # Flatten the selected one-column rows into a plain list.
        rand_clusters.append([item for sublist in cluster for item in sublist])

    return pandas.Series(rand_clusters)

def eval_largest_clusters(model, n=10):
    """
    Return the `n` largest clusters of a fitted model as a DataFrame with
    the columns 'Length' and 'Tokens', smallest of them first.
    """
    # One flat member list per cluster label.
    members_by_cluster = [
        [
            item
            for row in types_original_reshaped[np.nonzero(model.labels_ == cluster_id)]
            for item in row
        ]
        for cluster_id in np.unique(model.labels_)
    ]
    top = sorted(members_by_cluster, key=len)[-n:]

    return pandas.DataFrame(
        data={
            'Length': [len(members) for members in top],
            'Tokens': list(top),
        }
    )

def eval_cluster_similarity_stats(model):
    """
    Summarise the pairwise Jaro similarity inside the clusters of a fitted model.

    The mean pairwise Jaro value is taken per cluster; the returned
    DataFrame (column 'actual') holds the NaN-ignoring mean, median,
    standard deviation and variance of those per-cluster means.
    """
    def pair_similarity(x, y):
        return jaro(str(x[0]), str(y[0]))

    cluster_means = []
    for cluster_id in np.unique(model.labels_):
        members = types_original_reshaped[np.nonzero(model.labels_ == cluster_id)]
        pairwise = scipy.spatial.distance.pdist(members, pair_similarity)
        # The NaN-ignoring statistics below skip clusters whose mean is NaN.
        cluster_means.append(np.mean(pairwise))

    statistics = {
        'ObjSimMean': np.nanmean,
        'ObjSimMedian': np.nanmedian,
        'ObjSimSTD': np.nanstd,
        'ObjSimVAR': np.nanvar,
    }
    return pandas.DataFrame(
        data=[statistic(cluster_means) for statistic in statistics.values()],
        index=list(statistics),
        columns=['actual'],
    )


    
def eval_avg_cluster_size(model):
    """
    Return a one-cell DataFrame (row 'AvgClSize', column 'actual') holding
    the mean number of items per cluster label in `model.labels_`.
    """
    label_counts = collections.Counter(model.labels_)
    item_total = sum(label_counts.values())
    avg_cluster_size = item_total / len(label_counts)

    return pandas.DataFrame(
        data=[avg_cluster_size],
        index=['AvgClSize'],
        columns=['actual'],
    )

In [None]:
# Baseline figures derived from the evaluation clustering (modernised forms as
# cluster keys); later cells compare every clustering run against them.
expected_stats = eval_expected_cluster_similarity_stats(evaluation_cluster_types)
expected_n_clusters = eval_expected_n_clusters(evaluation_cluster_types)
expected_avg_cluster_size = eval_expected_avg_cluster_size(evaluation_cluster_types)
expected_largest_clusters = eval_expected_largest_clusters(evaluation_cluster_types)

In [None]:
# Cell output: show the largest clusters of the evaluation clustering.
expected_largest_clusters

## Affinity Propagation Clustering

- The damping factor (between 0.5 and 1) is the extent to which the current value is maintained relative to incoming values (which are weighted 1 - damping).
- This is done in order to avoid numerical oscillations when updating these values (messages).

In [None]:
%%time

# Affinity propagation (euclidean affinity) fitted on the Levenshtein matrix.
damping_factor = 0.9

apc_levenshtein_euclidean = sklearn.cluster.AffinityPropagation(
    damping=damping_factor,
    affinity='euclidean',
    random_state=None,
)
apc_levenshtein_euclidean.fit(original_distance_matrix_levenshtein)

# Cell output: the largest clusters of this run.
eval_largest_clusters(apc_levenshtein_euclidean)

In [None]:
# Evaluate the Levenshtein APC run; the 'actual' column is relabelled so the
# frames can sit next to other runs in comparison tables.
apc_levenshtein_largest_clusters = eval_largest_clusters(apc_levenshtein_euclidean)
apc_levenshtein_n_clusters = eval_n_clusters(apc_levenshtein_euclidean).rename(
    columns={'actual': 'apc_levenshtein'})
apc_levenshtein_avg_cluster_size = eval_avg_cluster_size(apc_levenshtein_euclidean).rename(
    columns={'actual': 'apc_levenshtein'})
apc_levenshtein_stats = eval_cluster_similarity_stats(apc_levenshtein_euclidean).rename(
    columns={'actual': 'apc_levenshtein'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, apc_levenshtein_stats],
        [expected_n_clusters, apc_levenshtein_n_clusters],
        [expected_avg_cluster_size, apc_levenshtein_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Affinity propagation (euclidean affinity) fitted on the Jaro matrix.
damping_factor = 0.9

apc_jaro_euclidean = sklearn.cluster.AffinityPropagation(
    damping=damping_factor,
    affinity='euclidean',
    random_state=None,
)
apc_jaro_euclidean.fit(original_distance_matrix_jaro)

# Cell output: the largest clusters of this run.
eval_largest_clusters(apc_jaro_euclidean)

In [None]:
# Evaluate the Jaro APC run; the 'actual' column is relabelled so the frames
# can sit next to other runs in comparison tables.
apc_jaro_largest_clusters = eval_largest_clusters(apc_jaro_euclidean)
apc_jaro_n_clusters = eval_n_clusters(apc_jaro_euclidean).rename(
    columns={'actual': 'apc_jaro'})
apc_jaro_avg_cluster_size = eval_avg_cluster_size(apc_jaro_euclidean).rename(
    columns={'actual': 'apc_jaro'})
apc_jaro_stats = eval_cluster_similarity_stats(apc_jaro_euclidean).rename(
    columns={'actual': 'apc_jaro'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, apc_jaro_stats],
        [expected_n_clusters, apc_jaro_n_clusters],
        [expected_avg_cluster_size, apc_jaro_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Affinity propagation (euclidean affinity) fitted on the IBM matrix.
damping_factor = 0.9

apc_ibm_euclidean = sklearn.cluster.AffinityPropagation(
    damping=damping_factor,
    affinity='euclidean',
    random_state=None,
)
apc_ibm_euclidean.fit(original_distance_matrix_ibm)

# Cell output: the largest clusters of this run.
eval_largest_clusters(apc_ibm_euclidean)

In [None]:
# Evaluate the IBM APC run; the 'actual' column is relabelled so the frames
# can sit next to other runs in comparison tables.
apc_ibm_largest_clusters = eval_largest_clusters(apc_ibm_euclidean)
apc_ibm_n_clusters = eval_n_clusters(apc_ibm_euclidean).rename(
    columns={'actual': 'apc_ibm'})
apc_ibm_avg_cluster_size = eval_avg_cluster_size(apc_ibm_euclidean).rename(
    columns={'actual': 'apc_ibm'})
apc_ibm_stats = eval_cluster_similarity_stats(apc_ibm_euclidean).rename(
    columns={'actual': 'apc_ibm'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, apc_ibm_stats],
        [expected_n_clusters, apc_ibm_n_clusters],
        [expected_avg_cluster_size, apc_ibm_avg_cluster_size],
    )
]).transpose()

### APC Evaluation

- Silhouette Score
- Number of Clusters
- Largest Clusters
- Average Cluster Size
- Average Cluster Object Jaro Similarity

In [None]:
# Silhouette score of each APC labelling, computed with the euclidean metric
# on the same matrix the model was fitted on.
sc_levenshtein_euclidean = sklearn.metrics.silhouette_score(
    original_distance_matrix_levenshtein,
    apc_levenshtein_euclidean.labels_,
    metric='euclidean',
)
sc_jaro_euclidean = sklearn.metrics.silhouette_score(
    original_distance_matrix_jaro,
    apc_jaro_euclidean.labels_,
    metric='euclidean',
)
sc_ibm_euclidean = sklearn.metrics.silhouette_score(
    original_distance_matrix_ibm,
    apc_ibm_euclidean.labels_,
    metric='euclidean',
)

# Cell output: one column per measure.
pandas.DataFrame(
    data={
        'Levenshtein': [sc_levenshtein_euclidean],
        'Jaro': [sc_jaro_euclidean],
        'IBM': [sc_ibm_euclidean],
    },
    index=['silhouette_score'],
)

In [None]:
# Cell output: baseline and all three APC runs side by side, one row per run.
pandas.concat([
    pandas.concat(group, axis=1)
    for group in (
        [expected_stats, apc_levenshtein_stats, apc_jaro_stats, apc_ibm_stats],
        [expected_n_clusters, apc_levenshtein_n_clusters, apc_jaro_n_clusters, apc_ibm_n_clusters],
        [expected_avg_cluster_size, apc_levenshtein_avg_cluster_size, apc_jaro_avg_cluster_size, apc_ibm_avg_cluster_size],
    )
]).transpose()

## Agglomerative Hierarchical Clustering

- Linkage distance threshold above which clusters will not be merged.
- If not None, n_clusters must be None and compute_full_tree must be True.
- Metric used to compute the linkage. Can be “euclidean”, “l1”, “l2”, “manhattan”, “cosine”, or “precomputed”. 
- If linkage is “ward”, only “euclidean” is accepted. 
- If “precomputed”, a distance matrix (instead of a similarity matrix) is needed as input for the fit method.

## AHC Levenshtein

In [None]:
%%time

# Single-linkage agglomerative clustering on the Levenshtein matrix; merging
# is limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'single'
distance_threshold = 25

ahc_levenshtein_single = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_levenshtein_single.fit(original_distance_matrix_levenshtein)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_levenshtein_single)

In [None]:
# Evaluate the single-linkage Levenshtein run; the 'actual' column is
# relabelled so the frames can sit next to other runs in comparison tables.
ahc_levenshtein_single_largest_clusters = eval_largest_clusters(ahc_levenshtein_single)
ahc_levenshtein_single_n_clusters = eval_n_clusters(ahc_levenshtein_single).rename(
    columns={'actual': 'ahc_levenshtein_single'})
ahc_levenshtein_single_avg_cluster_size = eval_avg_cluster_size(ahc_levenshtein_single).rename(
    columns={'actual': 'ahc_levenshtein_single'})
ahc_levenshtein_single_stats = eval_cluster_similarity_stats(ahc_levenshtein_single).rename(
    columns={'actual': 'ahc_levenshtein_single'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_levenshtein_single_stats],
        [expected_n_clusters, ahc_levenshtein_single_n_clusters],
        [expected_avg_cluster_size, ahc_levenshtein_single_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Complete-linkage agglomerative clustering on the Levenshtein matrix; merging
# is limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'complete'
distance_threshold = 27

ahc_levenshtein_complete = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_levenshtein_complete.fit(original_distance_matrix_levenshtein)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_levenshtein_complete)

In [None]:
# Evaluate the complete-linkage Levenshtein run; the 'actual' column is
# relabelled so the frames can sit next to other runs in comparison tables.
ahc_levenshtein_complete_largest_clusters = eval_largest_clusters(ahc_levenshtein_complete)
ahc_levenshtein_complete_n_clusters = eval_n_clusters(ahc_levenshtein_complete).rename(
    columns={'actual': 'ahc_levenshtein_complete'})
ahc_levenshtein_complete_avg_cluster_size = eval_avg_cluster_size(ahc_levenshtein_complete).rename(
    columns={'actual': 'ahc_levenshtein_complete'})
ahc_levenshtein_complete_stats = eval_cluster_similarity_stats(ahc_levenshtein_complete).rename(
    columns={'actual': 'ahc_levenshtein_complete'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_levenshtein_complete_stats],
        [expected_n_clusters, ahc_levenshtein_complete_n_clusters],
        [expected_avg_cluster_size, ahc_levenshtein_complete_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Average-linkage agglomerative clustering on the Levenshtein matrix; merging
# is limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'average'
distance_threshold = 25

ahc_levenshtein_average = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_levenshtein_average.fit(original_distance_matrix_levenshtein)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_levenshtein_average)

In [None]:
# Evaluate the average-linkage Levenshtein run; the 'actual' column is
# relabelled so the frames can sit next to other runs in comparison tables.
ahc_levenshtein_average_largest_clusters = eval_largest_clusters(ahc_levenshtein_average)
ahc_levenshtein_average_n_clusters = eval_n_clusters(ahc_levenshtein_average).rename(
    columns={'actual': 'ahc_levenshtein_average'})
ahc_levenshtein_average_avg_cluster_size = eval_avg_cluster_size(ahc_levenshtein_average).rename(
    columns={'actual': 'ahc_levenshtein_average'})
ahc_levenshtein_average_stats = eval_cluster_similarity_stats(ahc_levenshtein_average).rename(
    columns={'actual': 'ahc_levenshtein_average'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_levenshtein_average_stats],
        [expected_n_clusters, ahc_levenshtein_average_n_clusters],
        [expected_avg_cluster_size, ahc_levenshtein_average_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Ward-linkage agglomerative clustering on the Levenshtein matrix; merging
# is limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'ward'
distance_threshold = 25

ahc_levenshtein_ward = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_levenshtein_ward.fit(original_distance_matrix_levenshtein)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_levenshtein_ward)

In [None]:
# Evaluate the Ward-linkage Levenshtein run; the 'actual' column is
# relabelled so the frames can sit next to other runs in comparison tables.
ahc_levenshtein_ward_largest_clusters = eval_largest_clusters(ahc_levenshtein_ward)
ahc_levenshtein_ward_n_clusters = eval_n_clusters(ahc_levenshtein_ward).rename(
    columns={'actual': 'ahc_levenshtein_ward'})
ahc_levenshtein_ward_avg_cluster_size = eval_avg_cluster_size(ahc_levenshtein_ward).rename(
    columns={'actual': 'ahc_levenshtein_ward'})
ahc_levenshtein_ward_stats = eval_cluster_similarity_stats(ahc_levenshtein_ward).rename(
    columns={'actual': 'ahc_levenshtein_ward'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_levenshtein_ward_stats],
        [expected_n_clusters, ahc_levenshtein_ward_n_clusters],
        [expected_avg_cluster_size, ahc_levenshtein_ward_avg_cluster_size],
    )
]).transpose()

## AHC Levenshtein Evaluation

- Number of Clusters
- Largest Clusters
- Average Cluster Size (Length of elements)
- Average Cluster Object Jaro Similarity

In [None]:
# Silhouette score of each AHC labelling on the Levenshtein matrix, computed
# with the euclidean metric.
sc_levenshtein_single = sklearn.metrics.silhouette_score(
    original_distance_matrix_levenshtein,
    ahc_levenshtein_single.labels_,
    metric='euclidean',
)
sc_levenshtein_complete = sklearn.metrics.silhouette_score(
    original_distance_matrix_levenshtein,
    ahc_levenshtein_complete.labels_,
    metric='euclidean',
)
sc_levenshtein_average = sklearn.metrics.silhouette_score(
    original_distance_matrix_levenshtein,
    ahc_levenshtein_average.labels_,
    metric='euclidean',
)
sc_levenshtein_ward = sklearn.metrics.silhouette_score(
    original_distance_matrix_levenshtein,
    ahc_levenshtein_ward.labels_,
    metric='euclidean',
)

# Cell output: one column per linkage method.
pandas.DataFrame(
    data={
        'AHC (single)': [sc_levenshtein_single],
        'AHC (complete)': [sc_levenshtein_complete],
        'AHC (average)': [sc_levenshtein_average],
        'AHC (ward)': [sc_levenshtein_ward],
    },
    index=['silhouette_score'],
)

In [None]:
# Cell output: baseline and all four Levenshtein AHC runs side by side,
# one row per run.
pandas.concat([
    pandas.concat(group, axis=1)
    for group in (
        [expected_stats, ahc_levenshtein_single_stats, ahc_levenshtein_complete_stats, ahc_levenshtein_average_stats, ahc_levenshtein_ward_stats],
        [expected_n_clusters, ahc_levenshtein_single_n_clusters, ahc_levenshtein_complete_n_clusters, ahc_levenshtein_average_n_clusters, ahc_levenshtein_ward_n_clusters],
        [expected_avg_cluster_size, ahc_levenshtein_single_avg_cluster_size, ahc_levenshtein_complete_avg_cluster_size, ahc_levenshtein_average_avg_cluster_size, ahc_levenshtein_ward_avg_cluster_size],
    )
]).transpose()

## AHC Jaro

In [None]:
%%time

# Single-linkage agglomerative clustering on the Jaro matrix; merging is
# limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'single'
distance_threshold = 5.0

ahc_jaro_single = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_jaro_single.fit(original_distance_matrix_jaro)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_jaro_single)

In [None]:
# Evaluate the single-linkage Jaro run; the 'actual' column is relabelled so
# the frames can sit next to other runs in comparison tables.
ahc_jaro_single_largest_clusters = eval_largest_clusters(ahc_jaro_single)
ahc_jaro_single_n_clusters = eval_n_clusters(ahc_jaro_single).rename(
    columns={'actual': 'ahc_jaro_single'})
ahc_jaro_single_avg_cluster_size = eval_avg_cluster_size(ahc_jaro_single).rename(
    columns={'actual': 'ahc_jaro_single'})
ahc_jaro_single_stats = eval_cluster_similarity_stats(ahc_jaro_single).rename(
    columns={'actual': 'ahc_jaro_single'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_jaro_single_stats],
        [expected_n_clusters, ahc_jaro_single_n_clusters],
        [expected_avg_cluster_size, ahc_jaro_single_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Complete-linkage agglomerative clustering on the Jaro matrix; merging is
# limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'complete'
distance_threshold = 5.5

ahc_jaro_complete = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_jaro_complete.fit(original_distance_matrix_jaro)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_jaro_complete)

In [None]:
# Evaluate the complete-linkage Jaro run; the 'actual' column is relabelled
# so the frames can sit next to other runs in comparison tables.
ahc_jaro_complete_largest_clusters = eval_largest_clusters(ahc_jaro_complete)
ahc_jaro_complete_n_clusters = eval_n_clusters(ahc_jaro_complete).rename(
    columns={'actual': 'ahc_jaro_complete'})
ahc_jaro_complete_avg_cluster_size = eval_avg_cluster_size(ahc_jaro_complete).rename(
    columns={'actual': 'ahc_jaro_complete'})
ahc_jaro_complete_stats = eval_cluster_similarity_stats(ahc_jaro_complete).rename(
    columns={'actual': 'ahc_jaro_complete'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_jaro_complete_stats],
        [expected_n_clusters, ahc_jaro_complete_n_clusters],
        [expected_avg_cluster_size, ahc_jaro_complete_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Average-linkage agglomerative clustering on the Jaro matrix; merging is
# limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'average'
distance_threshold = 5.5

ahc_jaro_average = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_jaro_average.fit(original_distance_matrix_jaro)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_jaro_average)

In [None]:
# Evaluate the average-linkage Jaro run; the 'actual' column is relabelled
# so the frames can sit next to other runs in comparison tables.
ahc_jaro_average_largest_clusters = eval_largest_clusters(ahc_jaro_average)
ahc_jaro_average_n_clusters = eval_n_clusters(ahc_jaro_average).rename(
    columns={'actual': 'ahc_jaro_average'})
ahc_jaro_average_avg_cluster_size = eval_avg_cluster_size(ahc_jaro_average).rename(
    columns={'actual': 'ahc_jaro_average'})
ahc_jaro_average_stats = eval_cluster_similarity_stats(ahc_jaro_average).rename(
    columns={'actual': 'ahc_jaro_average'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_jaro_average_stats],
        [expected_n_clusters, ahc_jaro_average_n_clusters],
        [expected_avg_cluster_size, ahc_jaro_average_avg_cluster_size],
    )
]).transpose()

In [None]:
%%time

# Ward-linkage agglomerative clustering on the Jaro matrix; merging is
# limited by the linkage distance threshold instead of a cluster count.
linkage_method = 'ward'
distance_threshold = 5.5

ahc_jaro_ward = sklearn.cluster.AgglomerativeClustering(
    linkage=linkage_method,
    affinity='euclidean',
    distance_threshold=distance_threshold,
    n_clusters=None,
)
ahc_jaro_ward.fit(original_distance_matrix_jaro)

# Cell output: the largest clusters of this run.
eval_largest_clusters(ahc_jaro_ward)

In [None]:
# Evaluate the Ward-linkage Jaro run; the 'actual' column is relabelled so
# the frames can sit next to other runs in comparison tables.
ahc_jaro_ward_largest_clusters = eval_largest_clusters(ahc_jaro_ward)
ahc_jaro_ward_n_clusters = eval_n_clusters(ahc_jaro_ward).rename(
    columns={'actual': 'ahc_jaro_ward'})
ahc_jaro_ward_avg_cluster_size = eval_avg_cluster_size(ahc_jaro_ward).rename(
    columns={'actual': 'ahc_jaro_ward'})
ahc_jaro_ward_stats = eval_cluster_similarity_stats(ahc_jaro_ward).rename(
    columns={'actual': 'ahc_jaro_ward'})

# Cell output: baseline next to this run, one row per column label.
pandas.concat([
    pandas.concat(pair, axis=1)
    for pair in (
        [expected_stats, ahc_jaro_ward_stats],
        [expected_n_clusters, ahc_jaro_ward_n_clusters],
        [expected_avg_cluster_size, ahc_jaro_ward_avg_cluster_size],
    )
]).transpose()

## AHC Jaro Evaluation

In [None]:
# Silhouette score of each AHC labelling on the Jaro matrix, computed with
# the euclidean metric.
sc_jaro_single = sklearn.metrics.silhouette_score(
    original_distance_matrix_jaro,
    ahc_jaro_single.labels_,
    metric='euclidean',
)
sc_jaro_complete = sklearn.metrics.silhouette_score(
    original_distance_matrix_jaro,
    ahc_jaro_complete.labels_,
    metric='euclidean',
)
sc_jaro_average = sklearn.metrics.silhouette_score(
    original_distance_matrix_jaro,
    ahc_jaro_average.labels_,
    metric='euclidean',
)
sc_jaro_ward = sklearn.metrics.silhouette_score(
    original_distance_matrix_jaro,
    ahc_jaro_ward.labels_,
    metric='euclidean',
)

# Cell output: one column per linkage method.
pandas.DataFrame(
    data={
        'AHC (single)': [sc_jaro_single],
        'AHC (complete)': [sc_jaro_complete],
        'AHC (average)': [sc_jaro_average],
        'AHC (ward)': [sc_jaro_ward],
    },
    index=['silhouette_score'],
)

In [None]:
# Cell output: baseline and all four Jaro AHC runs side by side,
# one row per run.
pandas.concat([
    pandas.concat(group, axis=1)
    for group in (
        [expected_stats, ahc_jaro_single_stats, ahc_jaro_complete_stats, ahc_jaro_average_stats, ahc_jaro_ward_stats],
        [expected_n_clusters, ahc_jaro_single_n_clusters, ahc_jaro_complete_n_clusters, ahc_jaro_average_n_clusters, ahc_jaro_ward_n_clusters],
        [expected_avg_cluster_size, ahc_jaro_single_avg_cluster_size, ahc_jaro_complete_avg_cluster_size, ahc_jaro_average_avg_cluster_size, ahc_jaro_ward_avg_cluster_size],
    )
]).transpose()

## AHC IBM

In [None]:
%%time

# Parameters
linkage_method = 'single'
distance_threshold = 0.22

# Calculation
ahc_ibm_single = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None, 
    distance_threshold=distance_threshold, 
    affinity='euclidean', 
    linkage=linkage_method).fit(original_distance_matrix_ibm)

eval_largest_clusters(ahc_ibm_single)

In [None]:
# Evaluation of the IBM/single model
ahc_ibm_single_largest_clusters = eval_largest_clusters(ahc_ibm_single)

# Each evaluation result has its generic 'actual' column renamed after the
# model so that it can be told apart in the combined tables.
ahc_ibm_single_n_clusters = eval_n_clusters(ahc_ibm_single).rename(
    columns={'actual': 'ahc_ibm_single'})

ahc_ibm_single_avg_cluster_size = eval_avg_cluster_size(ahc_ibm_single).rename(
    columns={'actual': 'ahc_ibm_single'})

ahc_ibm_single_stats = eval_cluster_similarity_stats(ahc_ibm_single).rename(
    columns={'actual': 'ahc_ibm_single'})

# Expected values next to the model's values, transposed for display
pandas.concat([
    pandas.concat([expected_stats, ahc_ibm_single_stats], axis=1),
    pandas.concat([expected_n_clusters, ahc_ibm_single_n_clusters], axis=1),
    pandas.concat(
        [expected_avg_cluster_size, ahc_ibm_single_avg_cluster_size], axis=1),
]).T

In [None]:
%%time

# Parameters
linkage_method = 'complete'
distance_threshold = 0.3

# Calculation
ahc_ibm_complete = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None, 
    distance_threshold=distance_threshold, 
    affinity='euclidean', 
    linkage=linkage_method).fit(original_distance_matrix_ibm)

eval_largest_clusters(ahc_ibm_complete)

In [None]:
# Evaluation of the IBM/complete model
ahc_ibm_complete_largest_clusters = eval_largest_clusters(ahc_ibm_complete)

# Each evaluation result has its generic 'actual' column renamed after the
# model so that it can be told apart in the combined tables.
ahc_ibm_complete_n_clusters = eval_n_clusters(ahc_ibm_complete).rename(
    columns={'actual': 'ahc_ibm_complete'})

ahc_ibm_complete_avg_cluster_size = eval_avg_cluster_size(ahc_ibm_complete).rename(
    columns={'actual': 'ahc_ibm_complete'})

ahc_ibm_complete_stats = eval_cluster_similarity_stats(ahc_ibm_complete).rename(
    columns={'actual': 'ahc_ibm_complete'})

# Expected values next to the model's values, transposed for display
pandas.concat([
    pandas.concat([expected_stats, ahc_ibm_complete_stats], axis=1),
    pandas.concat([expected_n_clusters, ahc_ibm_complete_n_clusters], axis=1),
    pandas.concat(
        [expected_avg_cluster_size, ahc_ibm_complete_avg_cluster_size], axis=1),
]).T

In [None]:
%%time

# Parameters
linkage_method = 'average'
distance_threshold = 0.8

# Calculation
ahc_ibm_average = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None, 
    distance_threshold=distance_threshold, 
    affinity='euclidean', 
    linkage=linkage_method).fit(original_distance_matrix_ibm)

eval_largest_clusters(ahc_ibm_average)

In [None]:
# Evaluation of the IBM/average model
ahc_ibm_average_largest_clusters = eval_largest_clusters(ahc_ibm_average)

# Each evaluation result has its generic 'actual' column renamed after the
# model so that it can be told apart in the combined tables.
ahc_ibm_average_n_clusters = eval_n_clusters(ahc_ibm_average)
ahc_ibm_average_n_clusters = ahc_ibm_average_n_clusters.rename(columns={'actual': 'ahc_ibm_average'})

ahc_ibm_average_avg_cluster_size = eval_avg_cluster_size(ahc_ibm_average)
ahc_ibm_average_avg_cluster_size = ahc_ibm_average_avg_cluster_size.rename(columns={'actual': 'ahc_ibm_average'})

ahc_ibm_average_stats = eval_cluster_similarity_stats(ahc_ibm_average)
ahc_ibm_average_stats = ahc_ibm_average_stats.rename(columns={'actual': 'ahc_ibm_average'})

# Expected values next to the IBM/average values computed in this cell
# (the table must use the ahc_ibm_average_* frames, like the sibling
# IBM cells do for their own model), transposed for display.
pandas.concat([
    pandas.concat([expected_stats, ahc_ibm_average_stats], axis=1),
    pandas.concat([expected_n_clusters, ahc_ibm_average_n_clusters], axis=1),
    pandas.concat([expected_avg_cluster_size, ahc_ibm_average_avg_cluster_size], axis=1)
]).transpose()

In [None]:
%%time

# Parameters
linkage_method = 'ward'
distance_threshold = 0.3

# Calculation
ahc_ibm_ward = sklearn.cluster.AgglomerativeClustering(
    n_clusters=None, 
    distance_threshold=distance_threshold, 
    affinity='euclidean', 
    linkage=linkage_method).fit(original_distance_matrix_ibm)

eval_largest_clusters(ahc_ibm_ward)

In [None]:
# Evaluation of the IBM/ward model
ahc_ibm_ward_largest_clusters = eval_largest_clusters(ahc_ibm_ward)

# Each evaluation result has its generic 'actual' column renamed after the
# model so that it can be told apart in the combined tables.
ahc_ibm_ward_n_clusters = eval_n_clusters(ahc_ibm_ward).rename(
    columns={'actual': 'ahc_ibm_ward'})

ahc_ibm_ward_avg_cluster_size = eval_avg_cluster_size(ahc_ibm_ward).rename(
    columns={'actual': 'ahc_ibm_ward'})

ahc_ibm_ward_stats = eval_cluster_similarity_stats(ahc_ibm_ward).rename(
    columns={'actual': 'ahc_ibm_ward'})

# Expected values next to the model's values, transposed for display
pandas.concat([
    pandas.concat([expected_stats, ahc_ibm_ward_stats], axis=1),
    pandas.concat([expected_n_clusters, ahc_ibm_ward_n_clusters], axis=1),
    pandas.concat(
        [expected_avg_cluster_size, ahc_ibm_ward_avg_cluster_size], axis=1),
]).T

## AHC IBM Evaluation

In [None]:
# Silhouette score of every IBM-based AHC model, computed on the same
# matrix (and with the same metric) the models were fitted on.
sc_ibm_single = sklearn.metrics.silhouette_score(
    original_distance_matrix_ibm, ahc_ibm_single.labels_, metric='euclidean')
sc_ibm_complete = sklearn.metrics.silhouette_score(
    original_distance_matrix_ibm, ahc_ibm_complete.labels_, metric='euclidean')
sc_ibm_average = sklearn.metrics.silhouette_score(
    original_distance_matrix_ibm, ahc_ibm_average.labels_, metric='euclidean')
sc_ibm_ward = sklearn.metrics.silhouette_score(
    original_distance_matrix_ibm, ahc_ibm_ward.labels_, metric='euclidean')

# One-row table: one column per linkage method
pandas.DataFrame(
    {
        'AHC (single)': [sc_ibm_single],
        'AHC (complete)': [sc_ibm_complete],
        'AHC (average)': [sc_ibm_average],
        'AHC (ward)': [sc_ibm_ward],
    },
    index=['silhouette_score'],
)

In [None]:
# Side-by-side comparison of all four IBM-based AHC models with the
# expected values, transposed for display.
pandas.concat([
    pandas.concat([
        expected_stats,
        ahc_ibm_single_stats,
        ahc_ibm_complete_stats,
        ahc_ibm_average_stats,
        ahc_ibm_ward_stats,
    ], axis=1),
    pandas.concat([
        expected_n_clusters,
        ahc_ibm_single_n_clusters,
        ahc_ibm_complete_n_clusters,
        ahc_ibm_average_n_clusters,
        ahc_ibm_ward_n_clusters,
    ], axis=1),
    pandas.concat([
        expected_avg_cluster_size,
        ahc_ibm_single_avg_cluster_size,
        ahc_ibm_complete_avg_cluster_size,
        ahc_ibm_average_avg_cluster_size,
        ahc_ibm_ward_avg_cluster_size,
    ], axis=1),
]).T

# Summary

In [None]:
# Overall summary: the expected values followed by every AHC model
# (Levenshtein, Jaro and IBM distances x single/complete/average/ward).
# The three inner concats join the similarity stats, the cluster counts and
# the average cluster sizes column-wise; the outer concat stacks them and the
# result is transposed for display.
pandas.concat([
    pandas.concat([expected_stats,
        ahc_levenshtein_single_stats, 
        ahc_levenshtein_complete_stats, 
        ahc_levenshtein_average_stats, 
        ahc_levenshtein_ward_stats,
        ahc_jaro_single_stats, 
        ahc_jaro_complete_stats, 
        ahc_jaro_average_stats, 
        ahc_jaro_ward_stats,
        ahc_ibm_single_stats, 
        ahc_ibm_complete_stats, 
        ahc_ibm_average_stats, 
        ahc_ibm_ward_stats
    ], axis=1),
    pandas.concat([expected_n_clusters, 
        ahc_levenshtein_single_n_clusters, 
        ahc_levenshtein_complete_n_clusters, 
        ahc_levenshtein_average_n_clusters, 
        ahc_levenshtein_ward_n_clusters,
        ahc_jaro_single_n_clusters, 
        ahc_jaro_complete_n_clusters, 
        ahc_jaro_average_n_clusters, 
        ahc_jaro_ward_n_clusters,
        ahc_ibm_single_n_clusters, 
        ahc_ibm_complete_n_clusters, 
        ahc_ibm_average_n_clusters, 
        ahc_ibm_ward_n_clusters
    ], axis=1),    
    pandas.concat([expected_avg_cluster_size,
        ahc_levenshtein_single_avg_cluster_size, 
        ahc_levenshtein_complete_avg_cluster_size, 
        ahc_levenshtein_average_avg_cluster_size, 
        ahc_levenshtein_ward_avg_cluster_size,
        ahc_jaro_single_avg_cluster_size, 
        ahc_jaro_complete_avg_cluster_size, 
        ahc_jaro_average_avg_cluster_size, 
        ahc_jaro_ward_avg_cluster_size,
        ahc_ibm_single_avg_cluster_size, 
        ahc_ibm_complete_avg_cluster_size, 
        ahc_ibm_average_avg_cluster_size, 
        ahc_ibm_ward_avg_cluster_size
    ], axis=1)    
]).transpose()

# JSON Export

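Transforms the clusterings into JSON files (one file per model). Note that in the hierarchical export only the leaves and the outer `"root"` wrapper actually carry a `name`; inner nodes only have `distance` and `children`.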

Example Agglomerative Hierarchical output:
```
{
  "name": "Lorem ipsum",
  "distance": 5.5,
  "children": [
    {
      "name": "dolor",
      "distance": 1.5,
      "children": [
        {
          "name": "sit",
          "distance": 0.0,
          "children": []
        },
...
}
```

Example Affinity Propagation output:

```
[
 [
 "lorem",
 "ipsum"
 ],
 [
 "dolor",
 "sit"
 ]
]
```

In [None]:
# Lookup table: position of a type in `types_original` -> the type itself.
# Used to turn leaf node ids of the cluster tree back into words.
LABELS = {node_id: label for node_id, label in enumerate(types_original)}

def add_nodes(node, parent):
    """
    Build the tree below `node` as nested dicts and attach it to `parent`

    Every visited node becomes a dict with the keys `node_id`, `children`
    and `distance` that is appended to the `children` list of the dict
    created for its parent (for `node` itself: to `parent['children']`).
    The left child is always appended before the right child.

    node   -- tree node exposing `id`, `dist`, `left` and `right`
    parent -- dict with a `children` list; it is modified in place

    Nothing is returned.
    """
    # An explicit stack is used instead of recursion, so the depth of the
    # tree is not bounded by the interpreter's recursion limit.
    pending = [(node, parent)]
    while pending:
        current, target = pending.pop()
        new_node = dict(node_id=current.id, children=[], distance=current.dist)
        target['children'].append(new_node)
        # Push right first: the left subtree is then popped (and therefore
        # appended to new_node['children']) before the right one.
        if current.right:
            pending.append((current.right, new_node))
        if current.left:
            pending.append((current.left, new_node))

def add_labels(node):
    """
    Add labels to the leaves of the tree and drop the internal node ids

    Walks the tree rooted at `node` (nested dicts with a `children` list
    and a `node_id`). Every node without children gets a `name` looked up
    in LABELS by its `node_id`; afterwards `node_id` is removed from every
    node, leaves and inner nodes alike. The tree is modified in place and
    nothing is returned.
    """
    # Explicit stack instead of recursion, so the depth of the tree is not
    # bounded by the interpreter's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        children = current['children']
        if children:
            pending.extend(children)
        else:
            current['name'] = LABELS[current['node_id']]
        del current['node_id']

In [None]:
def create_linkage_matrix(model):
    """
    Creates a scipy-compatible linkage matrix,
    so that the hierarchy.to_tree can be used

    model -- fitted clustering model exposing `children_` (one row of
             child indices per merge), `distances_` (one value per merge)
             and `labels_` (one entry per sample)

    Returns a float array with one row per merge, made of the columns of
    `children_`, followed by the merge distance and the number of samples
    below the merged node.
    """
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])

    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Indices >= n_samples refer to the node created by an
                # earlier merge; reuse the count stored for that merge.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([
        model.children_,
        model.distances_,
        counts]).astype(float)

    return linkage_matrix

In [None]:
def export_ahc_model(model, file_name):
    """
    Export an AHC model to JSON

    model     -- fitted clustering model, passed on to create_linkage_matrix
    file_name -- path of the JSON file to write

    Nothing is done if `file_name` already exists, so an earlier export is
    never overwritten. The written document is a wrapper node named 'root'
    whose single child is the root of the cluster tree; leaves carry the
    `name` taken from LABELS.
    """
    # Imported explicitly: the notebook's import cell only loads `scipy` and
    # `scipy.spatial`, so `scipy.cluster.hierarchy` would otherwise only be
    # available if another package happened to import it first.
    import scipy.cluster.hierarchy

    if os.path.exists(file_name):
        return

    linkage_matrix = create_linkage_matrix(model)
    # rd=False: only the root node of the tree is returned
    scipy_tree = scipy.cluster.hierarchy.to_tree(linkage_matrix, rd=False)

    tree = dict(name='root', children=[], distance=scipy_tree.dist)

    add_nodes(scipy_tree, tree)
    # The wrapper itself has no node_id; labelling starts at its only child.
    add_labels(tree['children'][0])

    with open(file_name, 'w') as fp_clustering:
        json.dump(tree, fp_clustering, indent=1)

In [None]:
def export_apc_model(model, file_name):
    """
    Export an APC model to JSON

    model     -- fitted clustering model exposing `labels_` (one cluster
                 id per entry of `types_original_reshaped`)
    file_name -- path of the JSON file to write

    Nothing is done if `file_name` already exists, so an earlier export is
    never overwritten. The written document is a list with one list of
    members per cluster, ordered by cluster id.
    """

    if os.path.exists(file_name):
        return

    labels = np.asarray(model.labels_)
    clusters = []

    # Visit every cluster id exactly once. Looping over `labels_` itself
    # would yield one entry per sample and therefore write each cluster as
    # many times as it has members.
    for cluster_id in np.unique(labels):
        members = types_original_reshaped[np.nonzero(labels == cluster_id)]
        # Each selected row is itself a sequence; flatten rows into one list
        clusters.append([item for row in members for item in row])

    with open(file_name, 'w') as fp_clustering:
        json.dump(clusters, fp_clustering, indent=1)

In [None]:
# Write one JSON file per affinity propagation model (Levenshtein, Jaro, IBM)
export_apc_model(
    model=apc_levenshtein_euclidean,
    file_name=CORPUS_NAME + '-apc_levenshtein.json')
export_apc_model(
    model=apc_jaro_euclidean,
    file_name=CORPUS_NAME + '-apc_jaro.json')
export_apc_model(
    model=apc_ibm_euclidean,
    file_name=CORPUS_NAME + '-apc_ibm.json')

In [None]:
import sys
# Raise the interpreter's recursion limit before the AHC exports.
# NOTE(review): presumably needed because deep cluster trees are processed
# and serialised recursively during the export - confirm. Be aware that a
# very high limit can crash the interpreter instead of raising RecursionError.
sys.setrecursionlimit(10**6)

In [None]:
# Write one JSON file per Levenshtein-based AHC model (one per linkage method)
export_ahc_model(
    model=ahc_levenshtein_single,
    file_name=CORPUS_NAME + '-ahc_levenshtein_single.json')
export_ahc_model(
    model=ahc_levenshtein_complete,
    file_name=CORPUS_NAME + '-ahc_levenshtein_complete.json')
export_ahc_model(
    model=ahc_levenshtein_average,
    file_name=CORPUS_NAME + '-ahc_levenshtein_average.json')
export_ahc_model(
    model=ahc_levenshtein_ward,
    file_name=CORPUS_NAME + '-ahc_levenshtein_ward.json')

In [None]:
# Write one JSON file per Jaro-based AHC model (one per linkage method)
export_ahc_model(
    model=ahc_jaro_single,
    file_name=CORPUS_NAME + '-ahc_jaro_single.json')
export_ahc_model(
    model=ahc_jaro_complete,
    file_name=CORPUS_NAME + '-ahc_jaro_complete.json')
export_ahc_model(
    model=ahc_jaro_average,
    file_name=CORPUS_NAME + '-ahc_jaro_average.json')
export_ahc_model(
    model=ahc_jaro_ward,
    file_name=CORPUS_NAME + '-ahc_jaro_ward.json')

In [None]:
# Write one JSON file per IBM-based AHC model (one per linkage method)
export_ahc_model(
    model=ahc_ibm_single,
    file_name=CORPUS_NAME + '-ahc_ibm_single.json')
export_ahc_model(
    model=ahc_ibm_complete,
    file_name=CORPUS_NAME + '-ahc_ibm_complete.json')
export_ahc_model(
    model=ahc_ibm_average,
    file_name=CORPUS_NAME + '-ahc_ibm_average.json')
export_ahc_model(
    model=ahc_ibm_ward,
    file_name=CORPUS_NAME + '-ahc_ibm_ward.json')