In [None]:
import numpy as np
import scipy
import sklearn.decomposition
import sklearn.cluster
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import pandas
import random
import os
import collections

In [None]:
# Corpus location and metadata
INPUT = '../../histnorm/datasets/historical/german/german-anselm.test.txt'
ENCODING = 'utf-8'
CORPUS_NAME = 'german-anselm'

# Every line of the input file holds an original token and its modernised
# form, separated by a tab; each line becomes one list of fields.
with open(INPUT, mode='r', encoding=ENCODING) as infile:
    tokens = []
    for line in infile:
        tokens.append(line.strip().split('\t'))

In [None]:
# Getting the original and modernised tokens and types.
# Only the first MAX_TOKENS lines of the corpus are used.
MAX_TOKENS = 5000
tokens_original = [token[0] for token in tokens[:MAX_TOKENS]]
tokens_modernised = [token[1] for token in tokens[:MAX_TOKENS]]

# Types are the distinct tokens. They are sorted because the iteration order
# of a set of strings is not stable across interpreter sessions, which would
# change the row/column order of every matrix built from these lists.
types_original = sorted(set(tokens_original))
types_modernised = sorted(set(tokens_modernised))

In [None]:
# Levenshtein Distance
def levenshtein(string1, string2):
    """Return the Levenshtein edit distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``string1`` into ``string2``.
    Identical strings have distance 0; if exactly one string is empty the
    distance is the length of the other one.
    """
    if string1 == string2:
        return 0

    # Exactly one of the two can be empty here (both empty is handled above).
    if not string1 or not string2:
        return max(len(string1), len(string2))

    # Only the previous row of the dynamic-programming table is needed.
    previous = list(range(len(string2) + 1))
    for position1, char1 in enumerate(string1, start=1):
        current = [position1]
        for position2, char2 in enumerate(string2, start=1):
            deletion = previous[position2] + 1
            insertion = current[position2 - 1] + 1
            substitution = previous[position2 - 1] + (0 if char1 == char2 else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current

    return previous[-1]

assert levenshtein('', '') == 0
assert levenshtein('foobar', 'foobar') == 0
assert levenshtein('foobar', 'foubar') == 1
assert levenshtein('foobar', 'fuubar') == 2
assert levenshtein('foobar', 'fuuar') == 3
assert levenshtein('foobar', '') == 6

In [None]:
# Jaro Similarity
def jaro(string1, string2):
    """Return the Jaro similarity of two strings as a float in [0, 1].

    Returns 0.0 when ``string1`` is empty (even if ``string2`` is empty too)
    or when no characters match, and 1.0 for identical non-empty strings.
    """
    size1, size2 = len(string1), len(string2)

    if size1 == 0:
        return 0.0

    if string1 == string2:
        return 1.0

    # Characters only count as matching when their positions differ by at
    # most this many places.
    window = max(size1, size2) // 2 - 1

    taken = [False] * size2      # positions of string2 already matched
    matched_from_1 = []          # matched characters, in string1 order

    for position1, char1 in enumerate(string1):
        start = max(0, position1 - window)
        stop = min(position1 + window, size2 - 1) + 1
        for position2 in range(start, stop):
            if not taken[position2] and char1 == string2[position2]:
                taken[position2] = True
                matched_from_1.append(char1)
                break

    matches = len(matched_from_1)
    if matches == 0:
        return 0.0

    # Matched characters in string2 order; every pair that disagrees with the
    # string1 order counts towards the transpositions.
    matched_from_2 = [char2 for char2, used in zip(string2, taken) if used]
    transpositions = sum(1 for a, b in zip(matched_from_1, matched_from_2) if a != b)

    return (1/3 * ( matches / size1 + matches / size2 + (matches - transpositions // 2) / matches))

assert jaro('', '') == 0.0
assert jaro('foobar', '') == 0.0
assert jaro('foobar', 'foobar') == 1.0
assert jaro('foobar', 'barfoo') == 0.4444444444444444
assert jaro('duane', 'dwayne') == 0.8222222222222222
assert jaro('hans', 'gruber') == 0.0

In [None]:
# IBM (LCS-Levenshtein Normalized)

# Contractor, D., Faruquie, T. A., & Subramaniam, L. V. (2010, August). 
# Unsupervised cleansing of noisy text. 
# In Proceedings of the 23rd International Conference on Computational Linguistics:
# Posters (pp. 189-196). Association for Computational Linguistics.

from itertools import groupby

# Longest Common Substring
def longest_common_string(string1, string2):
    """Return the length of the longest common (contiguous) substring.

    Parameters
    ----------
    string1, string2 : str
        The two strings to compare.

    Returns
    -------
    int
        Length of the longest run of characters that occurs in both strings;
        0 if either string is empty.
    """
    if string1 == string2:
        return len(string1)

    if not string1 or not string2:
        return 0

    longest = 0
    # previous[j] is the length of the common suffix of the prefix of string1
    # processed so far and string2[:j]. Index 0 stands for the empty prefix
    # and stays 0, so no lookup can wrap around through a negative index.
    previous = [0] * (len(string2) + 1)
    for char1 in string1:
        current = [0]
        for position2, char2 in enumerate(string2, start=1):
            run = previous[position2 - 1] + 1 if char1 == char2 else 0
            current.append(run)
            if run > longest:
                longest = run
        previous = current

    return longest

assert longest_common_string('', '') == 0
assert longest_common_string('foobar', '') == 0
assert longest_common_string('foobar', 'foobar') == 6
assert longest_common_string('foobar', 'foo') == 3
assert longest_common_string('foobar', 'f') == 1
# Regression checks: the result can never exceed the shorter string's length
assert longest_common_string('ab', 'b') == 1
assert longest_common_string('a', 'aa') == 1


def lcs_ratio(string1, string2):
    """Return the longest-common-substring length divided by len(string1).

    The ratio is normalised by the first argument only, so it is not
    symmetric. Returns 0.0 if either string is empty.
    NOTE(review): the cited paper may define LCS as the longest common
    subsequence rather than substring; confirm which one is intended.
    """
    if string1 and string2:
        return longest_common_string(string1, string2) / len(string1)
    return 0.0

assert lcs_ratio('', '') == 0.0
assert lcs_ratio('foo', '') == 0.0
assert lcs_ratio('foobar', 'foobar') == 1.0
assert lcs_ratio('foo', 'bar') == 0.0
assert lcs_ratio('word', 'deoxyribonucleic') == 0.25


def consonant_skeleton(string, vowels='aeiouy'):
    """Return ``string`` without vowels and without repeated neighbours.

    Characters contained in ``vowels`` are dropped first; afterwards every
    run of identical adjacent characters is collapsed to a single one.
    The comparison is case-sensitive.
    """
    skeleton = []
    for char in string:
        if char in vowels:
            continue
        # Skip a character that repeats the one kept just before it.
        if skeleton and skeleton[-1] == char:
            continue
        skeleton.append(char)
    return ''.join(skeleton)

assert consonant_skeleton('') == ''
assert consonant_skeleton('aeio') == ''
assert consonant_skeleton('foobar') == 'fbr'
assert consonant_skeleton('ffoobbar') == 'fbr'
assert consonant_skeleton('barfoobar') == 'brfbr'


def ibm_similarity(string1, string2):
    """Return the LCS ratio damped by the consonant-skeleton edit distance.

    The value is lcs_ratio(string1, string2) divided by one plus the
    Levenshtein distance between the two consonant skeletons.
    """
    skeleton_distance = levenshtein(consonant_skeleton(string1), consonant_skeleton(string2))
    return lcs_ratio(string1, string2) / (skeleton_distance + 1)

assert ibm_similarity('', '') == 0.0
assert ibm_similarity('foobar', '') == 0.0
assert ibm_similarity('foobar', 'foobar') == 1.0
assert ibm_similarity('foo', 'bar') == 0.0
assert ibm_similarity('word', 'deoxyribonucleic') == 0.03125
assert ibm_similarity('foobar', 'aeiou') == 0.041666666666666664

In [None]:
%%time

# pdist expects a 2-D array, so every type becomes a one-element row.
types_original_reshaped = np.array(types_original).reshape(-1,1)

# Levenshtein already is a distance (0 = identical).
types_original_pairwise_distance_levenshtein = scipy.spatial.distance.pdist(
    types_original_reshaped,
    lambda x, y: levenshtein(str(x[0]), str(y[0])))

# jaro() and ibm_similarity() are similarities (1.0 = identical), but these
# vectors are later expanded with squareform (zero diagonal) and clustered as
# distances. They are therefore converted with 1 - similarity, clamped at 0
# so a similarity above 1 can never produce a negative distance.
types_original_pairwise_distance_jaro = scipy.spatial.distance.pdist(
    types_original_reshaped,
    lambda x, y: max(0.0, 1.0 - jaro(str(x[0]), str(y[0]))))

# ibm_similarity() normalises by the length of its first argument only.
# NOTE(review): pdist is expected to evaluate each unordered pair once, so
# the resulting matrix is symmetric by construction; confirm this is intended.
types_original_pairwise_distance_ibm = scipy.spatial.distance.pdist(
    types_original_reshaped,
    lambda x, y: max(0.0, 1.0 - ibm_similarity(str(x[0]), str(y[0]))))

In [None]:
%%time
# Expanding each condensed pairwise vector into a square, symmetric matrix
# (squareform fills the diagonal with zeros) labelled with the types on both axes
original_distance_matrix_levenshtein = pandas.DataFrame(
    data=scipy.spatial.distance.squareform(types_original_pairwise_distance_levenshtein),
    index=types_original,
    columns=types_original,
)

original_distance_matrix_jaro = pandas.DataFrame(
    data=scipy.spatial.distance.squareform(types_original_pairwise_distance_jaro),
    index=types_original,
    columns=types_original,
)

original_distance_matrix_ibm = pandas.DataFrame(
    data=scipy.spatial.distance.squareform(types_original_pairwise_distance_ibm),
    index=types_original,
    columns=types_original,
)

In [None]:
def print_n_clusters(model):
    """Print the number of clusters found by a fitted model.

    ``model`` must expose an ``n_clusters_`` attribute.
    """
    heading = '\n{} Number of Clusters'.format(model.n_clusters_)
    print(heading, '---', sep='\n')
    
def print_random_clusters(model, n=10, words=None):
    """Print the members of ``n`` randomly drawn clusters of a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``labels_`` (one label per clustered item).
    n : int
        Number of clusters to print. Clusters are drawn with replacement, so
        the same cluster can be printed more than once.
    words : array-like, optional
        The clustered items, aligned with ``model.labels_``. Defaults to the
        notebook-level ``types_original_reshaped``.
    """
    if words is None:
        words = types_original_reshaped
    words = np.asarray(words)

    print('\n{} Random Clusters'.format(n))
    print('---')
    # k must follow n; it used to be fixed at 10 regardless of the argument.
    for cluster_id in random.choices(np.unique(model.labels_), k=n):
        cluster = words[np.nonzero(model.labels_ == cluster_id)]
        # ravel() copes with both (N, 1) and (N,) inputs; tolist() yields
        # plain Python strings so the printed list stays readable.
        print(cluster.ravel().tolist())

def print_largest_clusters(model, n=10, words=None):
    """Print the ``n`` largest clusters of a fitted model, smallest first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``labels_`` (one label per clustered item).
        Every distinct label is treated as a cluster.
        NOTE(review): estimators such as DBSCAN use the label -1 for noise,
        so that group would be listed as well; confirm whether it should be.
    n : int
        Number of clusters to print; nothing is listed for ``n <= 0``.
    words : array-like, optional
        The clustered items, aligned with ``model.labels_``. Defaults to the
        notebook-level ``types_original_reshaped``.
    """
    if words is None:
        words = types_original_reshaped
    words = np.asarray(words)

    print('\n{} Largest Clusters'.format(n))
    print('---')

    clusters = [
        words[model.labels_ == cluster_id].ravel().tolist()
        for cluster_id in np.unique(model.labels_)
    ]
    clusters.sort(key=len)

    # clusters[-0:] would select the whole list, so n <= 0 is handled explicitly.
    largest = clusters[-n:] if n > 0 else []
    for members in largest:
        print('{}: {}'.format(len(members), members))

# Affinity Propagation

The damping factor (between 0.5 and 1) is the extent to which the current value is maintained relative to incoming values 
(weighted 1 - damping). This is done in order to avoid numerical oscillations when updating these values (messages).

In [None]:
# Affinity propagation on the rows of the Levenshtein matrix (Euclidean affinity).
# A fixed random_state makes the result reproducible; with random_state=None
# the outcome may differ on every run.
damping_factor = 0.9
affinity_levenshtein_euclidean = sklearn.cluster.AffinityPropagation(
    affinity='euclidean',
    damping=damping_factor,
    random_state=0,
).fit(original_distance_matrix_levenshtein)

In [None]:
# Affinity propagation on the rows of the Jaro matrix (Euclidean affinity).
# A fixed random_state makes the result reproducible; with random_state=None
# the outcome may differ on every run.
damping_factor = 0.9
affinity_jaro_euclidean = sklearn.cluster.AffinityPropagation(
    affinity='euclidean',
    damping=damping_factor,
    random_state=0,
).fit(original_distance_matrix_jaro)

In [None]:
# Affinity propagation on the rows of the IBM matrix (Euclidean affinity).
# A fixed random_state makes the result reproducible; with random_state=None
# the outcome may differ on every run.
damping_factor = 0.9
affinity_ibm_euclidean = sklearn.cluster.AffinityPropagation(
    affinity='euclidean',
    damping=damping_factor,
    random_state=0,
).fit(original_distance_matrix_ibm)

In [None]:
# Random and largest clusters of each affinity-propagation model
for heading, fitted in (
    ('\nLevenshtein', affinity_levenshtein_euclidean),
    ('\nJaro', affinity_jaro_euclidean),
    ('\nIBM', affinity_ibm_euclidean),
):
    print(heading)
    print_random_clusters(fitted)
    print_largest_clusters(fitted)

# DBSCAN

`eps`: the maximum distance between two samples for one to be considered as in the neighborhood of the other.

`min_samples`: the number of samples (or total weight) in a neighborhood for a point to be considered as a core point. 

In [None]:
# DBSCAN on the Levenshtein matrix.
# NOTE(review): no metric is given, so DBSCAN is expected to compare matrix
# rows with its default Euclidean metric. Rows of two distinct types differ
# by at least 1 in two columns, so with eps=0.5 every point may end up as
# noise; confirm whether metric='precomputed' and a larger eps are intended.
min_samples = 5
eps_factor = 0.5
dbscan_levenshtein = (
    sklearn.cluster.DBSCAN(eps=eps_factor, min_samples=min_samples)
    .fit(original_distance_matrix_levenshtein)
)

In [None]:
# DBSCAN on the Jaro matrix.
# NOTE(review): no metric is given, so the matrix rows are presumably
# compared with the default Euclidean metric rather than used as
# precomputed distances; confirm this is intended.
min_samples = 5
eps_factor = 0.5
dbscan_jaro = (
    sklearn.cluster.DBSCAN(eps=eps_factor, min_samples=min_samples)
    .fit(original_distance_matrix_jaro)
)

In [None]:
# DBSCAN on the IBM matrix.
# NOTE(review): no metric is given, so the matrix rows are presumably
# compared with the default Euclidean metric rather than used as
# precomputed distances; confirm this is intended.
min_samples = 5
eps_factor = 0.5
dbscan_ibm = (
    sklearn.cluster.DBSCAN(eps=eps_factor, min_samples=min_samples)
    .fit(original_distance_matrix_ibm)
)

In [None]:
# Largest clusters of each DBSCAN model
for heading, fitted in (
    ('\nLevenshtein', dbscan_levenshtein),
    ('\nJaro', dbscan_jaro),
    ('\nIBM', dbscan_ibm),
):
    print(heading)
    print_largest_clusters(fitted)

# Agglomerative Clustering

`distance_threshold`: the linkage distance threshold above which clusters will not be merged. 

If it is not None, `n_clusters` must be None and `compute_full_tree` must be True.

In [None]:
# Single-linkage agglomerative clustering on the precomputed Levenshtein
# matrix; the number of clusters follows from the distance threshold.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# confirm against the installed version.
distance_threshold = 2
linkage_method = 'single'
agglomerative_single_levenshtein = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='precomputed',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_levenshtein)
)

In [None]:
# Random clusters, largest clusters and cluster count of the single-linkage model
print('\nLevenshtein')
for report in (print_random_clusters, print_largest_clusters, print_n_clusters):
    report(agglomerative_single_levenshtein)

In [None]:
# Complete-linkage agglomerative clustering on the precomputed Levenshtein matrix
distance_threshold = 2
linkage_method = 'complete'
agglomerative_complete_levenshtein = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='precomputed',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_levenshtein)
)

In [None]:
# Complete-linkage agglomerative clustering on the precomputed Jaro matrix
distance_threshold = 0.3
linkage_method = 'complete'
agglomerative_complete_jaro = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='precomputed',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_jaro)
)

In [None]:
# Complete-linkage agglomerative clustering on the precomputed IBM matrix
distance_threshold = 0.3
linkage_method = 'complete'
agglomerative_complete_ibm = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='precomputed',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_ibm)
)

In [None]:
# Largest clusters and cluster count of each complete-linkage model
for heading, fitted in (
    ('\nLevenshtein', agglomerative_complete_levenshtein),
    ('\nJaro', agglomerative_complete_jaro),
    ('\nIBM', agglomerative_complete_ibm),
):
    print(heading)
    print_largest_clusters(fitted)
    print_n_clusters(fitted)

In [None]:
# Average-linkage agglomerative clustering on the precomputed Levenshtein matrix
distance_threshold = 2
linkage_method = 'average'
agglomerative_average_levenshtein = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='precomputed',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_levenshtein)
)

In [None]:
# Average-linkage agglomerative clustering on the precomputed Jaro matrix
distance_threshold = 0.1
linkage_method = 'average'
agglomerative_average_jaro = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='precomputed',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_jaro)
)

In [None]:
# Average-linkage agglomerative clustering on the precomputed IBM matrix
distance_threshold = 0.1
linkage_method = 'average'
agglomerative_average_ibm = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='precomputed',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_ibm)
)

In [None]:
# Largest clusters and cluster count of each average-linkage model
for heading, fitted in (
    ('\nLevenshtein', agglomerative_average_levenshtein),
    ('\nJaro', agglomerative_average_jaro),
    ('\nIBM', agglomerative_average_ibm),
):
    print(heading)
    print_largest_clusters(fitted)
    print_n_clusters(fitted)

In [None]:
# Ward-linkage agglomerative clustering with Euclidean affinity: the
# Levenshtein matrix is passed as input data, not as precomputed distances.
distance_threshold = 9
linkage_method = 'ward'
agglomerative_ward_levenshtein = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='euclidean',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_levenshtein)
)

In [None]:
# Ward-linkage agglomerative clustering with Euclidean affinity: the
# Jaro matrix is passed as input data, not as precomputed distances.
distance_threshold = 2
linkage_method = 'ward'
agglomerative_ward_jaro = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='euclidean',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_jaro)
)

In [None]:
# Ward-linkage agglomerative clustering with Euclidean affinity: the
# IBM matrix is passed as input data, not as precomputed distances.
distance_threshold = 1
linkage_method = 'ward'
agglomerative_ward_ibm = (
    sklearn.cluster.AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        affinity='euclidean',
        linkage=linkage_method,
    )
    .fit(original_distance_matrix_ibm)
)

In [None]:
# Largest clusters and cluster count of each Ward-linkage model
for heading, fitted in (
    ('\nLevenshtein', agglomerative_ward_levenshtein),
    ('\nJaro', agglomerative_ward_jaro),
    ('\nIBM', agglomerative_ward_ibm),
):
    print(heading)
    print_largest_clusters(fitted)
    print_n_clusters(fitted)
