In [None]:
import numpy as np
import scipy
import sklearn.manifold
import seaborn as sns
import matplotlib.pyplot as plt
import umap
import pandas
import pickle
import os
import collections

In [None]:
INPUT = '../../histnorm/datasets/historical/german/german-anselm.test.txt'
ENCODING = 'utf-8'
CORPUS_NAME = 'german-anselm'

# Loading input file, which has the original and modernised token in each line separated by a \t.
# Lines that do not yield two tab-separated fields after stripping (blank lines, a missing
# modernised token) are skipped: a later cell reads both token[0] and token[1] and would
# raise an IndexError on them.
with open(INPUT, 'r', encoding=ENCODING) as infile:
    split_lines = (line.strip().split('\t') for line in infile)
    tokens = [fields for fields in split_lines if len(fields) >= 2]

In [None]:
# Getting the original and modernised tokens and types
tokens_original = [token[0] for token in tokens[:5000]]
tokens_modernised = [token[1] for token in tokens[:5000]]
types_original = list(set(tokens_original))
types_modernised = list(set(tokens_modernised))

In [None]:
def levenshtein(string1, string2):
    """
    Levenshtein Distance between two strings

    Counts the minimum number of single-character insertions, deletions and
    substitutions needed to turn string1 into string2.

    Parameters:
        string1: first string (may be empty)
        string2: second string (may be empty)

    Returns:
        The edit distance as a non-negative int; 0 when the strings are equal.
    """
    if string1 == string2:
        return 0

    # Only the previous row of the dynamic-programming table is needed to fill the next one.
    # Row 0 holds the cost of building each prefix of string2 from the empty string.
    previous_row = list(range(len(string2) + 1))

    for row, char1 in enumerate(string1, start=1):
        # Column 0: cost of deleting the first `row` characters of string1
        current_row = [row]
        for col, char2 in enumerate(string2, start=1):
            cost = 0 if char1 == char2 else 1
            current_row.append(min(previous_row[col] + 1,          # deletion
                                   current_row[col - 1] + 1,       # insertion
                                   previous_row[col - 1] + cost))  # substitution / match
        previous_row = current_row

    # The last cell is read from the row itself, not from loop variables, so the result
    # is also defined when one of the strings is empty and a loop body never runs.
    return previous_row[-1]

assert levenshtein('foobar', 'foobar') == 0
assert levenshtein('foobar', 'foubar') == 1
assert levenshtein('foobar', 'fuubar') == 2
assert levenshtein('', 'foo') == 3
assert levenshtein('foo', '') == 3

In [None]:
def jaro(string1, string2):
    """
    Jaro similarity between two strings

    Returns a float: 1.0 when the strings are equal, 0.0 when string1 is empty
    or when no characters are matched.
    """
    size1, size2 = len(string1), len(string2)

    # The emptiness check runs before the equality check, so two empty strings give 0.0
    if not size1:
        return 0.0
    if string1 == string2:
        return 1.0

    # Equal characters only count as a match if their positions differ by at most `window`
    window = max(size1, size2) // 2 - 1

    matched_chars_1 = []          # matched characters of string1, in string1 order
    taken = [False] * size2       # positions of string2 that are already matched

    for pos1, char1 in enumerate(string1):
        first = max(0, pos1 - window)
        last = min(pos1 + window, size2 - 1)
        for pos2 in range(first, last + 1):
            if not taken[pos2] and string2[pos2] == char1:
                taken[pos2] = True
                matched_chars_1.append(char1)
                break

    common = len(matched_chars_1)
    if common == 0:
        return 0.0

    # Matched characters of string2, in string2 order
    matched_chars_2 = [char2 for char2, used in zip(string2, taken) if used]

    # Count the positions where the two matched sequences disagree; it is halved below
    mismatched = sum(1 for char1, char2 in zip(matched_chars_1, matched_chars_2) if char1 != char2)

    return (1/3 * (common / size1 + common / size2 + (common - mismatched // 2) / common))

assert jaro('foobar', 'foobar') == 1.0
assert jaro('foobar', 'barfoo') == 0.4444444444444444
assert jaro('duane', 'dwayne') == 0.8222222222222222
assert jaro('hans', 'gruber') == 0.0

In [None]:
# Example of two similarity measures
tmp_reshape = np.array(types_original[:5]).reshape(-1,1)

tmp_levenshtein_distance = scipy.spatial.distance.pdist(tmp_reshape, lambda x,y: levenshtein(str(x[0]),str(y[0])))   
tmp_jaro_distance = scipy.spatial.distance.pdist(tmp_reshape, lambda x,y: jaro(str(x[0]),str(y[0])))   

tmp_levenshtein_matrix = pandas.DataFrame(scipy.spatial.distance.squareform(tmp_levenshtein_distance), index=types_original[:5], columns=types_original[:5])
tmp_jaro_matrix = pandas.DataFrame(scipy.spatial.distance.squareform(tmp_jaro_distance), index=types_original[:5], columns=types_original[:5])

print(tmp_levenshtein_matrix)
print(tmp_jaro_matrix)

In [None]:
%%time

types_original_reshaped = np.array(types_original).reshape(-1,1)
types_original_pairwise_distance_levenshtein = scipy.spatial.distance.pdist(types_original_reshaped, lambda x,y: levenshtein(str(x[0]),str(y[0])))   
types_original_pairwise_distance_jaro = scipy.spatial.distance.pdist(types_original_reshaped, lambda x,y: jaro(str(x[0]),str(y[0])))   

types_modernised_reshaped = np.array(types_modernised).reshape(-1,1)
types_modernised_pairwise_distance_levenshtein = scipy.spatial.distance.pdist(types_modernised_reshaped, lambda x,y: levenshtein(str(x[0]),str(y[0]))) 
types_modernised_pairwise_distance_jaro = scipy.spatial.distance.pdist(types_modernised_reshaped, lambda x,y: jaro(str(x[0]),str(y[0])))   

In [None]:
%%time
# Transforming pairwise distances into a full similarity matrix
original_distance_matrix_levenshtein = pandas.DataFrame(scipy.spatial.distance.squareform(types_original_pairwise_distance_levenshtein), index=types_original, columns=types_original)
modernised_distance_matrix_levenshtein = pandas.DataFrame(scipy.spatial.distance.squareform(types_modernised_pairwise_distance_levenshtein), index=types_modernised, columns=types_modernised)

original_distance_matrix_jaro = pandas.DataFrame(scipy.spatial.distance.squareform(types_original_pairwise_distance_jaro), index=types_original, columns=types_original)
modernised_distance_matrix_jaro = pandas.DataFrame(scipy.spatial.distance.squareform(types_modernised_pairwise_distance_jaro), index=types_modernised, columns=types_modernised)

In [None]:
%%time

original_similarity_embedded_umap_levenshtein = umap.UMAP(n_components=3).fit_transform(original_distance_matrix_levenshtein)
modernised_similarity_embedded_umap_levenshtein = umap.UMAP(n_components=3).fit_transform(modernised_distance_matrix_levenshtein)

original_similarity_embedded_umap_jaro = umap.UMAP(n_components=3).fit_transform(original_distance_matrix_jaro)
modernised_similarity_embedded_umap_jaro = umap.UMAP(n_components=3).fit_transform(modernised_distance_matrix_jaro)

In [None]:
# Collecting the 3D Levenshtein embeddings into data frames: one row per type, one column per
# embedding axis, plus the type itself (taken from the matrix index) for labelling.
original_umap_levenshtein = pandas.DataFrame({
    'umap-x-original': original_similarity_embedded_umap_levenshtein[:,0],
    'umap-y-original': original_similarity_embedded_umap_levenshtein[:,1],
    'umap-z-original': original_similarity_embedded_umap_levenshtein[:,2],
    'token': original_distance_matrix_levenshtein.index,
})

modernised_umap_levenshtein = pandas.DataFrame({
    'umap-x-modernised': modernised_similarity_embedded_umap_levenshtein[:,0],
    'umap-y-modernised': modernised_similarity_embedded_umap_levenshtein[:,1],
    'umap-z-modernised': modernised_similarity_embedded_umap_levenshtein[:,2],
    'token': modernised_distance_matrix_levenshtein.index,
})

In [None]:
# Collecting the 3D Jaro embeddings into data frames: one row per type, one column per
# embedding axis, plus the type itself (taken from the matrix index) for labelling.
original_umap_jaro = pandas.DataFrame({
    'umap-x-original': original_similarity_embedded_umap_jaro[:,0],
    'umap-y-original': original_similarity_embedded_umap_jaro[:,1],
    'umap-z-original': original_similarity_embedded_umap_jaro[:,2],
    'token': original_distance_matrix_jaro.index,
})

modernised_umap_jaro = pandas.DataFrame({
    'umap-x-modernised': modernised_similarity_embedded_umap_jaro[:,0],
    'umap-y-modernised': modernised_similarity_embedded_umap_jaro[:,1],
    'umap-z-modernised': modernised_similarity_embedded_umap_jaro[:,2],
    'token': modernised_distance_matrix_jaro.index,
})

In [None]:
import plotly.graph_objects as go

# 3D scatter of the modernised types in the Levenshtein embedding; each point carries its
# token as text.
fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=modernised_umap_levenshtein['umap-x-modernised'],
    y=modernised_umap_levenshtein['umap-y-modernised'],
    z=modernised_umap_levenshtein['umap-z-modernised'],
    text=modernised_umap_levenshtein['token'],
    mode='markers',
    marker={'size': 2, 'opacity': 1},
))
# Remove the margins around the plot
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
import plotly.graph_objects as go

# 3D scatter of the modernised types in the Jaro embedding; each point carries its token as text.
fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=modernised_umap_jaro['umap-x-modernised'],
    y=modernised_umap_jaro['umap-y-modernised'],
    z=modernised_umap_jaro['umap-z-modernised'],
    text=modernised_umap_jaro['token'],
    mode='markers',
    marker={'color': 'orange', 'size': 2, 'opacity': 1},
))
# Remove the margins around the plot
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
%%time

original_similarity_embedded_umap_levenshtein_2d = umap.UMAP(n_components=2).fit_transform(original_distance_matrix_levenshtein)
modernised_similarity_embedded_umap_levenshtein_2d = umap.UMAP(n_components=2).fit_transform(modernised_distance_matrix_levenshtein)

original_similarity_embedded_umap_jaro_2d = umap.UMAP(n_components=2).fit_transform(original_distance_matrix_jaro)
modernised_similarity_embedded_umap_jaro_2d = umap.UMAP(n_components=2).fit_transform(modernised_distance_matrix_jaro)

In [None]:
# Collecting the 2D embeddings into data frames: one row per type, one column per embedding
# axis, plus the type itself (taken from the matrix index) for labelling.

# Levenshtein
original_umap_levenshtein_2d = pandas.DataFrame({
    'umap-x-original': original_similarity_embedded_umap_levenshtein_2d[:,0],
    'umap-y-original': original_similarity_embedded_umap_levenshtein_2d[:,1],
    'token': original_distance_matrix_levenshtein.index,
})

modernised_umap_levenshtein_2d = pandas.DataFrame({
    'umap-x-modernised': modernised_similarity_embedded_umap_levenshtein_2d[:,0],
    'umap-y-modernised': modernised_similarity_embedded_umap_levenshtein_2d[:,1],
    'token': modernised_distance_matrix_levenshtein.index,
})

# Jaro
original_umap_jaro_2d = pandas.DataFrame({
    'umap-x-original': original_similarity_embedded_umap_jaro_2d[:,0],
    'umap-y-original': original_similarity_embedded_umap_jaro_2d[:,1],
    'token': original_distance_matrix_jaro.index,
})

modernised_umap_jaro_2d = pandas.DataFrame({
    'umap-x-modernised': modernised_similarity_embedded_umap_jaro_2d[:,0],
    'umap-y-modernised': modernised_similarity_embedded_umap_jaro_2d[:,1],
    'token': modernised_distance_matrix_jaro.index,
})

In [None]:
import plotly.graph_objects as go

# 2D scatter of the modernised types in the Levenshtein embedding; each point carries its
# token as text.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=modernised_umap_levenshtein_2d['umap-x-modernised'],
    y=modernised_umap_levenshtein_2d['umap-y-modernised'],
    text=modernised_umap_levenshtein_2d['token'],
    mode='markers',
    marker={'color': 'steelblue'},
))
# Remove the margins around the plot
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
import plotly.graph_objects as go

# 2D scatter of the modernised types in the Jaro embedding; each point carries its token as text.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=modernised_umap_jaro_2d['umap-x-modernised'],
    y=modernised_umap_jaro_2d['umap-y-modernised'],
    text=modernised_umap_jaro_2d['token'],
    mode='markers',
    marker={'color': 'orange'},
))
# Remove the margins around the plot
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()