In [None]:
import collections
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas
import plotly.graph_objects as go
import scipy
import scipy.spatial.distance
import seaborn as sns
import sklearn.decomposition
import sklearn.manifold
import umap

In [None]:
# Corpus settings. '<n>' in INPUT is a placeholder that the loading loop
# replaces with each number in range(16, 20).
INPUT = '../../histnorm/datasets/historical/portuguese/portuguese-ps<n>.dev.txt'
ENCODING = 'utf-8'
CORPUS_NAME = 'portuguese-ps'
# Rows whose original token starts with one of these strings are discarded.
FILTER = ('"', "'", '#', '.', ',', '(', ')', ';', '—', '/')

# Read every input file. Each line holds the original and the modernised token
# separated by a tab, so every row becomes a list of its tab-separated columns.
tokens_raw = []
for n in range(16, 20):
    inputfile = INPUT.replace('<n>', str(n))
    with open(inputfile, 'r', encoding=ENCODING) as infile:
        tokens_raw.extend(line.strip().split('\t') for line in infile)

# Keep rows that have at least two columns and whose original token does not
# begin with one of the FILTER strings.
tokens = [
    row
    for row in tokens_raw
    if len(row) > 1 and not row[0].startswith(FILTER)
]

In [None]:
# Getting the original and modernised tokens (lower-cased) and their types.
# The length guard skips any row that lacks a second column.
tokens_original = [token[0].lower() for token in tokens if len(token) > 1]
tokens_modernised = [token[1].lower() for token in tokens if len(token) > 1]

# Sort the unique types. Iterating a set of strings yields a different order
# on every interpreter start (string hashing is randomised), and this order
# decides the row/column order of every matrix built from the types later on.
types_original = sorted(set(tokens_original))
types_modernised = sorted(set(tokens_modernised))

In [None]:
def ttr(types, tokens):
    """
    Type-Token Ratio.

    Divides the number of elements in `types` by the number of elements in
    `tokens`. Both arguments only need to support len().
    Raises ZeroDivisionError when `tokens` is empty.
    """
    type_count = len(types)
    token_count = len(tokens)
    return type_count / token_count

assert ttr([0]*5, [0]*10)  == 0.5
assert ttr([0]*10, [0]*10) == 1

In [None]:
# Frequency of every token, and the number of hapax legomena
# (tokens that occur exactly once).
tokens_original_count = collections.Counter(tokens_original)
tokens_modernised_count = collections.Counter(tokens_modernised)

hapax_original_count = sum(
    1 for frequency in tokens_original_count.values() if frequency == 1
)
hapax_modernised_count = sum(
    1 for frequency in tokens_modernised_count.values() if frequency == 1
)

In [None]:
# Overview of characters in the dataset: how often each character occurs
# across the types (every type contributes its characters once).
characters_types_original = collections.Counter(
    char for type_o in types_original for char in type_o
)
characters_types_modernised = collections.Counter(
    char for type_m in types_modernised for char in type_m
)

In [None]:
# Print both character counters, separated by one empty line.
print(
    'characters_types_original:',
    characters_types_original,
    '',
    'characters_types_modernised:',
    characters_types_modernised,
    sep='\n',
)

In [None]:
# Summary statistics for the original (historical) side of the corpus.
print(CORPUS_NAME)

# Each entry pairs a format template with the value to insert into it.
original_summary = (
    ('Tokens Original Example: {}', tokens_original[:10]),
    ('Tokens Original Count: {}', len(tokens_original)),
    ('Types Original Example: {}', types_original[:10]),
    ('Types Original Count: {}', len(types_original)),
    ('Type/Token Ratio Original: {:2.2%}', ttr(types_original, tokens_original)),
    ('Hapax Original Count: {}', hapax_original_count),
    ('Tokens Most Common Original: {}', str(tokens_original_count.most_common(10))),
)
for template, value in original_summary:
    print(template.format(value))

In [None]:
# Summary statistics for the modernised side of the corpus.
# The last three labels previously said "Original" although the values are
# computed from the modernised tokens; they now name the right side.
print(CORPUS_NAME)
print('Tokens Modernised Example: {}'.format(tokens_modernised[:10]))
print('Tokens Modernised Count: {}'.format(len(tokens_modernised)))
print('Types Modernised Example: {}'.format(types_modernised[:10]))
print('Types Modernised Count: {}'.format(len(types_modernised)))
print('Type/Token Ratio Modernised: {:2.2%}'.format(ttr(types_modernised, tokens_modernised)))
print('Hapax Modernised Count: {}'.format(hapax_modernised_count))
print('Tokens Most Common Modernised: {}'.format(str(tokens_modernised_count.most_common(10))))

In [None]:
# Compute evaluation clustering by mapping all historical tokens to their modern type.
# Both sides are lower-cased so that the keys agree with types_modernised and
# with the lower-cased token counters that are used to look clusters up.
evaluation_cluster = dict()
for token in tokens:
    historical = token[0].lower()
    modern = token[1].lower()
    # setdefault creates the list the first time a modern type is seen.
    evaluation_cluster.setdefault(modern, []).append(historical)

# Every modern type must have exactly one cluster.
assert len(evaluation_cluster) == len(types_modernised)

In [None]:
# Calculate average cluster size (mean number of historical tokens per modern type)
cluster_sizes = [len(members) for members in evaluation_cluster.values()]
sum(cluster_sizes) / len(cluster_sizes)

In [None]:
# Display the spelling variations for the 10 most common modernised tokens
for modern_type, _frequency in tokens_modernised_count.most_common(10):
    variants = set(evaluation_cluster[modern_type])
    print('{}: {}\n'.format(modern_type, variants))

In [None]:
# Levenshtein Distance
def levenshtein(string1, string2):
    """
    Edit distance between two strings.

    Counts the minimum number of single-character insertions, deletions and
    substitutions (each costing 1) needed to turn string1 into string2.
    """
    # Identical inputs need no edits.
    if string1 == string2:
        return 0

    # Against an empty string every character of the other one is an edit.
    if not string2:
        return len(string1)
    if not string1:
        return len(string2)

    height = len(string1) + 1
    width = len(string2) + 1

    # dist[r][c] is the distance between string1[:r] and string2[:c].
    dist = [[0] * width for _ in range(height)]
    for r in range(1, height):
        dist[r][0] = r
    for c in range(1, width):
        dist[0][c] = c

    for r, char1 in enumerate(string1, start=1):
        for c, char2 in enumerate(string2, start=1):
            deletion = dist[r - 1][c] + 1
            insertion = dist[r][c - 1] + 1
            substitution = dist[r - 1][c - 1] + (0 if char1 == char2 else 1)
            dist[r][c] = min(deletion, insertion, substitution)

    # The bottom-right cell covers both complete strings.
    return dist[-1][-1]

assert levenshtein('', '') == 0
assert levenshtein('foobar', 'foobar') == 0
assert levenshtein('foobar', 'foubar') == 1
assert levenshtein('foobar', 'fuubar') == 2
assert levenshtein('foobar', 'fuuar') == 3
assert levenshtein('foobar', '') == 6

In [None]:
# Jaro Similarity
def jaro(string1, string2):
    """
    Jaro similarity of two strings as a float between 0.0 and 1.0.

    Returns 0.0 when string1 is empty or when no characters match, and 1.0
    when both strings are equal (and non-empty). Characters match when they are
    equal and at most `max(len) // 2 - 1` positions apart; half the number of
    out-of-order matches (rounded down) is subtracted as transpositions.
    """
    length1, length2 = len(string1), len(string2)

    if length1 == 0:
        return 0.0
    if string1 == string2:
        return 1.0

    match_bound = max(length1, length2) // 2 - 1

    # taken_2[j] is True once position j of string2 has been matched.
    taken_2 = [False] * length2
    flagged_1 = []

    for i, char in enumerate(string1):
        window_start = max(0, i - match_bound)
        window_stop = min(i + match_bound, length2 - 1) + 1
        for j in range(window_start, window_stop):
            if not taken_2[j] and string2[j] == char:
                taken_2[j] = True
                flagged_1.append(i)
                break

    matches = len(flagged_1)
    if matches == 0:
        return 0.0

    # Matched positions of string2 in ascending order, paired with the
    # (already ascending) matched positions of string1.
    flagged_2 = [j for j, taken in enumerate(taken_2) if taken]
    transpositions = sum(
        1 for i, j in zip(flagged_1, flagged_2) if string1[i] != string2[j]
    )

    return (1/3 * ( matches / length1 + matches / length2 + (matches - transpositions // 2) / matches))

assert jaro('', '') == 0.0
assert jaro('foobar', '') == 0.0
assert jaro('foobar', 'foobar') == 1.0
assert jaro('foobar', 'barfoo') == 0.4444444444444444
assert jaro('duane', 'dwayne') == 0.8222222222222222
assert jaro('hans', 'gruber') == 0.0

In [None]:
# IBM (LCS-Levenshtein Normalized)

# Contractor, D., Faruquie, T. A., & Subramaniam, L. V. (2010, August). 
# Unsupervised cleansing of noisy text. 
# In Proceedings of the 23rd International Conference on Computational Linguistics:
# Posters (pp. 189-196). Association for Computational Linguistics.

from itertools import groupby

# Longest Common Substring
def longest_common_string(string1, string2):
    """
    Length of the longest contiguous substring shared by both strings.

    Returns len(string1) when the strings are equal and 0 when either one is
    empty.

    The table is filled from index 1 on. Starting at index 0 would make
    `string[index - 1]` and `table[row - 1]` wrap around to the last
    character / last row and could over-count (e.g. 'ba' vs 'aa' gave 2).
    """
    if string1 == string2:
        return len(string1)

    if not string1 or not string2:
        return 0

    width = len(string2) + 1

    # previous[c] / current[c]: length of the common run that ends at the
    # previous / current character of string1 and at string2[c - 1].
    previous = [0] * width
    longest = 0
    for char1 in string1:
        current = [0] * width
        for c, char2 in enumerate(string2, start=1):
            if char1 == char2:
                current[c] = previous[c - 1] + 1
                longest = max(longest, current[c])
        previous = current

    return longest

assert longest_common_string('', '') == 0
assert longest_common_string('foobar', '') == 0
assert longest_common_string('foobar', 'foobar') == 6
assert longest_common_string('foobar', 'foo') == 3
assert longest_common_string('foobar', 'f') == 1
assert longest_common_string('ba', 'aa') == 1


def lcs_ratio(string1, string2):
    """
    Longest common substring length divided by the length of string1.

    Returns 0.0 when either string is empty. The ratio is relative to
    string1 only, so swapping the arguments can change the result.
    """
    if string1 and string2:
        return longest_common_string(string1, string2) / len(string1)
    return 0.0

assert lcs_ratio('', '') == 0.0
assert lcs_ratio('foo', '') == 0.0
assert lcs_ratio('foobar', 'foobar') == 1.0
assert lcs_ratio('foo', 'bar') == 0.0
assert lcs_ratio('word', 'deoxyribonucleic') == 0.25


def consonant_skeleton(string, vowels='aeiouy'):
    """
    Drop every character contained in `vowels`, then collapse each run of
    identical neighbouring characters into a single character.
    """
    consonants = (char for char in string if char not in vowels)
    # groupby yields one key per run of equal consecutive characters.
    return ''.join(run_char for run_char, _run in groupby(consonants))

assert consonant_skeleton('') == ''
assert consonant_skeleton('aeio') == ''
assert consonant_skeleton('foobar') == 'fbr'
assert consonant_skeleton('ffoobbar') == 'fbr'
assert consonant_skeleton('barfoobar') == 'brfbr'


def ibm_similarity(string1, string2, vowels='aeiouy'):
    """
    LCS ratio of the two strings divided by (1 + Levenshtein distance of
    their consonant skeletons).

    `vowels` is forwarded to consonant_skeleton for both strings.
    """
    skeleton1 = consonant_skeleton(string1, vowels)
    skeleton2 = consonant_skeleton(string2, vowels)
    skeleton_distance = levenshtein(skeleton1, skeleton2)
    return lcs_ratio(string1, string2) / (skeleton_distance + 1)

assert ibm_similarity('', '') == 0.0
assert ibm_similarity('foobar', '') == 0.0
assert ibm_similarity('foobar', 'foobar') == 1.0
assert ibm_similarity('foo', 'bar') == 0.0
assert ibm_similarity('word', 'deoxyribonucleic') == 0.03125
assert ibm_similarity('foobar', 'aeiou') == 0.041666666666666664

In [None]:
%%time

# Compute the pairwise value of each measure for every pair of original types

# Characters that ibm_similarity removes before comparing consonant skeletons.
# The previous placeholder 'TODO' was itself used as the vowel set (T, O, D).
# The types are lower-cased, so only lower-case vowels are listed.
# NOTE(review): plain vowels plus common Portuguese accented vowels — extend
# if the corpus contains further accented vowel characters.
ibm_vowels = 'aeiouyáàâãéêíóôõúü'

# pdist expects a 2-D array, so every type becomes a one-element row; the
# callables unwrap that element again before comparing the two strings.
# NOTE(review): jaro and ibm_similarity return similarities (higher = closer)
# although the results are stored under "distance" names.
types_original_reshaped = np.array(types_original).reshape(-1,1)
types_original_pairwise_distance_levenshtein = scipy.spatial.distance.pdist(types_original_reshaped, lambda x,y: levenshtein(str(x[0]),str(y[0])))
types_original_pairwise_distance_jaro = scipy.spatial.distance.pdist(types_original_reshaped, lambda x,y: jaro(str(x[0]),str(y[0])))
types_original_pairwise_distance_ibm = scipy.spatial.distance.pdist(types_original_reshaped, lambda x,y: ibm_similarity(str(x[0]),str(y[0]),ibm_vowels))

In [None]:
%%time

# Expand each condensed pairwise vector into a full square matrix whose rows
# and columns are labelled with the original types.
#
# NOTE(review): squareform leaves the diagonal at 0. That suits the Levenshtein
# distance, but for the Jaro and IBM similarities a type then looks maximally
# dissimilar to itself — confirm this is intended.

def _to_square_frame(condensed):
    """Return the square form of a condensed pdist vector as a labelled DataFrame."""
    square = scipy.spatial.distance.squareform(condensed)
    return pandas.DataFrame(square, index=types_original, columns=types_original)

original_distance_matrix_levenshtein = _to_square_frame(types_original_pairwise_distance_levenshtein)
original_distance_matrix_jaro = _to_square_frame(types_original_pairwise_distance_jaro)
original_distance_matrix_ibm = _to_square_frame(types_original_pairwise_distance_ibm)

In [None]:
%%time

# Run UMAP on data (2 components)

# UMAP is stochastic; a fixed random_state makes the embedding repeatable
# between runs. NOTE(review): a fixed seed may make UMAP slower because it
# limits parallelism — drop it if run time matters more than repeatability.
two_similarity_embedded_umap_levenshtein = umap.UMAP(n_components=2, random_state=42).fit_transform(original_distance_matrix_levenshtein)
two_similarity_embedded_umap_jaro = umap.UMAP(n_components=2, random_state=42).fit_transform(original_distance_matrix_jaro)
two_similarity_embedded_umap_ibm = umap.UMAP(n_components=2, random_state=42).fit_transform(original_distance_matrix_ibm)

In [None]:
%%time

# Run UMAP on data (3 components)

# UMAP is stochastic; a fixed random_state makes the embedding repeatable
# between runs. NOTE(review): a fixed seed may make UMAP slower because it
# limits parallelism — drop it if run time matters more than repeatability.
three_similarity_embedded_umap_levenshtein = umap.UMAP(n_components=3, random_state=42).fit_transform(original_distance_matrix_levenshtein)
three_similarity_embedded_umap_jaro = umap.UMAP(n_components=3, random_state=42).fit_transform(original_distance_matrix_jaro)
three_similarity_embedded_umap_ibm = umap.UMAP(n_components=3, random_state=42).fit_transform(original_distance_matrix_ibm)

In [None]:
# Prepare 3D data for plots: one frame per measure holding the three UMAP
# coordinates of every type plus the type itself as 'token'.

three_umap_levenshtein = pandas.DataFrame({
    'umap-x': three_similarity_embedded_umap_levenshtein[:,0],
    'umap-y': three_similarity_embedded_umap_levenshtein[:,1],
    'umap-z': three_similarity_embedded_umap_levenshtein[:,2],
    'token': original_distance_matrix_levenshtein.index,
})

three_umap_jaro = pandas.DataFrame({
    'umap-x': three_similarity_embedded_umap_jaro[:,0],
    'umap-y': three_similarity_embedded_umap_jaro[:,1],
    'umap-z': three_similarity_embedded_umap_jaro[:,2],
    'token': original_distance_matrix_jaro.index,
})

three_umap_ibm = pandas.DataFrame({
    'umap-x': three_similarity_embedded_umap_ibm[:,0],
    'umap-y': three_similarity_embedded_umap_ibm[:,1],
    'umap-z': three_similarity_embedded_umap_ibm[:,2],
    'token': original_distance_matrix_ibm.index,
})

In [None]:
# Prepare 2D data for plots: one frame per measure holding the two UMAP
# coordinates of every type plus the type itself as 'token'.

two_umap_levenshtein = pandas.DataFrame({
    'umap-x': two_similarity_embedded_umap_levenshtein[:,0],
    'umap-y': two_similarity_embedded_umap_levenshtein[:,1],
    'token': original_distance_matrix_levenshtein.index,
})

two_umap_jaro = pandas.DataFrame({
    'umap-x': two_similarity_embedded_umap_jaro[:,0],
    'umap-y': two_similarity_embedded_umap_jaro[:,1],
    'token': original_distance_matrix_jaro.index,
})

two_umap_ibm = pandas.DataFrame({
    'umap-x': two_similarity_embedded_umap_ibm[:,0],
    'umap-y': two_similarity_embedded_umap_ibm[:,1],
    'token': original_distance_matrix_ibm.index,
})

In [None]:
# Levenshtein-UMAP 3D Scatterplot

# One marker per type; the token is attached as the per-point text.
trace = go.Scatter3d(
    x=three_umap_levenshtein['umap-x'],
    y=three_umap_levenshtein['umap-y'],
    z=three_umap_levenshtein['umap-z'],
    mode='markers',
    text=three_umap_levenshtein['token'],
    marker={'size': 2, 'color': 'steelblue', 'opacity': 1},
)
fig = go.Figure(data=[trace])
# Remove the margins around the plot.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
# Jaro-UMAP 3D Scatterplot

# One marker per type; the token is attached as the per-point text.
trace = go.Scatter3d(
    x=three_umap_jaro['umap-x'],
    y=three_umap_jaro['umap-y'],
    z=three_umap_jaro['umap-z'],
    mode='markers',
    text=three_umap_jaro['token'],
    marker={'size': 2, 'color': 'salmon', 'opacity': 1},
)
fig = go.Figure(data=[trace])
# Remove the margins around the plot.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
# IBM-UMAP 3D Scatterplot

# One marker per type; the token is attached as the per-point text.
trace = go.Scatter3d(
    x=three_umap_ibm['umap-x'],
    y=three_umap_ibm['umap-y'],
    z=three_umap_ibm['umap-z'],
    mode='markers',
    text=three_umap_ibm['token'],
    marker={'size': 2, 'color': 'seagreen', 'opacity': 1},
)
fig = go.Figure(data=[trace])
# Remove the margins around the plot.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
# Levenshtein-UMAP 2D Scatterplot

plt.figure(figsize=(20,20))
# Draw the points on the current figure, then title it and hide the axis labels.
axes = sns.scatterplot(
    data=two_umap_levenshtein,
    x='umap-x',
    y='umap-y',
    color='steelblue',
    alpha=0.5,
)
axes.set(title=CORPUS_NAME + '-levenshtein-umap', xlabel=None, ylabel=None)

# Write the figure next to the notebook as a PNG with a white background.
plt.savefig(
    CORPUS_NAME + '-levenshtein-umap.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
# Jaro-UMAP 2D Scatterplot

plt.figure(figsize=(20,20))
# Draw the points on the current figure, then title it and hide the axis labels.
axes = sns.scatterplot(
    data=two_umap_jaro,
    x='umap-x',
    y='umap-y',
    color='salmon',
    alpha=0.5,
)
axes.set(title=CORPUS_NAME + '-jaro-umap', xlabel=None, ylabel=None)

# Write the figure next to the notebook as a PNG with a white background.
plt.savefig(
    CORPUS_NAME + '-jaro-umap.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
# IBM-UMAP 2D Scatterplot

plt.figure(figsize=(20,20))
# Draw the points on the current figure, then title it and hide the axis labels.
axes = sns.scatterplot(
    data=two_umap_ibm,
    x='umap-x',
    y='umap-y',
    color='seagreen',
    alpha=0.5,
)
axes.set(title=CORPUS_NAME + '-ibm-umap', xlabel=None, ylabel=None)

# Write the figure next to the notebook as a PNG with a white background.
plt.savefig(
    CORPUS_NAME + '-ibm-umap.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
%%time

# t-SNE (2 components)

# t-SNE is stochastic; a fixed random_state makes the embedding repeatable
# between runs.
two_similarity_embedded_tsne_levenshtein = sklearn.manifold.TSNE(n_components=2, random_state=42).fit_transform(original_distance_matrix_levenshtein)
two_similarity_embedded_tsne_jaro = sklearn.manifold.TSNE(n_components=2, random_state=42).fit_transform(original_distance_matrix_jaro)
two_similarity_embedded_tsne_ibm = sklearn.manifold.TSNE(n_components=2, random_state=42).fit_transform(original_distance_matrix_ibm)

In [None]:
%%time

# t-SNE (3 components)

# t-SNE is stochastic; a fixed random_state makes the embedding repeatable
# between runs.
three_similarity_embedded_tsne_levenshtein = sklearn.manifold.TSNE(n_components=3, random_state=42).fit_transform(original_distance_matrix_levenshtein)
three_similarity_embedded_tsne_jaro = sklearn.manifold.TSNE(n_components=3, random_state=42).fit_transform(original_distance_matrix_jaro)
three_similarity_embedded_tsne_ibm = sklearn.manifold.TSNE(n_components=3, random_state=42).fit_transform(original_distance_matrix_ibm)

In [None]:
# Prepare 3D data for plots: one frame per measure holding the three t-SNE
# coordinates of every type plus the type itself as 'token'.

three_tsne_levenshtein = pandas.DataFrame({
    'tsne-x': three_similarity_embedded_tsne_levenshtein[:,0],
    'tsne-y': three_similarity_embedded_tsne_levenshtein[:,1],
    'tsne-z': three_similarity_embedded_tsne_levenshtein[:,2],
    'token': original_distance_matrix_levenshtein.index,
})

three_tsne_jaro = pandas.DataFrame({
    'tsne-x': three_similarity_embedded_tsne_jaro[:,0],
    'tsne-y': three_similarity_embedded_tsne_jaro[:,1],
    'tsne-z': three_similarity_embedded_tsne_jaro[:,2],
    'token': original_distance_matrix_jaro.index,
})

three_tsne_ibm = pandas.DataFrame({
    'tsne-x': three_similarity_embedded_tsne_ibm[:,0],
    'tsne-y': three_similarity_embedded_tsne_ibm[:,1],
    'tsne-z': three_similarity_embedded_tsne_ibm[:,2],
    'token': original_distance_matrix_ibm.index,
})

In [None]:
# Prepare 2D data for plots: one frame per measure holding the two t-SNE
# coordinates of every type plus the type itself as 'token'.

two_tsne_levenshtein = pandas.DataFrame({
    'tsne-x': two_similarity_embedded_tsne_levenshtein[:,0],
    'tsne-y': two_similarity_embedded_tsne_levenshtein[:,1],
    'token': original_distance_matrix_levenshtein.index,
})

two_tsne_jaro = pandas.DataFrame({
    'tsne-x': two_similarity_embedded_tsne_jaro[:,0],
    'tsne-y': two_similarity_embedded_tsne_jaro[:,1],
    'token': original_distance_matrix_jaro.index,
})

two_tsne_ibm = pandas.DataFrame({
    'tsne-x': two_similarity_embedded_tsne_ibm[:,0],
    'tsne-y': two_similarity_embedded_tsne_ibm[:,1],
    'token': original_distance_matrix_ibm.index,
})

In [None]:
# Levenshtein-TSNE 3D Scatterplot

# One marker per type; the token is attached as the per-point text.
trace = go.Scatter3d(
    x=three_tsne_levenshtein['tsne-x'],
    y=three_tsne_levenshtein['tsne-y'],
    z=three_tsne_levenshtein['tsne-z'],
    mode='markers',
    text=three_tsne_levenshtein['token'],
    marker={'size': 2, 'color': 'steelblue', 'opacity': 1},
)
fig = go.Figure(data=[trace])
# Remove the margins around the plot.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
# Jaro-TSNE 3D Scatterplot

# One marker per type; the token is attached as the per-point text.
trace = go.Scatter3d(
    x=three_tsne_jaro['tsne-x'],
    y=three_tsne_jaro['tsne-y'],
    z=three_tsne_jaro['tsne-z'],
    mode='markers',
    text=three_tsne_jaro['token'],
    marker={'size': 2, 'color': 'salmon', 'opacity': 1},
)
fig = go.Figure(data=[trace])
# Remove the margins around the plot.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
# IBM-TSNE 3D Scatterplot

# One marker per type; the token is attached as the per-point text.
trace = go.Scatter3d(
    x=three_tsne_ibm['tsne-x'],
    y=three_tsne_ibm['tsne-y'],
    z=three_tsne_ibm['tsne-z'],
    mode='markers',
    text=three_tsne_ibm['token'],
    marker={'size': 2, 'color': 'seagreen', 'opacity': 1},
)
fig = go.Figure(data=[trace])
# Remove the margins around the plot.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()

In [None]:
# Levenshtein-TSNE 2D Scatterplot

plt.figure(figsize=(20,20))
# Draw the points on the current figure, then title it and hide the axis labels.
axes = sns.scatterplot(
    data=two_tsne_levenshtein,
    x='tsne-x',
    y='tsne-y',
    color='steelblue',
    alpha=0.5,
)
axes.set(title=CORPUS_NAME + '-levenshtein-tsne', xlabel=None, ylabel=None)

# Write the figure next to the notebook as a PNG with a white background.
plt.savefig(
    CORPUS_NAME + '-levenshtein-tsne.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
# Jaro-TSNE 2D Scatterplot

plt.figure(figsize=(20,20))
# Draw the points on the current figure, then title it and hide the axis labels.
axes = sns.scatterplot(
    data=two_tsne_jaro,
    x='tsne-x',
    y='tsne-y',
    color='salmon',
    alpha=0.5,
)
axes.set(title=CORPUS_NAME + '-jaro-tsne', xlabel=None, ylabel=None)

# Write the figure next to the notebook as a PNG with a white background.
plt.savefig(
    CORPUS_NAME + '-jaro-tsne.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
# IBM-TSNE 2D Scatterplot

plt.figure(figsize=(20,20))
# Draw the points on the current figure, then title it and hide the axis labels.
axes = sns.scatterplot(
    data=two_tsne_ibm,
    x='tsne-x',
    y='tsne-y',
    color='seagreen',
    alpha=0.5,
)
axes.set(title=CORPUS_NAME + '-ibm-tsne', xlabel=None, ylabel=None)

# Write the figure next to the notebook as a PNG with a white background.
plt.savefig(
    CORPUS_NAME + '-ibm-tsne.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)