In [None]:
import numpy as np
import scipy
import sklearn.manifold
import seaborn as sns
import matplotlib.pyplot as plt
import umap
import pandas
import pickle
import os
import collections

In [None]:
# Configuration: '<n>' in INPUT is a placeholder that is substituted per input file below
INPUT = '../../histnorm/datasets/historical/spanish/spanish-ps<n>.dev.txt'
ENCODING = 'utf-8'
CORPUS_NAME = 'spanish-ps'
FILTER = ('"', "'", '#', '.', ',', '(', ')', ';', '—', '/')

# Loading the input files (n = 16, 17, 18, 19); every line holds the original and the
# modernised token separated by a \t
tokens_raw = []
for n in range(16, 20):
    inputfile = INPUT.replace('<n>', str(n))
    with open(inputfile, 'r', encoding=ENCODING) as infile:
        for line in infile:
            tokens_raw.append(line.strip().split('\t'))

# Keep only rows with at least two fields whose first field does not start with one of the
# FILTER prefixes
tokens = []
for token in tokens_raw:
    if len(token) > 1 and not token[0].startswith(FILTER):
        tokens.append(token)

In [None]:
# Getting the original and modernised tokens and types.
# Field 0 of every row is the original spelling, field 1 the modernised one; both are lower-cased.
tokens_original = [token[0].lower() for token in tokens if len(token) > 1]
tokens_modernised = [token[1].lower() for token in tokens if len(token) > 1]

# The type lists are sorted on purpose: iteration order of a set of strings changes between
# interpreter sessions (string hash randomisation), while the pickled pairwise-distance cache
# further down stores distances by position only. A stable order keeps a reloaded cache aligned
# with the labels used for the distance matrices and the cluster tree.
types_original = sorted(set(tokens_original))
types_modernised = sorted(set(tokens_modernised))

In [None]:
def ttr(types, tokens):
    """
    Calculating Type-Token Ratio

    types:  sized collection of the distinct tokens
    tokens: sized collection of all tokens

    Returns len(types) / len(tokens) as a float, or 0.0 when tokens is empty
    (instead of raising ZeroDivisionError).
    """
    if len(tokens) == 0:
        return 0.0
    return len(types) / len(tokens)

assert ttr([0]*5, [0]*10)  == 0.5
assert ttr([0]*10, [0]*10) == 1
assert ttr([], []) == 0.0

In [None]:
# Frequency of every token in the original and in the modernised layer
tokens_original_count = collections.Counter(tokens_original)
tokens_modernised_count = collections.Counter(tokens_modernised)

# Hapax legomena: number of tokens with a frequency of exactly 1
hapax_original_count = list(tokens_original_count.values()).count(1)
hapax_modernised_count = list(tokens_modernised_count.values()).count(1)

In [None]:
# Summary statistics of the original spellings: (line template, value) pairs, printed in order
original_report = [
    ('Tokens Original Example: {}', tokens_original[:10]),
    ('Tokens Original Count: {}', len(tokens_original)),
    ('Types Original Example: {}', types_original[:10]),
    ('Types Original Count: {}', len(types_original)),
    ('Type/Token Ratio Original: {:2.2%}', ttr(types_original, tokens_original)),
    ('Hapax Original Count: {}', hapax_original_count),
    ('Tokens Most Common Original: {}', str(tokens_original_count.most_common(10))),
]

print(CORPUS_NAME)
for template, value in original_report:
    print(template.format(value))

In [None]:
# Summary statistics of the modernised spellings.
# The last three labels used to say "Original" although the values are the modernised ones.
print(CORPUS_NAME)
print('Tokens Modernised Example: {}'.format(tokens_modernised[:10]))
print('Tokens Modernised Count: {}'.format(len(tokens_modernised)))
print('Types Modernised Example: {}'.format(types_modernised[:10]))
print('Types Modernised Count: {}'.format(len(types_modernised)))
print('Type/Token Ratio Modernised: {:2.2%}'.format(ttr(types_modernised, tokens_modernised)))
print('Hapax Modernised Count: {}'.format(hapax_modernised_count))
print('Tokens Most Common Modernised: {}'.format(str(tokens_modernised_count.most_common(10))))

In [None]:
# Evaluation Cluster: modernised form -> list of every original form observed for it
# (one list entry per token, so repeated spellings appear repeatedly)
evaluation_cluster = {}
for token in tokens:
    original_form = token[0].lower()
    modernised_form = token[1].lower()
    evaluation_cluster.setdefault(modernised_form, []).append(original_form)

In [None]:
# Sanity check: the cluster keys are the lower-cased modernised tokens, so there must be exactly
# one cluster per modernised type
assert len(evaluation_cluster) == len(types_modernised)

In [None]:
# Average cluster size: mean number of original tokens collected per modernised type
sum(map(len, evaluation_cluster.values())) / len(evaluation_cluster)

In [None]:
# Variations for 10 most common tokens: the distinct original spellings of each of them
for modernised_type, frequency in tokens_modernised_count.most_common(10):
    variations = set(evaluation_cluster[modernised_type])
    print('{}: {}\n'.format(modernised_type, variations))

In [None]:
def levenshtein(string1, string2):
    """
    Levenshtein Distance between two strings

    Minimal number of single-character insertions, deletions and substitutions that turn
    string1 into string2. Returns an int; the distance to an empty string is the length of
    the other string.
    """
    if string1 == string2:
        return 0

    # Only the previous and the current row of the edit matrix are needed at any time.
    # previous[col] is the distance between the processed prefix of string1 and string2[:col];
    # the initial row is the distance from the empty prefix, i.e. col insertions.
    previous = list(range(len(string2) + 1))

    for row, char1 in enumerate(string1, start=1):
        # First column: deleting all `row` characters of the prefix of string1
        current = [row]
        for col, char2 in enumerate(string2, start=1):
            cost = 0 if char1 == char2 else 1
            current.append(min(previous[col] + 1,          # deletion
                               current[col - 1] + 1,       # insertion
                               previous[col - 1] + cost))  # substitution / match
        previous = current

    # Reading the last cell of the last row (rather than loop variables) also covers the case
    # that one of the strings is empty and a loop body never runs
    return previous[-1]

assert levenshtein('foobar', 'foobar') == 0
assert levenshtein('foobar', 'foubar') == 1
assert levenshtein('foobar', 'fuubar') == 2
assert levenshtein('', 'foo') == 3
assert levenshtein('foo', '') == 3

In [None]:
%%time

similarity = levenshtein
cache_name = CORPUS_NAME + '-pairwise-distance.pickle'


def string_metric(x, y):
    """
    Metric callable for pdist: each observation is a one-element row holding a type
    """
    return similarity(str(x[0]), str(y[0]))


if not os.path.exists(cache_name):
    # Calculating string distances for each pair of types. pdist returns the condensed
    # distances by position, following the order of the type lists.
    types_original_reshaped = np.array(types_original).reshape(-1, 1)
    types_original_pairwise_distance = scipy.spatial.distance.pdist(types_original_reshaped, string_metric)

    types_modernised_reshaped = np.array(types_modernised).reshape(-1, 1)
    types_modernised_pairwise_distance = scipy.spatial.distance.pdist(types_modernised_reshaped, string_metric)

    print('> Writing Cache')
    # Only the distances are stored, not the types they belong to
    data = {
        'types_original_pairwise_distance': types_original_pairwise_distance,
        'types_modernised_pairwise_distance': types_modernised_pairwise_distance
    }
    with open(cache_name, 'wb') as pickler:
        pickle.dump(data, pickler)

else:
    print('> Using Cache')
    # Unpacking pickled Pairwise Distances.
    # NOTE: the cached vectors are positional, so they only fit the current type lists if
    # those have the same content and order as when the cache was written.
    # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook.
    with open(cache_name, 'rb') as pickler:
        cache = pickle.load(pickler)
    types_original_pairwise_distance = cache['types_original_pairwise_distance']
    types_modernised_pairwise_distance = cache['types_modernised_pairwise_distance']

In [None]:
%%time
# Expanding the condensed pairwise distances into full square distance matrices whose rows and
# columns are labelled with the types
original_distance_matrix = pandas.DataFrame(
    data=scipy.spatial.distance.squareform(types_original_pairwise_distance),
    index=types_original,
    columns=types_original,
)
modernised_distance_matrix = pandas.DataFrame(
    data=scipy.spatial.distance.squareform(types_modernised_pairwise_distance),
    index=types_modernised,
    columns=types_modernised,
)

In [None]:
%%time
# Running t-SNE on the pairwise distance matrices

cache_name = CORPUS_NAME + '-tsne.pickle'

if os.path.exists(cache_name):
    print('> Using Cache')
    # Unpacking the pickled embeddings.
    # NOTE: an existing cache is loaded as-is, whatever parameters it was computed with;
    # delete the file to recompute. Unpickling can execute arbitrary code, so only load cache
    # files written by this notebook.
    with open(cache_name, 'rb' ) as pickler:
        cache = pickle.load(pickler)
        original_similarity_embedded = cache['original_similarity_embedded']
        modernised_similarity_embedded = cache['modernised_similarity_embedded']

else:
    # metric='precomputed': the input already holds pairwise distances; without it every row
    #   of the matrix would be treated as a feature vector.
    # init='random': PCA initialisation is not available for precomputed distances.
    # random_state: fixed seed so that a re-run reproduces the same embedding.
    tsne_options = dict(n_components=2, metric='precomputed', init='random', random_state=42)
    original_similarity_embedded = sklearn.manifold.TSNE(**tsne_options).fit_transform(original_distance_matrix)
    modernised_similarity_embedded = sklearn.manifold.TSNE(**tsne_options).fit_transform(modernised_distance_matrix)

    print('> Writing Cache')
    with open(cache_name, 'wb' ) as pickler:
        data = {
            'original_similarity_embedded': original_similarity_embedded,
            'modernised_similarity_embedded': modernised_similarity_embedded
        }
        pickle.dump(data, pickler)

In [None]:
%%time
# Running UMAP on the pairwise distance matrices

cache_name = CORPUS_NAME + '-umap.pickle'

if os.path.exists(cache_name):
    print('> Using Cache')
    # Unpacking the pickled embeddings.
    # NOTE: an existing cache is loaded as-is, whatever parameters it was computed with;
    # delete the file to recompute. Unpickling can execute arbitrary code, so only load cache
    # files written by this notebook.
    with open(cache_name, 'rb' ) as pickler:
        cache = pickle.load(pickler)
        original_similarity_embedded_umap = cache['original_similarity_embedded_umap']
        modernised_similarity_embedded_umap = cache['modernised_similarity_embedded_umap']

else:
    # metric='precomputed': the input already holds pairwise distances; without it every row
    #   of the matrix would be treated as a feature vector.
    # random_state: fixed seed so that a re-run reproduces the same embedding.
    umap_options = dict(n_components=2, metric='precomputed', random_state=42)
    original_similarity_embedded_umap = umap.UMAP(**umap_options).fit_transform(original_distance_matrix)
    modernised_similarity_embedded_umap = umap.UMAP(**umap_options).fit_transform(modernised_distance_matrix)

    print('> Writing Cache')
    with open(cache_name, 'wb' ) as pickler:
        data = {
            'original_similarity_embedded_umap': original_similarity_embedded_umap,
            'modernised_similarity_embedded_umap': modernised_similarity_embedded_umap
        }
        pickle.dump(data, pickler)

In [None]:
# One plotting frame per layer: the first embedding column becomes x, the second y
original_tsne = pandas.DataFrame({
    'tsne-x-original': original_similarity_embedded[:, 0],
    'tsne-y-original': original_similarity_embedded[:, 1],
})

modernised_tsne = pandas.DataFrame({
    'tsne-x-modernised': modernised_similarity_embedded[:, 0],
    'tsne-y-modernised': modernised_similarity_embedded[:, 1],
})

In [None]:
# One plotting frame per layer: the first embedding column becomes x, the second y
original_umap = pandas.DataFrame({
    'umap-x-original': original_similarity_embedded_umap[:, 0],
    'umap-y-original': original_similarity_embedded_umap[:, 1],
})

modernised_umap = pandas.DataFrame({
    'umap-x-modernised': modernised_similarity_embedded_umap[:, 0],
    'umap-y-modernised': modernised_similarity_embedded_umap[:, 1],
})

In [None]:
# Token frequencies as data frames with the columns 'token' and 'occurrence',
# most frequent token first
df_tokens_original_count = (
    pandas.DataFrame.from_dict(tokens_original_count, orient='index')
    .reset_index()
    .rename(columns={'index': 'token', 0: 'occurrence'})
    .sort_values(by=['occurrence'], ascending=False)
)

df_tokens_modernised_count = (
    pandas.DataFrame.from_dict(tokens_modernised_count, orient='index')
    .reset_index()
    .rename(columns={'index': 'token', 0: 'occurrence'})
    .sort_values(by=['occurrence'], ascending=False)
)

In [None]:
TOP_N = 25
# Shared by the plot title and the output file name so that both always agree
plot_name = CORPUS_NAME + '-token-occurrences-original-top-' + str(TOP_N)

# Bar per occurrence value showing how many rows (distinct tokens) have that occurrence
plt.figure(figsize=(20,20))
sns.countplot(
    data=df_tokens_original_count,
    # Only the TOP_N occurrence values that are shared by the most tokens
    order = df_tokens_original_count['occurrence'].value_counts().iloc[:TOP_N].index,
    x='occurrence',
    color='steelblue',
).set_title(plot_name)

# The extension is appended without the stray space the file name used to contain ('... .png')
plt.savefig(plot_name + '.png',
            facecolor='white',
            bbox_inches='tight',
            dpi=100,
            pad_inches=0.1)

In [None]:
TOP_N = 25
# Shared by the plot title and the output file name so that both always agree
plot_name = CORPUS_NAME + '-token-occurrences-modernised-top-' + str(TOP_N)

# Bar per occurrence value showing how many rows (distinct tokens) have that occurrence
plt.figure(figsize=(20,20))
sns.countplot(
    data=df_tokens_modernised_count,
    # Only the TOP_N occurrence values that are shared by the most tokens
    order = df_tokens_modernised_count['occurrence'].value_counts().iloc[:TOP_N].index,
    x='occurrence',
    color='steelblue',
).set_title(plot_name)

# The extension is appended without the stray space the file name used to contain ('... .png')
plt.savefig(plot_name + '.png',
            facecolor='white',
            bbox_inches='tight',
            dpi=100,
            pad_inches=0.1)

In [None]:
# Scatter plot of the t-SNE embedding of the original types
plt.figure(figsize=(20,20))
sns.scatterplot(
    x='tsne-x-original', y='tsne-y-original',
    data=original_tsne,
    alpha=0.5
).set_title(CORPUS_NAME + '-tsne-original')

# File name derived from CORPUS_NAME (it used to be hard-coded to 'spanish-ps'), so that the
# output follows the configured corpus like the title does
plt.savefig(CORPUS_NAME + '-tsne-original.png',
            facecolor='white',
            bbox_inches='tight',
            dpi=100,
            pad_inches=0.1)

In [None]:
# Scatter plot of the t-SNE embedding of the modernised types, saved as PNG
fig, ax = plt.subplots(figsize=(20, 20))
sns.scatterplot(
    data=modernised_tsne,
    x='tsne-x-modernised',
    y='tsne-y-modernised',
    alpha=0.5,
    ax=ax,
)
ax.set_title(CORPUS_NAME + '-tsne-modernised')

plt.savefig(
    CORPUS_NAME + '-tsne-modernised.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
# Scatter plot of the UMAP embedding of the original types, saved as PNG
fig, ax = plt.subplots(figsize=(20, 20))
sns.scatterplot(
    data=original_umap,
    x='umap-x-original',
    y='umap-y-original',
    alpha=0.5,
    ax=ax,
)
ax.set_title(CORPUS_NAME + '-umap-original')

plt.savefig(
    CORPUS_NAME + '-umap-original.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
# Scatter plot of the UMAP embedding of the modernised types, saved as PNG
fig, ax = plt.subplots(figsize=(20, 20))
sns.scatterplot(
    data=modernised_umap,
    x='umap-x-modernised',
    y='umap-y-modernised',
    alpha=0.5,
    ax=ax,
)
ax.set_title(CORPUS_NAME + '-umap-modernised')

plt.savefig(
    CORPUS_NAME + '-umap-modernised.png',
    facecolor='white',
    bbox_inches='tight',
    dpi=100,
    pad_inches=0.1,
)

In [None]:
%%time

# Hierarchical clustering of the original types from their condensed pairwise distances.
# NOTE(review): scipy documents 'ward' as correctly defined only for Euclidean distances;
# confirm that it is acceptable for the string distances used here.
linkage_method = 'ward'
original_clustering = scipy.cluster.hierarchy.linkage(
    types_original_pairwise_distance,
    method=linkage_method,
)

In [None]:
from functools import reduce
from json import dump

# Position in types_original -> type string; add_labels uses it to name leaves by their node id
labels = dict(enumerate(types_original))

def add_nodes(node, parent):
    """
    Build tree as dict

    Appends a dict with the keys node_id, children and distance for node to
    parent['children'] and does the same for all descendants of node (left child before
    right child). node needs the attributes id, dist, left and right.

    Works iteratively with an explicit stack, so trees deeper than the interpreter's
    recursion limit do not raise RecursionError.
    """
    pending = [(node, parent)]
    while pending:
        current, current_parent = pending.pop()
        new_node = dict(node_id=current.id, children=[], distance=current.dist)
        current_parent['children'].append(new_node)
        # The stack is last-in-first-out: push right first so that the left child is popped
        # and therefore appended to new_node['children'] before the right one
        if current.right: pending.append((current.right, new_node))
        if current.left: pending.append((current.left, new_node))

def add_labels(node, label_lookup=None):
    """
    Add labels to the tree

    Walks node and all of its descendants (dicts as built by add_nodes): every leaf gets a
    'name' looked up by its 'node_id', and 'node_id' is removed from every visited node.
    The tree is modified in place; nothing is returned.

    label_lookup: optional mapping node id -> name; when omitted, the notebook-level
                  `labels` mapping is used.

    Works iteratively with an explicit stack, so trees deeper than the interpreter's
    recursion limit do not raise RecursionError.
    """
    lookup = labels if label_lookup is None else label_lookup

    pending = [node]
    while pending:
        current = pending.pop()
        is_leaf = len(current['children']) == 0

        if is_leaf:
            current['name'] = lookup[current['node_id']]
        else:
            pending.extend(current['children'])
        del current['node_id']

# The export is skipped when the JSON file already exists
cluster_json = CORPUS_NAME + '-cluster-original.json'

if not os.path.exists(cluster_json):
    # Transforming Cluster into JSON Tree: an artificial 'root' entry wraps the top node of
    # the dendrogram
    scipy_tree = scipy.cluster.hierarchy.to_tree(original_clustering, rd=False)
    tree = {'name': 'root', 'children': [], 'distance': scipy_tree.dist}

    add_nodes(scipy_tree, tree)
    # tree['children'][0] is the top node that add_nodes has just attached
    add_labels(tree['children'][0])

    with open(cluster_json, 'w') as clustering:
        dump(tree, clustering, indent=1)